# Health-literacy practice script.
#
# Walks from small warm-up exercises (subsetting, writing functions, replacing
# NA values) up to the real task: expanding the BEST data so that every
# disease has one row per year, then linearly interpolating the `mean` column
# across the years that had no measurement.

# NOTE(review): setwd() ties this script to one machine's folder layout. It is
# kept so that relative paths (here and in any later code) resolve as before.
setwd("~/Desktop/cdsc/health literacy/")
best = read.csv("Best_Data.csv")

# TRIMMING YEARS I DON'T NEED ----
# Practice on the built-in `airquality` data: keep only months 5 and 9.
library(help = "datasets") # shows the index of the `datasets` package
airquality
month = c("5", "9")
trimmed_airqual = airquality[airquality$Month %in% month, ]
trimmed_airqual

# APPLYING TO BEST ----
# Keep only the rows of BEST whose Year is one of the listed years.
years = c("2007", "2010", "2013", "2016")
best_interpolated = best[best$Year %in% years, ]

# WRITING FUNCTIONS ----
# Returns three times `z` (vectorised over `z`).
triple = function(z) {
  3 * z
}
triple(9)

# FUNCTIONS TO REPLACE NA VECTOR VALUES WITH A GIVEN VALUE ----
vector = c(5, NA, 6, 7, NA, 9)

# Replaces every NA in `vec` with `value` and returns the modified vector.
#   vec:   vector that may contain NA
#   value: replacement for NA entries (defaults to 4, the original constant)
change = function(vec, value = 4) {
  vec[is.na(vec)] = value
  return(vec)
}
change(vector)

# FUNCTIONS TO REPLACE NA VECTORS THROUGH AN EQUATION ----
years2 = c(2000, NA, NA, 2003, NA, NA, 2006)

# Fills each NA with "previous element + 1", walking left to right, so a run
# of NAs continues counting from the last known value.
#   years: numeric vector; a leading NA has no predecessor and stays NA
fill2 = function(years) {
  # seq_along(years)[-1] is empty for vectors of length 0 or 1, whereas
  # 2:length(years) would count backwards and write past the end.
  for (i in seq_along(years)[-1]) {
    if (is.na(years[i])) {
      years[i] = years[i - 1] + 1
    }
  }
  return(years)
}
fill2(years2)

# CREATING EMPTY ROWS FOR INTERPOLATION ----
test_years = c(2000, 2003, 2006)
mean_val = c(0.2, -3, 4)
df = data.frame(year = test_years, value = mean_val)

# Adds a row for every year between min(df$year) and max(df$year). Years
# that were absent from `df` get NA in all other columns.
fill = function(df) {
  all_years = data.frame(year = seq(min(df$year), max(df$year)))
  # all.x = TRUE keeps every year of the full sequence, matched or not.
  return(merge(all_years, df, by = "year", all.x = TRUE))
}
new_df = fill(df)

# CREATING EMPTY ROWS WITH DOUBLE VALUES ----
# Same helper on data with two makes: years are expanded, but the new rows
# do not yet know which make they belong to.
car = c("Honda", "Honda", "BMW", "BMW")
test_years2 = c(2000, 2003, 2000, 2003)
val = c(0.3, 3, -2, -1)
car_df = data.frame(make = car, year = test_years2, value = val)
new_car = fill(car_df)

# SHARED HELPERS ----
# The functions further down all do the same two things (expand years per
# group, interpolate per group) on different column names, so the logic
# lives here once.

# Distinct values of a grouping column, in order of first appearance.
# Factors are converted to character, which is what `for (g in a_factor)`
# would iterate over.
group_values = function(x) {
  groups = unique(x)
  if (is.factor(groups)) {
    groups = as.character(groups)
  }
  return(groups)
}

# Gives every group one row per year over the full year range of `df`.
#   df:        data frame holding `year_col` and `group_col`
#   year_col:  name of the numeric year column
#   group_col: name of the grouping column
# Returns the per-group pieces stacked in order of first appearance; within
# a piece rows are ordered by year (merge() sorts on the key). Rows created
# for missing years carry the group label and NA in every other column.
# Returns an empty data frame when there are no groups.
expand_years_by_group = function(df, year_col, group_col) {
  all_years = data.frame(seq(min(df[[year_col]]), max(df[[year_col]])))
  names(all_years) = year_col

  groups = group_values(df[[group_col]])
  pieces = vector("list", length(groups))
  for (i in seq_along(groups)) {
    # %in% never yields NA, so rows with a missing group label cannot leak
    # into other groups as all-NA rows.
    group_rows = df[df[[group_col]] %in% groups[[i]], , drop = FALSE]
    piece = merge(all_years, group_rows, by = year_col, all.x = TRUE)
    piece[[group_col]] = groups[[i]] # label the newly created rows too
    pieces[[i]] = piece
  }

  if (length(pieces) == 0) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data frame inside the loop.
  return(do.call(rbind, pieces))
}

# Linearly interpolates the NA runs that sit between two known values.
# Elements are treated as evenly spaced (one row per year). For a gap such as
# 2000 -> 2003: z = (value_2003 - value_2000) / 3, then 2001 = value_2000 + z
# and 2002 = value_2000 + 2 * z.
#   values: numeric vector, possibly containing NA
# Leading and trailing NAs are left alone, as is a vector with fewer than two
# known values.
interpolate_gaps = function(values) {
  known = which(!is.na(values))
  # max(..., 0) keeps seq_len() valid when nothing is known at all.
  for (i in seq_len(max(length(known) - 1, 0))) {
    start_idx = known[i]
    end_idx = known[i + 1]
    num_missing = end_idx - start_idx - 1
    if (num_missing > 0) {
      increment = (values[end_idx] - values[start_idx]) / (num_missing + 1)
      steps = seq_len(num_missing)
      values[start_idx + steps] = values[start_idx] + increment * steps
    }
  }
  return(values)
}

# Applies interpolate_gaps() to `value_col` separately within each group.
#   df:        data frame whose rows are already in year order within a group
#   group_col: name of the grouping column
#   value_col: name of the numeric column to interpolate
# Returns `df` with only `value_col` modified. Stops with an error if
# `value_col` is missing while there are groups to process.
interpolate_by_group = function(df, group_col, value_col) {
  groups = group_values(df[[group_col]])
  if (length(groups) == 0) {
    return(df)
  }
  if (!value_col %in% names(df)) {
    stop("column '", value_col, "' not found in data frame", call. = FALSE)
  }

  values = df[[value_col]]
  for (group in groups) {
    # which(... %in% ...) yields plain positions, so NA group labels cannot
    # break the assignment below.
    rows = which(df[[group_col]] %in% group)
    values[rows] = interpolate_gaps(values[rows])
  }
  df[[value_col]] = values
  return(df)
}

# CREATING EMPTY ROWS WITH DOUBLE VALUES THAT FILL THE MAKE ----
# Final version of fill(): expands the years separately for every make so
# each new row is labelled with its make. Values are NOT interpolated here.
#   df: data frame with columns `year` and `make`
fill = function(df) {
  return(expand_years_by_group(df, "year", "make"))
}
new_car = fill(car_df) # new data frame with empty (NA) values per make

# APPLYING TO BEST FUNCTION ----
# First version for BEST: expand the years per disease, no interpolation yet.
expand = function(df) {
  return(expand_years_by_group(df, "Year", "Reconciled_Name"))
}
expanded_best = expand(best_interpolated)

# INTERPOLATING FUNCTION ON ITS OWN ----
# Fills NA entries of `value` by linear interpolation within each make.
#   df: data frame with columns `make` and `value`, one row per year and
#       already in year order within each make
fill_na_with_increment = function(df) {
  return(interpolate_by_group(df, "make", "value"))
}

car = c("Honda", "Honda", "Honda", "Honda", "BMW", "BMW", "BMW", "BMW")
test_years2 = c(2000, 2001, 2002, 2003, 2000, 2001, 2002, 2003)
val = c(0.3, NA, NA, 3, -2, NA, NA, -2.5)
car_df2 = data.frame(make = car, year = test_years2, value = val)
filled_car_df2 = fill_na_with_increment(car_df2)

# COMBINING THE CODE ----
# Expands the years per make, then interpolates `value` across the new rows.
#   df: data frame with columns `make`, `year` and `value`
fill_with_increment = function(df) {
  expanded_data = expand_years_by_group(df, "year", "make")
  return(interpolate_by_group(expanded_data, "make", "value"))
}

car = c("Honda", "Honda", "Honda", "Honda", "BMW", "BMW", "BMW", "BMW",
        "Corvette", "Corvette", "Corvette", "Corvette")
test_years2 = c(2000, 2003, 2006, 2009, 2000, 2003, 2006, 2009,
                2000, 2003, 2006, 2009)
val = c(0.3, 3, 3.5, 4, -2.2, -2, -1, 0.4, 2.2, 2.0, 3.5, 5)
car_df4 = data.frame(make = car, year = test_years2, value = val)
filled_car_df4 = fill_with_increment(car_df4)

# APPLYING TO BEST DATA ----
# Final version for BEST: one row per Year for every Reconciled_Name, with
# `mean` linearly interpolated between the years that have a value. The other
# columns of the newly created rows stay NA.
#   df: data frame with columns `Year`, `Reconciled_Name` and `mean`
# NOTE(review): interpolation is by row position, so it assumes at most one
# row per Year within each Reconciled_Name -- confirm against the BEST data.
expand = function(df) {
  expanded_data = expand_years_by_group(df, "Year", "Reconciled_Name")
  return(interpolate_by_group(expanded_data, "Reconciled_Name", "mean"))
}
expanded_best = expand(best_interpolated)