1
0
stigma-reddit/dataset/preliminary_best_interpolate.R

316 lines
10 KiB
R

# Work from the project folder and load the raw BEST data from CSV.
setwd("~/Desktop/cdsc/health literacy/")
best <- read.csv("Best_Data.csv")

# Practice on a built-in data set first: keep only the rows of `airquality`
# whose Month is one of the listed values.
library(help = "datasets")
airquality
month <- c("5", "9")
trimmed_airqual <- airquality[airquality$Month %in% month, ]
trimmed_airqual

# Apply the same row filter to BEST: keep only the listed years.
years <- c("2007", "2010", "2013", "2016")
best_interpolated <- best[best$Year %in% years, ]
# WRITING FUNCTIONS
# Return three times the input; works elementwise on numeric vectors.
triple <- function(z) {
  z * 3
}
# Quick check of the function on a single number.
triple(9)
# FUNCTIONS TO REPLACE NA VECTOR VALUES WITH A GIVEN VALUE
# Example vector with two missing entries.
vector <- c(5, NA, 6, 7, NA, 9)

# Replace every NA in a vector with a single fill value.
#
# vec:   vector that may contain NA entries.
# value: length-one replacement for each NA; defaults to 4, the constant
#        that used to be hard-coded, so existing calls behave the same.
# Returns `vec` with the NA positions overwritten; other entries untouched.
change <- function(vec, value = 4) {
  # A longer `value` would be recycled across the NA positions, which is
  # almost never intended, so reject it up front.
  stopifnot(length(value) == 1L)
  vec[is.na(vec)] <- value
  vec
}
change(vector)
# FUNCTIONS TO REPLACE NA VECTORS THROUGH AN EQUATION
# Example: known years with gaps between them.
years2 <- c(2000, NA, NA, 2003, NA, NA, 2006)

# Fill each NA with "previous element + 1", working left to right.
#
# years: numeric vector; NA entries are replaced, known entries are kept.
# Returns a vector of the same length as the input. A leading NA stays NA
# because it has no predecessor to count up from.
fill2 <- function(years) {
  # seq_along(years)[-1] is empty for length 0 or 1 input. The former
  # 2:length(years) counted backwards (2, 1) in that case and appended a
  # spurious element to the vector.
  for (i in seq_along(years)[-1]) {
    if (is.na(years[i])) {
      years[i] <- years[i - 1] + 1
    }
  }
  years
}
fill2(years2)
# CREATING EMPTY ROWS FOR INTERPOLATION
# Small test frame: one value every third year.
test_years <- c(2000, 2003, 2006)
mean_val <- c(0.2, -3, 4)
df <- data.frame(year = test_years, value = mean_val)

# Pad a data frame so it has a row for every year from its earliest to its
# latest `year`. Years that were absent from the input get NA in the other
# columns (left join of the full year sequence onto the input).
fill <- function(df) {
  every_year <- data.frame(year = seq(from = min(df$year), to = max(df$year)))
  merge(x = every_year, y = df, by = "year", all.x = TRUE)
}
new_df <- fill(df)
# CREATING EMPTY ROWS WITH DOUBLE VALUES
# Test frame where each year appears once per make, then run it through
# fill() as defined at this point in the script.
car <- c("Honda", "Honda", "BMW", "BMW")
test_years2 <- c(2000, 2003, 2000, 2003)
val <- c(0.3, 3, -2, -1)
car_df <- data.frame(make = car, year = test_years2, value = val)
new_car <- fill(car_df)
# CREATING EMPTY ROWS WITH DOUBLE VALUES THAT FILL THE DISEASE VALUE
# create the loop
# Expand a data frame so that every make has one row for every year between
# the earliest and latest `year` found anywhere in the input.
#
# df: data frame with a numeric `year` column and a `make` column; any other
#     columns are carried along.
# Returns a data frame whose rows are grouped by make (in order of first
# appearance) and sorted by year within each make. Padded rows keep their
# make and are NA in every other non-key column. If there are no makes to
# iterate over, an empty data frame is returned.
fill <- function(df) {
  all_years <- data.frame(year = seq(min(df$year), max(df$year)))
  makes <- unique(df$make)

  # Collect one padded piece per make and bind them once at the end, instead
  # of growing a data frame with rbind() on every iteration.
  pieces <- vector("list", length(makes))
  slot <- 0L
  for (make in makes) {
    slot <- slot + 1L
    # Left join: keep every year, attach this make's rows where they exist.
    padded <- merge(all_years, df[df$make == make, ], by = "year", all.x = TRUE)
    # The join leaves `make` as NA on padded rows, so stamp it back on.
    padded$make <- make
    pieces[[slot]] <- padded
  }

  if (length(pieces) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pieces)
}

# Test data: two makes, each observed in 2000 and 2003 only.
car <- c("Honda", "Honda", "BMW", "BMW")
test_years2 <- c(2000, 2003, 2000, 2003)
val <- c(0.3, 3, -2, -1)
car_df <- data.frame(make = car, year = test_years2, value = val)
# Expanded frame: the in-between years are present with NA values.
new_car <- fill(car_df)
# APPLYING TO BEST FUNCTION
# Expand a data frame so that every Reconciled_Name has one row for every
# year between the earliest and latest `Year` found anywhere in the input.
#
# df: data frame with a numeric `Year` column and a `Reconciled_Name`
#     column; any other columns are carried along.
# Returns a data frame grouped by Reconciled_Name (in order of first
# appearance) and sorted by Year within each name. Padded rows keep their
# Reconciled_Name and are NA in every other non-key column. If there are no
# names to iterate over, an empty data frame is returned.
expand <- function(df) {
  all_years <- data.frame(Year = seq(min(df$Year), max(df$Year)))
  diseases <- unique(df$Reconciled_Name)

  # Collect one padded piece per name and bind them once at the end, instead
  # of growing a data frame with rbind() on every iteration.
  pieces <- vector("list", length(diseases))
  slot <- 0L
  for (disease in diseases) {
    slot <- slot + 1L
    # Left join: keep every year, attach this name's rows where they exist.
    padded <- merge(
      all_years,
      df[df$Reconciled_Name == disease, ],
      by = "Year",
      all.x = TRUE
    )
    # The join leaves the name as NA on padded rows, so stamp it back on.
    padded$Reconciled_Name <- disease
    pieces[[slot]] <- padded
  }

  if (length(pieces) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pieces)
}
# Run the year expansion on the filtered BEST data.
expanded_best <- expand(best_interpolated)
# INTERPOLATING
# create the loop
# Expand a data frame so that every make has one row for every year between
# the earliest and latest `year` found anywhere in the input. This version
# only creates the empty rows; the interpolation step is still a plan (see
# the TODO inside).
#
# df: data frame with a numeric `year` column and a `make` column; any other
#     columns are carried along.
# Returns a data frame grouped by make (in order of first appearance) and
# sorted by year within each make. Padded rows keep their make and are NA in
# every other non-key column. If there are no makes to iterate over, an
# empty data frame is returned.
fill <- function(df) {
  all_years <- data.frame(year = seq(min(df$year), max(df$year)))
  makes <- unique(df$make)

  # Collect one padded piece per make and bind them once at the end, instead
  # of growing a data frame with rbind() on every iteration.
  pieces <- vector("list", length(makes))
  slot <- 0L
  for (make in makes) {
    slot <- slot + 1L
    # Left join: keep every year, attach this make's rows where they exist.
    padded <- merge(all_years, df[df$make == make, ], by = "year", all.x = TRUE)
    # The join leaves `make` as NA on padded rows, so stamp it back on.
    padded$make <- make
    pieces[[slot]] <- padded
  }

  # TODO: interpolate the `value` column. Plan:
  #   - for every pair of known years (2000 to 2003, 2003 to 2006,
  #     2006 to 2009):
  #   - z = (value at the later year - value at the earlier year) / 3
  #   - value for 2001 = value for 2000 + z
  #   - value for 2002 = value for 2001 + z

  if (length(pieces) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pieces)
}
#### testing
# INTERPOLATING FUNCTION ON ITS OWN
# Linearly interpolate missing `value` entries within each make.
#
# For every make, each run of NA values that sits between two known values
# is filled with equal steps from the earlier known value to the later one.
# NA values before the first or after the last known value are left as NA,
# and a make with fewer than two known values is returned unchanged.
#
# df: data frame with `make` and numeric `value` columns. Interpolation is
#     by row position within a make, so rows are assumed to be ordered by
#     year with one row per year.
# Returns `df` with only the `value` column modified.
fill_na_with_increment <- function(df) {
  stopifnot(all(c("make", "value") %in% names(df)))

  # Fill interior NA runs of one numeric vector with evenly spaced steps.
  fill_gaps <- function(values) {
    known <- which(!is.na(values))
    # seq_len() gives an empty loop when fewer than two values are known;
    # 1:(length(known) - 1) would count backwards and index past the end.
    for (i in seq_len(max(length(known) - 1L, 0L))) {
      left <- known[i]
      right <- known[i + 1L]
      gap <- right - left - 1L # number of NA entries between the pair
      if (gap > 0L) {
        step <- (values[right] - values[left]) / (gap + 1L)
        offsets <- seq_len(gap)
        values[left + offsets] <- values[left] + step * offsets
      }
    }
    values
  }

  for (make in unique(df$make)) {
    # which() drops NA comparisons, so rows with a missing make are skipped
    # rather than breaking the assignment below.
    rows <- which(df$make == make)
    if (length(rows) > 0L) {
      df$value[rows] <- fill_gaps(df$value[rows])
    }
  }
  df
}

# Test data: two makes over 2000-2003, each with the middle years missing.
car <- rep(c("Honda", "BMW"), each = 4)
test_years2 <- rep(c(2000, 2001, 2002, 2003), times = 2)
val <- c(0.3, NA, NA, 3, -2, NA, NA, -2.5)
car_df2 <- data.frame(make = car, year = test_years2, value = val)
filled_car_df2 <- fill_na_with_increment(car_df2)
# COMBINING THE CODE
# Expand to one row per make and year, then linearly interpolate `value`.
#
# Step 1: every make gets a row for every year between the earliest and
#         latest `year` in the whole input (missing years have NA values).
# Step 2: within each make, NA runs between two known values are filled
#         with equal steps. NA values outside the known range stay NA, and
#         a make with fewer than two known values is left as expanded.
#
# df: data frame with numeric `year`, `make`, and numeric `value` columns.
#     Interpolation is by row position after the per-make merge on year, so
#     one row per make and year is assumed.
# Returns a data frame grouped by make (in order of first appearance) and
# sorted by year within each make.
fill_with_increment <- function(df) {
  # Fill interior NA runs of one numeric vector with evenly spaced steps.
  fill_gaps <- function(values) {
    known <- which(!is.na(values))
    # seq_len() gives an empty loop when fewer than two values are known;
    # 1:(length(known) - 1) would count backwards and index past the end.
    for (i in seq_len(max(length(known) - 1L, 0L))) {
      left <- known[i]
      right <- known[i + 1L]
      gap <- right - left - 1L # number of NA entries between the pair
      if (gap > 0L) {
        step <- (values[right] - values[left]) / (gap + 1L)
        offsets <- seq_len(gap)
        values[left + offsets] <- values[left] + step * offsets
      }
    }
    values
  }

  all_years <- data.frame(year = seq(min(df$year), max(df$year)))
  makes <- unique(df$make)

  # Build one finished piece per make and bind them once at the end, instead
  # of growing a data frame with rbind() on every iteration.
  pieces <- vector("list", length(makes))
  slot <- 0L
  for (make in makes) {
    slot <- slot + 1L
    # Left join: keep every year, attach this make's rows where they exist.
    padded <- merge(all_years, df[df$make == make, ], by = "year", all.x = TRUE)
    padded$make <- make
    if ("value" %in% names(padded)) {
      padded[["value"]] <- fill_gaps(padded[["value"]])
    }
    pieces[[slot]] <- padded
  }

  if (length(pieces) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pieces)
}

# Test data: three makes observed every third year from 2000 to 2009.
car <- rep(c("Honda", "BMW", "Corvette"), each = 4)
test_years2 <- rep(c(2000, 2003, 2006, 2009), times = 3)
val <- c(0.3, 3, 3.5, 4, -2.2, -2, -1, 0.4, 2.2, 2.0, 3.5, 5)
car_df4 <- data.frame(make = car, year = test_years2, value = val)
filled_car_df4 <- fill_with_increment(car_df4)
# APPLYING TO BEST DATA
# Expand to one row per Reconciled_Name and Year, then linearly interpolate
# the `mean` column.
#
# Step 1: every Reconciled_Name gets a row for every year between the
#         earliest and latest `Year` in the whole input (missing years have
#         NA in the carried-along columns).
# Step 2: within each name, NA runs of `mean` between two known values are
#         filled with equal steps. NA values outside the known range stay
#         NA, and a name with fewer than two known means is left as
#         expanded. Only `mean` is interpolated; other columns keep their NA
#         on padded rows. If the input has no `mean` column, only step 1
#         runs.
#
# df: data frame with numeric `Year` and a `Reconciled_Name` column.
#     NOTE(review): interpolation is by row position after the per-name
#     merge on Year, so it assumes at most one row per name and year --
#     confirm against Best_Data.csv.
# Returns a data frame grouped by Reconciled_Name (in order of first
# appearance) and sorted by Year within each name. Nothing is printed.
expand <- function(df) {
  # Fill interior NA runs of one numeric vector with evenly spaced steps.
  fill_gaps <- function(values) {
    known <- which(!is.na(values))
    # seq_len() gives an empty loop when fewer than two values are known,
    # which replaces the former is.na(after_year) break and debug prints.
    for (i in seq_len(max(length(known) - 1L, 0L))) {
      left <- known[i]
      right <- known[i + 1L]
      gap <- right - left - 1L # number of NA entries between the pair
      if (gap > 0L) {
        step <- (values[right] - values[left]) / (gap + 1L)
        offsets <- seq_len(gap)
        values[left + offsets] <- values[left] + step * offsets
      }
    }
    values
  }

  all_years <- data.frame(Year = seq(min(df$Year), max(df$Year)))
  diseases <- unique(df$Reconciled_Name)

  # Build one finished piece per name and bind them once at the end, instead
  # of growing a data frame with rbind() on every iteration.
  pieces <- vector("list", length(diseases))
  slot <- 0L
  for (disease in diseases) {
    slot <- slot + 1L
    # Left join: keep every year, attach this name's rows where they exist.
    # merge() sorts the result by Year, so row position follows year order.
    padded <- merge(
      all_years,
      df[df$Reconciled_Name == disease, ],
      by = "Year",
      all.x = TRUE
    )
    padded$Reconciled_Name <- disease
    if ("mean" %in% names(padded)) {
      padded[["mean"]] <- fill_gaps(padded[["mean"]])
    }
    pieces[[slot]] <- padded
  }

  if (length(pieces) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pieces)
}
# Expand and interpolate the filtered BEST data.
expanded_best <- expand(best_interpolated)