316 lines
10 KiB
R
316 lines
10 KiB
R
# Point the session at the project folder; the relative file name below is
# resolved against this directory.
setwd("~/Desktop/cdsc/health literacy/")

# Load the raw data set into `best`.
best <- read.csv("Best_Data.csv")
|
|
|
|
# TRIMMING YEARS I DON'T NEED
# Practice run on the built-in `airquality` data: keep only selected months.

library(help = "datasets") # show the listing of the datasets package

airquality # print the full practice data set

# Months to keep. They are written as strings; `%in%` still matches them
# against the Month column.
month <- c("5", "9")

# Keep only the rows whose Month is one of the wanted months.
trimmed_airqual <- airquality[airquality$Month %in% month, ]

trimmed_airqual # print the trimmed result
|
|
|
|
|
|
# APPLYING TO BEST
# Same filter as the practice run above, now on `best`: keep only the rows
# whose Year is one of the listed years.
years <- c("2007", "2010", "2013", "2016")

best_interpolated <- best[best$Year %in% years, ]
|
|
|
|
|
|
# WRITING FUNCTIONS
|
|
|
|
# Return three times `z`; works element-wise on numeric vectors.
triple <- function(z) {
  z * 3
}
|
|
|
|
triple(9) # quick check: prints 27

# FUNCTIONS TO REPLACE NA VECTOR VALUES WITH A GIVEN VALUE

# Sample input with two missing entries.
vector <- c(5, NA, 6, 7, NA, 9)
|
|
|
|
# Replace every NA in `vec` with a fill value.
#
# vec:   vector that may contain NA entries.
# value: single replacement used for each NA. Defaults to 4, the value this
#        function previously hard-coded, so existing calls behave the same.
# Returns `vec` with its NA entries replaced; all other elements are unchanged.
change <- function(vec, value = 4) {
  # A longer `value` would be recycled across the NA positions, which is
  # almost never intended.
  stopifnot(length(value) == 1L)
  vec[is.na(vec)] <- value
  vec
}
|
|
|
|
change(vector) # prints the sample vector with its NAs replaced

# FUNCTIONS TO REPLACE NA VECTORS THROUGH AN EQUATION

# Year sequence with gaps to be filled in.
years2 <- c(2000, NA, NA, 2003, NA, NA, 2006)
|
|
|
|
# Fill missing entries by counting up from the previous entry.
#
# years: numeric vector whose NA entries should become "previous value + 1".
# Returns a vector of the same length. The first element is never filled, so
# a leading NA stays NA (and so does everything that depends on it).
fill2 <- function(years) {
  # seq_along(years)[-1] is empty for inputs of length 0 or 1. The previous
  # 2:length(years) counted backwards in those cases and either appended a
  # spurious element or failed.
  for (i in seq_along(years)[-1]) {
    if (is.na(years[i])) {
      years[i] <- years[i - 1] + 1
    }
  }
  years
}
|
|
|
|
fill2(years2) # prints the completed year sequence

# CREATING EMPTY ROWS FOR INTERPOLATION

# Small test frame: one value every third year.
test_years <- c(2000, 2003, 2006)
mean_val <- c(0.2, -3, 4)

df <- data.frame(year = test_years, value = mean_val)
|
|
|
|
# Add a row for every year between the earliest and latest `year` in `df`.
#
# df: data frame with a numeric `year` column.
# Returns a data frame sorted by year; years that were absent from `df` have
# NA in every other column.
fill <- function(df) {
  all_years <- data.frame(year = seq(min(df$year), max(df$year)))
  # Left join: keep every year, pulling in df's columns where a match exists.
  merge(all_years, df, by = "year", all.x = TRUE)
}
|
|
|
|
new_df <- fill(df)

# CREATING EMPTY ROWS WITH DOUBLE VALUES

# Two makes, each measured in the same two years.
car <- c("Honda", "Honda", "BMW", "BMW")
test_years2 <- c(2000, 2003, 2000, 2003)
val <- c(0.3, 3, -2, -1)

car_df <- data.frame(make = car, year = test_years2, value = val)

new_car <- fill(car_df)
|
|
|
|
# CREATING EMPTY ROWS WITH DOUBLE VALUES THAT FILL THE DISEASE VALUE
|
|
|
|
# create the loop
|
|
|
|
# Expand `df` so that every make has one row for each year in the overall
# year range.
#
# df: data frame with `year` and `make` columns.
# Returns the per-make blocks stacked in order of first appearance of each
# make. Rows created for missing years carry the make label and NA in the
# remaining columns.
fill <- function(df) {
  year_grid <- data.frame(year = seq(min(df$year), max(df$year)))

  per_make <- list()
  for (make in unique(df$make)) {
    # Left-join this make's rows onto the full year grid.
    block <- merge(year_grid, df[df$make == make, ], by = "year", all.x = TRUE)
    block$make <- make # label the newly created rows as well
    per_make[[length(per_make) + 1L]] <- block
  }

  # Stack the blocks one after another, starting from an empty data frame.
  Reduce(rbind, per_make, data.frame())
}
|
|
|
|
# clarify the data
car <- c("Honda", "Honda", "BMW", "BMW")
test_years2 <- c(2000, 2003, 2000, 2003)
val <- c(0.3, 3, -2, -1)

# test dataframe
car_df <- data.frame(make = car, year = test_years2, value = val)

# create new dataframe with empty values
new_car <- fill(car_df)
|
|
|
|
|
|
# APPLYING TO BEST FUNCTION
|
|
|
|
# Expand `df` so that every disease has one row for each year in the overall
# year range.
#
# df: data frame with `Year` and `Reconciled_Name` columns.
# Returns the per-disease blocks stacked in order of first appearance. Rows
# created for missing years carry the disease name and NA in the remaining
# columns. No values are interpolated here.
expand <- function(df) {
  year_grid <- data.frame(Year = seq(min(df$Year), max(df$Year)))

  # Collect each disease's block and bind once at the end. Calling rbind()
  # inside the loop re-copies all accumulated rows on every pass.
  blocks <- list()
  for (disease in unique(df$Reconciled_Name)) {
    block <- merge(
      year_grid,
      df[df$Reconciled_Name == disease, ],
      by = "Year",
      all.x = TRUE
    )
    block$Reconciled_Name <- disease # label the newly created rows as well
    blocks[[length(blocks) + 1L]] <- block
  }

  if (length(blocks) == 0L) {
    return(data.frame())
  }
  do.call(rbind, blocks)
}
|
|
|
|
# Run the expansion on the year-filtered data.
expanded_best <- expand(best_interpolated)
|
|
|
|
|
|
# INTERPOLATING
|
|
|
|
# create the loop
|
|
|
|
# Expand `df` so that every make has one row for each year in the overall
# year range. Interpolation of `value` is only sketched in the comments at the
# end; this version does not fill any values.
#
# df: data frame with `year` and `make` columns.
# Returns the per-make blocks stacked in order of first appearance of each
# make; rows created for missing years carry the make label and NA elsewhere.
fill <- function(df) {
  # every year from the first to the last one observed
  year_grid <- data.frame(year = seq(min(df$year), max(df$year)))

  # Build the full-year block for a single make.
  expand_one <- function(make) {
    block <- merge(year_grid, df[df$make == make, ], by = "year", all.x = TRUE)
    block$make <- make # label the newly created rows as well
    block
  }

  expanded_data <- data.frame() # start empty and append one block per make
  for (make in unique(df$make)) {
    expanded_data <- rbind(expanded_data, expand_one(make))
  }

  # Plan for the loop that interpolates `value` (not implemented here):
  #   for every span 2000 to 2003, 2003 to 2006, and 2006 to 2009
  #   z = (value for 2003 - value for 2000) / 3
  #   value for 2001 = value for 2000 + z
  #   value for 2002 = value for 2001 + z

  return(expanded_data)
}
|
|
|
|
#### testing
|
|
|
|
# INTERPOLATING FUNCTION ON ITS OWN
|
|
|
|
# Linearly interpolate missing `value` entries within each make.
#
# df: data frame with `make` and `value` columns. Rows of a make are taken in
#     their existing order, and interpolation is by row position.
#     NOTE(review): this equals interpolation by year only if each make's rows
#     are sorted by year with exactly one row per year - confirm for real data.
# Returns `df` with each run of NA that lies between two known values replaced
# by equally spaced steps. NAs before the first or after the last known value
# of a make are left as NA.
fill_na_with_increment <- function(df) {
  # Fill NA runs that are bounded by a known value on both sides.
  interpolate_gaps <- function(values) {
    known <- which(!is.na(values))
    # seq_len() gives an empty loop when fewer than two values are known. The
    # previous 1:(length - 1) counted backwards there and stopped with
    # "missing value where TRUE/FALSE needed".
    for (i in seq_len(max(length(known) - 1L, 0L))) {
      start_idx <- known[i]
      end_idx <- known[i + 1L]
      num_missing <- end_idx - start_idx - 1L
      if (num_missing > 0L) {
        # step size so that the last step lands exactly on the next known value
        increment <- (values[end_idx] - values[start_idx]) / (num_missing + 1L)
        steps <- seq_len(num_missing)
        values[start_idx + steps] <- values[start_idx] + increment * steps
      }
    }
    values
  }

  values <- df$value
  if (!is.null(values)) {
    for (make in unique(df$make)) {
      # which() drops NA comparisons, so rows with a missing make are skipped
      rows <- which(df$make == make)
      values[rows] <- interpolate_gaps(values[rows])
    }
    df$value <- values
  }
  df
}
|
|
|
|
# Two makes with every year present and gaps in `value` to interpolate.
car <- c("Honda", "Honda", "Honda", "Honda", "BMW", "BMW", "BMW", "BMW")
test_years2 <- c(2000, 2001, 2002, 2003, 2000, 2001, 2002, 2003)
val <- c(0.3, NA, NA, 3, -2, NA, NA, -2.5)

car_df2 <- data.frame(make = car, year = test_years2, value = val)

filled_car_df2 <- fill_na_with_increment(car_df2)
|
|
|
|
|
|
# COMBINING THE CODE
|
|
|
|
# Expand `df` to one row per make and year, then linearly interpolate `value`
# across the newly created gaps.
#
# df: data frame with `year`, `make` and `value` columns.
# Returns the per-make blocks stacked in order of first appearance of each
# make, each sorted by year. Runs of NA between two known values are replaced
# by equally spaced steps; NAs before the first or after the last known value
# of a make stay NA, as do any other columns in the new rows.
# NOTE(review): interpolation is by row position within a make, which equals
# interpolation by year only with at most one input row per make and year.
fill_with_increment <- function(df) {
  # Fill NA runs that are bounded by a known value on both sides.
  interpolate_gaps <- function(values) {
    known <- which(!is.na(values))
    # seq_len() gives an empty loop when fewer than two values are known. The
    # previous 1:(length - 1) counted backwards there and stopped with
    # "missing value where TRUE/FALSE needed".
    for (i in seq_len(max(length(known) - 1L, 0L))) {
      start_idx <- known[i]
      end_idx <- known[i + 1L]
      num_missing <- end_idx - start_idx - 1L
      if (num_missing > 0L) {
        increment <- (values[end_idx] - values[start_idx]) / (num_missing + 1L)
        steps <- seq_len(num_missing)
        values[start_idx + steps] <- values[start_idx] + increment * steps
      }
    }
    values
  }

  year_grid <- data.frame(year = seq(min(df$year), max(df$year)))
  unique_makes <- unique(df$make)

  # Step 1: one block per make covering every year. Blocks are collected and
  # bound once instead of growing a data frame with rbind() inside the loop.
  blocks <- list()
  for (make in unique_makes) {
    block <- merge(year_grid, df[df$make == make, ], by = "year", all.x = TRUE)
    block$make <- make # label the newly created rows as well
    blocks[[length(blocks) + 1L]] <- block
  }
  if (length(blocks) == 0L) {
    return(data.frame())
  }
  expanded_data <- do.call(rbind, blocks)

  # Step 2: interpolate `value` inside each make's block.
  values <- expanded_data$value
  if (!is.null(values)) {
    for (make in unique_makes) {
      rows <- which(expanded_data$make == make)
      values[rows] <- interpolate_gaps(values[rows])
    }
    expanded_data$value <- values
  }
  expanded_data
}
|
|
|
|
# Three makes, each measured every third year from 2000 to 2009.
car <- c(
  "Honda", "Honda", "Honda", "Honda",
  "BMW", "BMW", "BMW", "BMW",
  "Corvette", "Corvette", "Corvette", "Corvette"
)
test_years2 <- c(
  2000, 2003, 2006, 2009,
  2000, 2003, 2006, 2009,
  2000, 2003, 2006, 2009
)
val <- c(0.3, 3, 3.5, 4, -2.2, -2, -1, 0.4, 2.2, 2.0, 3.5, 5)

car_df4 <- data.frame(make = car, year = test_years2, value = val)

filled_car_df4 <- fill_with_increment(car_df4)
|
|
|
|
|
|
# APPLYING TO BEST DATA
|
|
|
|
# Expand `df` to one row per disease and year, then linearly interpolate
# `mean` across the newly created gaps.
#
# df: data frame with `Year`, `Reconciled_Name` and `mean` columns.
# Returns the per-disease blocks stacked in order of first appearance, each
# sorted by Year. Runs of NA in `mean` between two known values are replaced
# by equally spaced steps. NAs before the first or after the last known value
# of a disease stay NA, and all other columns stay NA in the new rows.
# NOTE(review): interpolation is by row position within a disease, which
# equals interpolation by year only with at most one input row per disease
# and year - confirm against the data.
expand <- function(df) {
  # Fill NA runs that are bounded by a known value on both sides.
  interpolate_gaps <- function(values) {
    known <- which(!is.na(values))
    # seq_len() gives an empty loop when fewer than two values are known, so
    # the earlier is.na() check, `break` and debug print() calls are not
    # needed any more.
    for (i in seq_len(max(length(known) - 1L, 0L))) {
      before_year <- known[i]
      after_year <- known[i + 1L]
      num_missing <- after_year - before_year - 1L
      if (num_missing > 0L) {
        increment <-
          (values[after_year] - values[before_year]) / (num_missing + 1L)
        steps <- seq_len(num_missing)
        values[before_year + steps] <- values[before_year] + increment * steps
      }
    }
    values
  }

  year_grid <- data.frame(Year = seq(min(df$Year), max(df$Year)))
  unique_diseases <- unique(df$Reconciled_Name)

  # Step 1: one block per disease covering every year. merge() sorts each
  # block by Year. Blocks are collected and bound once instead of growing a
  # data frame with rbind() inside the loop.
  blocks <- list()
  for (disease in unique_diseases) {
    block <- merge(
      year_grid,
      df[df$Reconciled_Name == disease, ],
      by = "Year",
      all.x = TRUE
    )
    block$Reconciled_Name <- disease # label the newly created rows as well
    blocks[[length(blocks) + 1L]] <- block
  }
  if (length(blocks) == 0L) {
    return(data.frame())
  }
  expanded_data <- do.call(rbind, blocks)

  # Step 2: interpolate `mean` inside each disease's block.
  means <- expanded_data$mean
  if (!is.null(means)) {
    for (disease in unique_diseases) {
      rows <- which(expanded_data$Reconciled_Name == disease)
      means[rows] <- interpolate_gaps(means[rows])
    }
    expanded_data$mean <- means
  }
  expanded_data
}
|
|
|
|
# Expand and interpolate the year-filtered data.
expanded_best <- expand(best_interpolated)
|