stigma-reddit/dataset/preliminary_best_interpolate.R

# Switch the working directory so the relative CSV path below resolves.
# NOTE(review): this path is machine-specific; the script only runs as written
# where this folder exists.
setwd("~/Desktop/cdsc/health literacy/")

# Read Best_Data.csv from the working directory into a data frame.
best = read.csv("Best_Data.csv")

# TRIMMING YEARS I DON'T NEED

# Practice the row filter on a built-in data set first.
library(help = "datasets")   # show what the datasets package contains

airquality                   # print the whole table

# Months to keep, written as strings; the membership test below coerces
# them when comparing against the Month column.
month <- c("5", "9")

# Keep only the rows whose Month is one of the values listed in `month`.
trimmed_airqual <- airquality[is.element(airquality$Month, month), ]

trimmed_airqual


# APPLYING TO BEST

# Years to keep, written as strings; the membership test below coerces
# them when comparing against the Year column.
years <- c("2007", "2010", "2013", "2016")

# Keep only the rows of `best` whose Year is one of the years listed above.
best_interpolated <- best[is.element(best$Year, years), ]


# WRITING FUNCTIONS

# Return three times the input. Works elementwise on numeric vectors.
triple <- function(z) {
  z * 3
}

triple(9)   # quick check: prints 27

# FUNCTIONS TO REPLACE NA VECTOR VALUES WITH A GIVEN VALUE

vector <- c(5, NA, 6, 7, NA, 9)

# Replace every NA in a vector with a single fill value.
#
# vec:   vector that may contain NAs.
# value: the replacement to write into each NA slot; defaults to 4, which
#        is what this function always used before the argument existed.
# Returns `vec` with its NA entries replaced; other entries are untouched.
change <- function(vec, value = 4) {
  # A longer `value` would be recycled across the NA slots, which is
  # almost never what the caller means.
  stopifnot(length(value) == 1L)
  vec[is.na(vec)] <- value
  vec
}

change(vector)

# FUNCTIONS TO REPLACE NA VALUES IN A VECTOR USING AN EQUATION

years2 <- c(2000, NA, NA, 2003, NA, NA, 2006)

# Fill NA entries by counting up from the previous entry.
#
# years: numeric vector that may contain NAs.
# Returns a vector of the same length in which each NA has been replaced by
# (previous entry + 1). A leading NA has no previous entry and stays NA.
fill2 <- function(years) {
  # seq_along(years)[-1] is empty for vectors of length 0 or 1. The old
  # 2:length(years) counted backwards in those cases, which appended a
  # spurious element to a length-1 input and errored on an empty one.
  for (i in seq_along(years)[-1]) {
    if (is.na(years[i])) {
      years[i] <- years[i - 1] + 1
    }
  }
  years
}

fill2(years2)

# CREATING EMPTY ROWS FOR INTERPOLATION

test_years <- c(2000, 2003, 2006)
mean_val <- c(0.2, -3, 4)

df <- data.frame(year = test_years, value = mean_val)

# Pad a table so it has a row for every year between its first and last year.
#
# df: data frame with a `year` column; the other columns are carried along.
# Returns the left join of the full year sequence with `df`, so years that
# were absent from `df` appear as rows whose other columns are NA.
fill <- function(df) {
  every_year <- data.frame(year = seq(from = min(df$year), to = max(df$year)))
  merge(every_year, df, by = "year", all.x = TRUE)
}

new_df <- fill(df)

# CREATING EMPTY ROWS WITH DOUBLE VALUES

# Two makes that share the same pair of years, to see what fill() does
# when a year occurs more than once in the input.
car <- rep(c("Honda", "BMW"), each = 2)
test_years2 <- rep(c(2000, 2003), times = 2)
val <- c(0.3, 3, -2, -1)

car_df <- data.frame(make = car, year = test_years2, value = val)

new_car <- fill(car_df)

# CREATING EMPTY ROWS WITH DOUBLE VALUES THAT FILL THE DISEASE VALUE

    # create the loop

# Pad a make/year table so every make has one row for every year in the
# overall span of the data.
#
# df: data frame with `year` and `make` columns; other columns (such as
#     `value`) are carried along.
# Returns the padded tables of each make stacked in order of first
# appearance. Years a make had no row for come back with NA in the
# carried-along columns; `make` is filled in on every row.
fill <- function(df) {
  # Every year between the earliest and latest year in the whole table.
  every_year <- data.frame(year = seq(min(df$year), max(df$year)))

  per_make <- list()
  for (make in unique(df$make)) {
    # Left-join this make's rows onto the full year list; years it lacks
    # come back as rows of NA.
    padded <- merge(every_year, df[df$make == make, ], by = "year", all.x = TRUE)
    padded$make <- make   # the NA rows need their make written back in
    per_make[[length(per_make) + 1L]] <- padded
  }

  # Stack the per-make tables one after another onto an empty data frame.
  Reduce(rbind, per_make, data.frame())
}

# Test data: two makes, each observed in 2000 and 2003 only.
car <- rep(c("Honda", "BMW"), each = 2)
test_years2 <- rep(c(2000, 2003), times = 2)
val <- c(0.3, 3, -2, -1)

car_df <- data.frame(make = car, year = test_years2, value = val)

# Each make now has rows for 2000 to 2003, with NA values for 2001 and 2002.
new_car <- fill(car_df)


# APPLYING TO BEST FUNCTION

# Pad a Year/Reconciled_Name table so every Reconciled_Name has one row
# for every year in the overall span of the data.
#
# df: data frame with `Year` and `Reconciled_Name` columns; other columns
#     are carried along.
# Returns the padded tables of each Reconciled_Name stacked in order of
# first appearance. Years a name had no row for come back with NA in the
# carried-along columns; `Reconciled_Name` is filled in on every row.
expand <- function(df) {
  # Every year between the earliest and latest Year in the whole table.
  every_year <- data.frame(Year = seq(min(df$Year), max(df$Year)))

  per_disease <- list()
  for (disease in unique(df$Reconciled_Name)) {
    # Left-join this name's rows onto the full year list.
    padded <- merge(
      every_year,
      df[df$Reconciled_Name == disease, ],
      by = "Year",
      all.x = TRUE
    )
    padded$Reconciled_Name <- disease   # label the NA rows as well
    per_disease[[length(per_disease) + 1L]] <- padded
  }

  # Stack the per-name tables one after another onto an empty data frame.
  Reduce(rbind, per_disease, data.frame())
}

# Run expand() on the year-filtered table and keep the result.
expanded_best <- expand(df = best_interpolated)


# INTERPOLATING

  # create the loop

# Pad a make/year table so every make has one row for every year in the
# overall span of the data. Interpolation of the padded rows is not done
# here yet; the plan for it is written out in the comments below.
#
# df: data frame with `year` and `make` columns; other columns (such as
#     `value`) are carried along.
# Returns the padded tables of each make stacked in order of first
# appearance, with NA in the carried-along columns of the added rows.
fill <- function(df) {
  # Full list of years from the earliest to the latest in the table.
  year_grid <- data.frame(year = seq(min(df$year), max(df$year)))

  blocks <- list()
  for (make in unique(df$make)) {
    # Join this make's rows onto the year grid, then label every row
    # (including the newly added NA rows) with the make.
    one_make <- merge(year_grid, df[df$make == make, ], by = "year", all.x = TRUE)
    one_make$make <- make
    blocks[[length(blocks) + 1L]] <- one_make
  }

  stacked <- Reduce(rbind, blocks, data.frame())

  # Plan for the interpolation step (not implemented in this version):
  #   for each pair of known years (2000 to 2003, 2003 to 2006, 2006 to 2009)
  #     z = (value in the later year - value in the earlier year) / 3
  #     value for the first missing year  = earlier value + z
  #     value for the second missing year = that result + z

  return(stacked)
}

#### testing

# INTERPOLATING FUNCTION ON ITS OWN

# Linearly interpolate the NA entries of `value`, separately for each make.
#
# df: data frame with `make` and `value` columns. Rows are used in the order
#     given, and the gap between two known values is measured in rows, so
#     each make's rows should already be in year order with one row per year.
# Returns `df` with every run of NAs that lies between two known values of
# the same make replaced by evenly spaced steps. NAs before the first or
# after the last known value of a make are left as NA, and a make with
# fewer than two known values is returned unchanged.
fill_na_with_increment <- function(df) {
  for (make in unique(df$make)) {
    rows <- which(df$make == make)      # positions of this make in df
    values <- df$value[rows]
    known <- which(!is.na(values))      # positions (within the make) that have data

    # Interpolation needs a known value on each side of a gap. The old
    # 1:(length(known) - 1) counted backwards here and crashed.
    if (length(known) < 2L) {
      next
    }

    for (i in seq_len(length(known) - 1L)) {
      start_idx <- known[i]
      end_idx <- known[i + 1L]
      num_missing <- end_idx - start_idx - 1L   # NAs strictly between the pair

      if (num_missing > 0L) {
        # Size of one step when the gap is split into num_missing + 1 equal parts.
        increment <- (values[end_idx] - values[start_idx]) / (num_missing + 1L)
        steps <- seq_len(num_missing)
        # The j-th missing slot gets start value + j steps.
        values[start_idx + steps] <- values[start_idx] + increment * steps
      }
    }

    df$value[rows] <- values            # write this make's filled values back
  }

  df
}

car = c("Honda", "Honda", "Honda", "Honda", "BMW", "BMW", "BMW", "BMW")
test_years2 = c(2000, 2001, 2002, 2003, 2000, 2001, 2002, 2003)
val = c(0.3, NA, NA, 3, -2, NA, NA, -2.5)

car_df2 = data.frame(make = car, year = test_years2, value = val)

filled_car_df2 = fill_na_with_increment(car_df2)


# COMBINING THE CODE

# Pad each make to one row per year, then linearly interpolate `value`
# across the padded rows.
#
# df: data frame with `make`, `year` and `value` columns.
# Returns a data frame in which every make has a row for each year between
# the earliest and latest year in `df`, stacked by make in order of first
# appearance. Runs of NA between two known values of a make are filled with
# evenly spaced steps. NAs before a make's first or after its last known
# value stay NA, and a make with fewer than two known values is padded but
# not interpolated.
fill_with_increment <- function(df) {
  # Fill each run of NAs that sits between two known entries of `values`.
  interpolate_gaps <- function(values) {
    known <- which(!is.na(values))
    # max(..., 0L) keeps seq_len() valid when nothing is known. The old
    # 1:(length(known) - 1) counted backwards and crashed for makes seen
    # in a single year.
    for (i in seq_len(max(length(known) - 1L, 0L))) {
      start_idx <- known[i]
      end_idx <- known[i + 1L]
      num_missing <- end_idx - start_idx - 1L
      if (num_missing > 0L) {
        increment <- (values[end_idx] - values[start_idx]) / (num_missing + 1L)
        steps <- seq_len(num_missing)
        values[start_idx + steps] <- values[start_idx] + increment * steps
      }
    }
    values
  }

  # Every year between the earliest and latest year in the whole table.
  all_years <- data.frame(year = seq(min(df$year), max(df$year)))

  pieces <- list()
  for (make in unique(df$make)) {
    # Left-join this make's rows onto the full year list; merge() returns
    # the rows sorted by year, so row gaps equal year gaps.
    piece <- merge(all_years, df[which(df$make == make), ], by = "year", all.x = TRUE)
    piece$make <- make                              # label the padded rows too
    piece$value <- interpolate_gaps(piece$value)
    pieces[[length(pieces) + 1L]] <- piece
  }

  if (length(pieces) == 0L) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data frame inside the loop.
  do.call(rbind, pieces)
}

car = c("Honda", "Honda", "Honda", "Honda", "BMW", "BMW", "BMW", "BMW", "Corvette", "Corvette", "Corvette", "Corvette")
test_years2 = c(2000, 2003, 2006, 2009, 2000, 2003, 2006, 2009, 2000, 2003, 2006, 2009)
val = c(0.3, 3, 3.5, 4, -2.2, -2, -1, 0.4, 2.2, 2.0, 3.5, 5)

car_df4 = data.frame(make = car, year = test_years2, value = val)

filled_car_df4 = fill_with_increment(car_df4)


# APPLYING TO BEST DATA

# Pad each Reconciled_Name to one row per year, then linearly interpolate
# `mean` across the padded rows.
#
# df: data frame with `Year`, `Reconciled_Name` and `mean` columns.
# Returns a data frame in which every Reconciled_Name has a row for each
# year between the earliest and latest Year in `df`, stacked by name in
# order of first appearance. Runs of NA in `mean` between two known values
# of a name are filled with evenly spaced steps. NAs before a name's first
# or after its last known value stay NA, and a name with fewer than two
# known values is padded but not interpolated. Nothing is printed.
expand <- function(df) {
  # Fill each run of NAs that sits between two known entries of `values`.
  interpolate_gaps <- function(values) {
    # which() returns positions in increasing order, so neighbouring
    # entries of `known` are neighbouring known years.
    known <- which(!is.na(values))
    # max(..., 0L) makes the loop a no-op with fewer than two known values,
    # replacing the old 1:(n - 1) sequence and its is.na() break.
    for (i in seq_len(max(length(known) - 1L, 0L))) {
      before_year <- known[i]
      after_year <- known[i + 1L]
      num_missing <- after_year - before_year - 1L
      if (num_missing > 0L) {
        increment <- (values[after_year] - values[before_year]) / (num_missing + 1L)
        steps <- seq_len(num_missing)
        values[before_year + steps] <- values[before_year] + increment * steps
      }
    }
    values
  }

  # Every year between the earliest and latest Year in the whole table.
  all_years <- data.frame(Year = seq(min(df$Year), max(df$Year)))

  pieces <- list()
  for (disease in unique(df$Reconciled_Name)) {
    # Left-join this name's rows onto the full year list; merge() returns
    # the rows sorted by Year, so row gaps equal year gaps.
    piece <- merge(
      all_years,
      df[which(df$Reconciled_Name == disease), ],
      by = "Year",
      all.x = TRUE
    )
    piece$Reconciled_Name <- disease                # label the padded rows too
    piece$mean <- interpolate_gaps(piece$mean)
    pieces[[length(pieces) + 1L]] <- piece
  }

  if (length(pieces) == 0L) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data frame inside the loop.
  do.call(rbind, pieces)
}

# Run expand() on the year-filtered table and keep the result.
expanded_best <- expand(df = best_interpolated)