library(dplyr)
library(lubridate)
library(rdd)
library(tidyr)

# Location of the CONTRIBUTING weekly count data (CSV).
contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_CONTRIBUTING_weekly_count_data.csv"
# Load the CSV into `df`; the first line of the file supplies column names.
df <- read.csv(file = contributing_df_filepath, header = TRUE)

# EDA ----
# Each trailing comment is the value recorded next to the original call.
var(df[["commit_count"]])       # 349.06
mean(df[["commit_count"]])      # 8.371495
median(df[["commit_count"]])    # 1
mean(df[["age"]])               # 4939.859
mean(df[["age_at_commit"]])     # 2286.772 days
median(df[["age"]])             # 4738 days
median(df[["age_at_commit"]])   # 1806 days

# Scale and log-transform ----
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes. as.numeric() drops the dim and attributes so the
# new columns are plain numeric vectors rather than matrix columns embedded
# in the data frame.
df$scaled_age <- as.numeric(scale(df$age))
df$scaled_age_at_commit <- as.numeric(scale(df$age_at_commit))
# log1p(x) is log(1 + x), so a count of 0 maps to 0 instead of -Inf.
df$log1p_count <- log1p(df$commit_count)

# Getting the IK bandwidth ----

#' Estimate the IK (Imbens-Kalyanaraman) bandwidth for one data frame.
#'
#' Calls `IKbandwidth()` with `relative_week` as the first (running-variable)
#' argument and `log1p_count` as the second (outcome) argument, using a
#' cutpoint of 0 and a triangular kernel.
#'
#' @param df A data frame with `relative_week` and `log1p_count` columns.
#' @return The value returned by `IKbandwidth()`, or `NA_real_` when that
#'   call raises an error.
get_optimal_bandwidth <- function(df) {
  # The tryCatch() result is the function's value, returned visibly (the
  # previous trailing assignment returned it invisibly).
  tryCatch(
    IKbandwidth(
      df$relative_week,
      df$log1p_count,
      cutpoint = 0,
      verbose = FALSE,
      kernel = "triangular"
    ),
    # Best effort: any failure becomes a typed missing value, so the return
    # type is always numeric and callers can filter failed fits with is.na().
    error = function(e) NA_real_
  )
}

# Mean of the per-project IK bandwidths, over the projects for which a
# bandwidth could be computed. The result is a one-row data frame with a
# single `mean_optimal_bandwidth` column.
mean_optimal_bandwidth <- df |>
  group_by(project_id) |>
  summarise(
    # Pass the helper only the two columns it reads. This replaces
    # `cur_data()`, which is deprecated since dplyr 1.1.0, and does not
    # require a newer dplyr than the original code did.
    optimal_bandwidth = get_optimal_bandwidth(
      data.frame(relative_week = relative_week, log1p_count = log1p_count)
    )
  ) |>
  # get_optimal_bandwidth() yields NA when the estimator errors; drop those
  # projects before averaging.
  drop_na(optimal_bandwidth) |>
  summarise(mean_optimal_bandwidth = mean(optimal_bandwidth))

#5.676423