library(tidyverse)
library(dplyr)
library(lubridate)
library(rdd)

# Load the README weekly count data ----
readme_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_README_weekly_count_data.csv"
df <- read.csv(readme_df_filepath, header = TRUE)

# One-off filtering step, kept for reference (disabled):
# filtered_df <- df |>
#   filter(!project_id == "letsencrypt_letsencrypt")
# output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
# write.csv(filtered_df, output_filepath, row.names = FALSE)

# EDA ----
# Trailing comments record the values observed when this was last run.
var(df$commit_count) # 207.7579
mean(df$commit_count) # 4.31725
median(df$commit_count) # 0
mean(df$age) # 5074.828
mean(df$age_at_commit) # 533.7488
median(df$age) # 4876
median(df$age_at_commit) # 0 days

# Scale and log-transform ----
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.numeric() flattens it so the new columns are plain numeric
# vectors rather than matrix columns embedded in the data frame.
df$scaled_age <- as.numeric(scale(df$age))
df$scaled_age_at_commit <- as.numeric(scale(df$age_at_commit))
df$log1p_count <- log1p(df$commit_count)

# Imbens-Kalyanaraman (IK) bandwidth ----

# Compute the IK optimal bandwidth for one project's data.
#
# Args:
#   df: data frame with columns `relative_week` (running variable) and
#       `log1p_count` (outcome).
#
# Returns:
#   The bandwidth returned by rdd::IKbandwidth() with the cutpoint at
#   relative_week == 0 and a triangular kernel, or NA_real_ when
#   IKbandwidth() raises an error.
get_optimal_bandwidth <- function(df) {
  tryCatch(
    IKbandwidth(
      df$relative_week,
      df$log1p_count,
      cutpoint = 0,
      verbose = FALSE,
      kernel = "triangular"
    ),
    # Deliberate best-effort: a project whose bandwidth cannot be estimated
    # yields a typed missing value and is dropped by drop_na() further down.
    # NOTE(review): the failure causes are not visible here (presumably
    # projects with too little data around the cutpoint) -- confirm.
    error = function(e) NA_real_
  )
}

# Per-project bandwidths, averaged over the projects where estimation
# succeeded. The per-group frame is built from the two columns the helper
# reads, which avoids the deprecated dplyr::cur_data().
mean_optimal_bandwidth <- df |>
  group_by(project_id) |>
  summarise(
    optimal_bandwidth = get_optimal_bandwidth(
      tibble(relative_week, log1p_count)
    )
  ) |>
  drop_na(optimal_bandwidth) |>
  summarise(mean_optimal_bandwidth = mean(optimal_bandwidth))
# Mean Optimal Bandwidth: 5.471781