# govdoc-cr-analysis/mlm/contributing_did_prep.R
# Repository viewer metadata: 30 lines, 969 B, R.
# Viewer timestamp: 2025-02-02 20:16:42 +00:00
library(dplyr)
library(lubridate)
library(rdd)
# Load data ----
# Weekly count data read from a fixed path on the cluster filesystem.
contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
df <- read.csv(contributing_df_filepath, header = TRUE)

# EDA ----
# The trailing comments are the values noted by the author (presumably from
# an earlier run); they are not recomputed or checked here.
var(df$commit_count) # 325.5261
mean(df$commit_count) # 7.743385
median(df$commit_count) # 1
mean(df$age) # 4838.649 days
mean(df$age_at_commit) # 2141.996 days
median(df$age) # 4597 days
median(df$age_at_commit) # 1603 days

# Scale and log-transform ----
# scale() returns a one-column matrix. as.numeric() keeps the same centred
# and scaled values but stores them as a plain numeric vector, so the new
# columns behave like ordinary data-frame columns in later code.
df$scaled_age <- as.numeric(scale(df$age))
df$scaled_age_at_commit <- as.numeric(scale(df$age_at_commit))
# log1p(x) is log(1 + x), so a count of zero maps to zero.
df$log1p_count <- log1p(df$commit_count)
# Getting IK bandwidth ----

#' IK bandwidth for one data frame of weekly observations
#'
#' Passes `week_index` as the running variable and `log1p_count` as the
#' outcome to `IKbandwidth()`, with `verbose = FALSE`.
#'
#' @param df Data frame containing the columns `week_index` and
#'   `log1p_count`.
#' @param cutpoint Cutpoint forwarded to `IKbandwidth()`. Defaults to `0`.
#' @param kernel Kernel name forwarded to `IKbandwidth()`. Defaults to
#'   `"triangular"`.
#' @return The value returned by `IKbandwidth()`.
get_optimal_bandwidth <- function(df, cutpoint = 0, kernel = "triangular") {
  # Fail early with a clear message instead of handing NULL columns to
  # IKbandwidth() when an expected column is absent.
  required_cols <- c("week_index", "log1p_count")
  missing_cols <- setdiff(required_cols, names(df))
  if (length(missing_cols) > 0) {
    stop(
      "get_optimal_bandwidth(): missing column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  IKbandwidth(
    X = df[["week_index"]],
    Y = df[["log1p_count"]],
    cutpoint = cutpoint,
    verbose = FALSE,
    kernel = kernel
  )
}
# Mean of the per-project bandwidths.
# The first summarise() computes one bandwidth per project_id; the second
# averages them, so the result is a one-row tibble with the single column
# `mean_optimal_bandwidth`.
# pick(everything()) passes the current group's rows (grouping column
# excluded) to get_optimal_bandwidth(); it replaces cur_data(), which is
# deprecated as of dplyr 1.1.0.
mean_optimal_bandwidth <- df %>%
  group_by(project_id) %>%
  summarise(
    optimal_bandwidth = get_optimal_bandwidth(pick(everything()))
  ) %>%
  summarise(mean_optimal_bandwidth = mean(optimal_bandwidth))