1
0
govdoc-cr-analysis/mlm/readme_did_prep.R
2025-02-07 10:10:31 -08:00

46 lines
1.3 KiB
R

library(tidyverse)
library(dplyr)
library(lubridate)
library(rdd)
# Load data ----
# Read the README weekly count data from the shared scratch directory.
readme_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_README_weekly_count_data.csv"
df <- read.csv(readme_df_filepath, header = TRUE)

# Disabled one-off step, kept for reference: drop one project and write the
# filtered data back out.
#filtered_df <- df |>
# filter(!project_id == "letsencrypt_letsencrypt")
#output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
#write.csv(filtered_df, output_filepath, row.names = FALSE)

# EDA ----
# The trailing comments are presumably the values seen on an earlier run of
# this script; re-check them if the input file changes.
var(df$commit_count) # 207.7579
mean(df$commit_count) # 4.31725
median(df$commit_count) # 0
mean(df$age) # 5074.828
mean(df$age_at_commit) # 533.7488
median(df$age) # 4876
median(df$age_at_commit) # 0 days

# Scale and log-transform ----
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.numeric() stores a plain numeric vector instead so the new
# columns behave like ordinary data-frame columns.
df$scaled_age <- as.numeric(scale(df$age))
df$scaled_age_at_commit <- as.numeric(scale(df$age_at_commit))
# log1p(x) is log(1 + x), so a commit count of 0 maps to 0 rather than -Inf.
df$log1p_count <- log1p(df$commit_count)
# Getting the IK bandwidth ----

#' Imbens-Kalyanaraman bandwidth for one set of observations
#'
#' Calls `rdd::IKbandwidth()` with `relative_week` as the running variable and
#' `log1p_count` as the outcome, using a cutpoint of 0 and a triangular
#' kernel.
#'
#' @param df A data frame with columns `relative_week` and `log1p_count`.
#' @return The value returned by `IKbandwidth()`, or `NA_real_` if that call
#'   raises an error.
get_optimal_bandwidth <- function(df) {
  tryCatch(
    IKbandwidth(
      df$relative_week,
      df$log1p_count,
      cutpoint = 0,
      verbose = FALSE,
      kernel = "triangular"
    ),
    # Best-effort: a failed estimation yields a missing value instead of
    # aborting the caller. NA_real_ keeps the result a double in both the
    # success and the failure case.
    error = function(e) NA_real_
  )
}
# Estimate one bandwidth per project, discard the projects for which the
# estimation returned NA, and average the rest. The result is a one-row data
# frame with a single column `mean_optimal_bandwidth`.
mean_optimal_bandwidth <- df |>
  group_by(project_id) |>
  summarise(
    # Pass only the two columns the helper reads. This replaces the deprecated
    # cur_data() without requiring a newer dplyr.
    optimal_bandwidth = get_optimal_bandwidth(
      data.frame(relative_week = relative_week, log1p_count = log1p_count)
    ),
    .groups = "drop"
  ) |>
  drop_na(optimal_bandwidth) |>
  summarise(mean_optimal_bandwidth = mean(optimal_bandwidth))
# Mean Optimal Bandwidth: 5.471781