2025-02-02 20:16:42 +00:00
|
|
|
library(tidyverse)
|
|
|
|
library(dplyr)
|
|
|
|
library(lubridate)
|
|
|
|
library(rdd)
|
|
|
|
|
|
|
|
# Load data ----
# Path to the weekly README count CSV used throughout this script.
readme_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"

# The first row of the CSV holds the column names.
df <- read.csv(file = readme_df_filepath, header = TRUE)
|
|
|
|
|
2025-02-03 22:21:27 +00:00
|
|
|
#filtered_df <- df |>
|
|
|
|
# filter(!project_id == "letsencrypt_letsencrypt")
|
|
|
|
|
|
|
|
#output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
|
|
|
|
#write.csv(filtered_df, output_filepath, row.names = FALSE)
|
|
|
|
|
2025-02-02 20:16:42 +00:00
|
|
|
# EDA ----
# Trailing comments hold the values the author recorded for each statistic.

# Commit counts: the recorded variance far exceeds the mean; the median is 0.
var(df[["commit_count"]])      # 112.4945
mean(df[["commit_count"]])     # 2.431342
median(df[["commit_count"]])   # 0

# Age variables (recorded in days).
mean(df[["age"]])              # 4911.734 days
mean(df[["age_at_commit"]])    # 197.296 days
median(df[["age"]])            # 4689 days
median(df[["age_at_commit"]])  # 0 days
|
|
|
|
|
|
|
|
# Scale and log-transform ----
# `scale()` returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes. Coerce to plain numeric vectors so the new
# columns are ordinary data-frame columns rather than matrix columns.
df$scaled_age <- as.numeric(scale(df$age))
df$scaled_age_at_commit <- as.numeric(scale(df$age_at_commit))

# log1p() is log(1 + x), so rows with a commit count of 0 map to 0.
df$log1p_count <- log1p(df$commit_count)
|
|
|
|
|
|
|
|
|
|
|
|
# Getting IK bandwidth ----

#' Imbens-Kalyanaraman bandwidth for one data frame of weekly counts
#'
#' Thin wrapper around `IKbandwidth()` (from the `rdd` package loaded at the
#' top of the script) that uses `week_index` as the running variable and
#' `log1p_count` as the outcome.
#'
#' @param df A data frame containing the columns `week_index` and
#'   `log1p_count`.
#' @param cutpoint Cutpoint on `week_index`, forwarded to `IKbandwidth()`.
#'   Defaults to 0, the value this script has always used.
#' @param kernel Kernel name forwarded to `IKbandwidth()`. Defaults to
#'   "triangular", the value this script has always used.
#' @return The value returned by `IKbandwidth()`.
get_optimal_bandwidth <- function(df, cutpoint = 0, kernel = "triangular") {
  # Fail early with a readable message instead of silently passing NULL
  # columns into IKbandwidth().
  required_cols <- c("week_index", "log1p_count")
  missing_cols <- setdiff(required_cols, names(df))
  if (length(missing_cols) > 0) {
    stop(
      "`df` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  IKbandwidth(
    df[["week_index"]],
    df[["log1p_count"]],
    cutpoint = cutpoint,
    verbose = FALSE,
    kernel = kernel
  )
}
|
|
|
|
|
|
|
|
# Mean of the per-project IK bandwidths ----
# A bandwidth is computed separately from each project's rows, then the
# per-project values are averaged into a single one-row summary.
# The per-group data frame is built explicitly from the two columns the
# helper needs, rather than with `cur_data()`, which is deprecated as of
# dplyr 1.1.0.
mean_optimal_bandwidth <- df %>%
  group_by(project_id) %>%
  summarise(
    optimal_bandwidth = get_optimal_bandwidth(
      data.frame(week_index = week_index, log1p_count = log1p_count)
    )
  ) %>%
  summarise(mean_optimal_bandwidth = mean(optimal_bandwidth))
|
|
|
|
|
|
|
|
#Mean Optimal Bandwidth: 5.44841
|