# Relates the topic composition of each project's CONTRIBUTING file to the
# number of commits the project received, and saves the fitted model.
#
# Inputs (CSV):
#   - weekly commit counts per project (absolute path on the cluster)
#   - per-file topic distributions (columns t0..t4 are used below)
#   - manifest linking file paths to repositories (joined on new_filepath
#     and repo_id)
# Output:
#   - 020725_commit_topic_model.rda (written with saveRDS)

# MASS is attached first so that its select() does not mask dplyr::select().
library(MASS)
library(dplyr)
library(lubridate)
library(rdd)
library(stringr)

# ---- Load data ----
contributing_count_data_filepath <-
  "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_CONTRIBUTING_weekly_count_data.csv"
contributing_count_df <- read.csv(
  contributing_count_data_filepath,
  header = TRUE
)

contributing_topic_dist_filepath <-
  "text_analysis/020725_CONTRIBUTING_file_topic_distributions.csv"
contributing_topics_df <- read.csv(
  contributing_topic_dist_filepath,
  header = TRUE
)

contributing_merged_manifest <-
  "text_analysis/0207_contributing_merged_manifest.csv"
contributing_manifest_df <- read.csv(
  contributing_merged_manifest,
  header = TRUE
)

# Attach the topic distribution of each file to its manifest row.
merged_df <- inner_join(
  contributing_manifest_df,
  contributing_topics_df,
  by = c("new_filepath" = "filename")
)

# ---- Restrict to the analysis window and derive covariates ----
# Keep only weeks within +/- window_num of relative week 0.
window_num <- 5
contributing_count_df <- contributing_count_df |>
  filter(relative_week >= -window_num & relative_week <= window_num) |>
  mutate(
    # scale() returns a one-column matrix; as.numeric() stores a plain
    # numeric vector in the data frame instead of a matrix column.
    scaled_age = as.numeric(scale(age)),
    scaled_age_at_commit = as.numeric(scale(age_at_commit)),
    log1p_count = log1p(commit_count)
  )

# ---- Total commits per project where before_after == 1 ----
# NOTE(review): before_after == 1 presumably marks the weeks after the
# CONTRIBUTING file was introduced -- confirm against the data dictionary.
summed_data <- contributing_count_df |>
  filter(before_after == 1) |>
  group_by(project_id) |>
  summarise(summed_count = sum(commit_count))

merged_df <- inner_join(
  summed_data,
  merged_df,
  by = c("project_id" = "repo_id")
)

# Kept for descriptive use; it is NOT the response of the model below.
merged_df$logged_commits <- log1p(merged_df$summed_count)

# ---- Model ----
# glm.nb() fits a negative binomial GLM with a log link, so the response
# must be the raw non-negative count. Fitting the log1p-transformed outcome
# (as this script previously did) feeds non-integer values to a count
# likelihood and applies the log transformation twice.
# The intercept is suppressed (0 + ...) and the five topic columns enter
# directly; presumably they sum to one per file, which would make an
# intercept collinear -- TODO confirm.
commit_outcome_model <- glm.nb(
  summed_count ~ 0 + t0 + t1 + t2 + t3 + t4,
  data = merged_df
)

# Diagnostics: normal Q-Q plot of the model residuals and coefficient table.
qqnorm(residuals(commit_outcome_model))
summary(commit_outcome_model)

# The file is written in RDS format despite the .rda extension, so it must
# be read back with readRDS(), not load().
saveRDS(commit_outcome_model, "020725_commit_topic_model.rda")