# README commit-count / topic-distribution preparation script.
#
# Loads the weekly README commit counts and the per-file topic distributions,
# restricts the counts to a symmetric window of weeks around week_index 0,
# totals the commits per project for rows flagged before_after == 1, and joins
# those totals onto the topic distributions by project_id.

library(dplyr)
library(lubridate)
library(rdd)
library(stringr)

# Inputs ----

readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
readme_count_df <- read.csv(readme_count_data_filepath, header = TRUE)

readme_topic_dist_filepath <- "text_analysis/020125_README_file_topic_distributions.csv"
readme_topics_df <- read.csv(readme_topic_dist_filepath, header = TRUE)

# Weekly counts: window and derived columns ----

# Number of weeks kept on each side of week_index 0 (inclusive bounds).
window_num <- 5

# NOTE(review): scale() returns a one-column matrix, so scaled_age and
# scaled_age_at_commit are matrix columns. Left unchanged here because later
# code is not visible; wrap in as.numeric() if plain vectors are wanted.
readme_count_df <- readme_count_df |>
  filter(week_index >= -window_num & week_index <= window_num) |>
  mutate(
    scaled_age = scale(age),
    scaled_age_at_commit = scale(age_at_commit),
    log1p_count = log1p(commit_count)
  )

# Per-project commit totals ----

# Only rows with before_after == 1 are summed (presumably the weeks after the
# event of interest -- confirm against the data dictionary).
summed_data <- readme_count_df |>
  filter(before_after == 1) |>
  group_by(project_id) |>
  summarise(summed_count = sum(commit_count))

# Topic distributions: derive project_id from filename ----

# project_id is the part of the filename before the "_hullabaloo_" separator.
# vapply() guarantees a character vector even for zero-row input, where
# sapply() would return an empty list. The two ifelse() steps pin the
# project_id for two specific filenames explicitly.
readme_topics_df <- readme_topics_df |>
  mutate(
    project_id = vapply(
      str_split(filename, "_hullabaloo_"),
      `[`,
      character(1),
      1
    ),
    project_id = ifelse(
      filename == "jaraco_keyrings.alt_hullabaloo_README.rst",
      "jaraco_keyrings.alt",
      project_id
    ),
    project_id = ifelse(
      filename == "_vcr_vcr_README.md",
      "vcr_vcr",
      project_id
    )
  )

# Join totals with topics ----

# Topic rows whose project has a commit total.
filtered_topics <- readme_topics_df |>
  filter(project_id %in% summed_data$project_id)

# inner_join() keeps only project_ids present in both tables, so joining the
# unfiltered topics table yields the same rows as joining filtered_topics.
merged_df <- inner_join(summed_data, readme_topics_df, by = "project_id")

# Diagnostic: projects that matched more than one topic row in the join.
multiple_mappings <- merged_df |>
  group_by(project_id) |>
  filter(n() > 1) |>
  ungroup()

# log(1 + x) transform of the per-project totals.
merged_df$logged_commits <- log1p(merged_df$summed_count)