govdoc-cr-analysis/topic-outcome-models/readme_topic_outcome_model.R

library(dplyr)
library(lubridate)
library(rdd)
library(stringr)

# Load the two input tables used by the rest of the script.

# Weekly commit-count table, read from an absolute path.
readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
readme_count_df <- read.csv(readme_count_data_filepath, header = TRUE)

# Topic-distribution table; the path is relative, so it is resolved against the
# working directory the script is run from.
readme_topic_dist_filepath <- "text_analysis/020125_README_file_topic_distributions.csv"
readme_topics_df <- read.csv(readme_topic_dist_filepath, header = TRUE)

# Half-width of the week_index window kept on each side of zero.
window_num <- 5

# Keep only rows with week_index in [-window_num, window_num], then add the
# derived covariates. The scaling runs after the filter, so the means and
# standard deviations are computed over the windowed rows only.
readme_count_df <- readme_count_df |>
  filter(week_index >= -window_num, week_index <= window_num) |>
  mutate(
    # scale() returns a one-column matrix; as.numeric() stores the z-scores as
    # plain numeric columns rather than matrix columns inside the data frame.
    scaled_age = as.numeric(scale(age)),
    scaled_age_at_commit = as.numeric(scale(age_at_commit)),
    log1p_count = log1p(commit_count)
  )

# One row per project_id: the sum of commit_count over the rows where
# before_after == 1. Output columns are project_id and summed_count.
summed_data <- readme_count_df |>
  filter(before_after == 1) |>
  group_by(project_id) |>
  # summarise() replaces the superseded summarise_at(); .groups = "drop" makes
  # the ungrouped result explicit.
  summarise(summed_count = sum(commit_count), .groups = "drop")

# Derive the project_id join key from each topic-distribution filename.
readme_topics_df <- readme_topics_df |>
  mutate(
    # Default rule: the text before the first "_hullabaloo_" separator.
    # vapply() always yields a character vector, including for a zero-row
    # table, where sapply() would return an empty list.
    project_id = vapply(
      str_split(filename, "_hullabaloo_"),
      function(parts) parts[1],
      character(1)
    ),
    # Explicit per-file overrides. "_vcr_vcr_README.md" contains no
    # "_hullabaloo_" separator, so the default rule would keep the whole
    # filename as its project_id.
    project_id = case_when(
      filename == "jaraco_keyrings.alt_hullabaloo_README.rst" ~ "jaraco_keyrings.alt",
      filename == "_vcr_vcr_README.md" ~ "vcr_vcr",
      TRUE ~ project_id
    )
  )
  
# Keep only the projects present in both tables (inner join on project_id),
# then append the log1p-transformed commit total as logged_commits.
merged_df <- summed_data |>
  inner_join(readme_topics_df, by = "project_id") |>
  mutate(logged_commits = log1p(summed_count))
contributing_model_done 2025-02-03 22:21:27 +00:00			`library(dplyr)`
			`library(lubridate)`
			`library(rdd)`
			`library(stringr)`

			`readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"`
			`readme_count_df = read.csv(readme_count_data_filepath, header = TRUE)`

			`readme_topic_dist_filepath <- "text_analysis/020125_README_file_topic_distributions.csv"`
			`readme_topics_df = read.csv(readme_topic_dist_filepath, header = TRUE)`

			`window_num <- 5`
			`readme_count_df <- readme_count_df \|>`
			`filter(week_index >= (- window_num) & week_index <= (window_num)) \|>`
			`mutate(scaled_age = scale(age)) \|>`
			`mutate(scaled_age_at_commit = scale(age_at_commit))\|>`
			`mutate(log1p_count = log1p(commit_count))`

			`summed_data <- readme_count_df \|>`
			`filter(before_after == 1) \|>`
			`group_by(project_id) \|>`
			`summarise_at(vars(commit_count), list(summed_count=sum))`

			`readme_topics_df <- readme_topics_df \|>`
			``mutate(project_id = sapply(str_split(filename, "_hullabaloo_"), `[`, 1)) \|>``
			`mutate(project_id = ifelse(filename=="jaraco_keyrings.alt_hullabaloo_README.rst", "jaraco_keyrings.alt", project_id)) \|>`
			`mutate(project_id = ifelse(filename=="_vcr_vcr_README.md", "vcr_vcr", project_id))`

			`merged_df <- inner_join(summed_data, readme_topics_df, by="project_id")`
			`merged_df$logged_commits <- log1p(merged_df$summed_count)`