diff --git a/020425_contributing_crescendo_model.rda b/020425_contributing_crescendo_model.rda
new file mode 100644
index 0000000..08d4ae3
Binary files /dev/null and b/020425_contributing_crescendo_model.rda differ
diff --git a/mlm/contributing_crescendo_model.R b/mlm/contributing_crescendo_model.R
new file mode 100644
index 0000000..8131c63
--- /dev/null
+++ b/mlm/contributing_crescendo_model.R
@@ -0,0 +1,38 @@
+# Fit a negative-binomial mixed model of weekly commit counts in the weeks
+# around week_index 0 of the CONTRIBUTING data and save the fitted model.
+library(dplyr)
+library(lubridate)
+library(rdd)
+
+contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
+contributing_df <- read.csv(contributing_df_filepath, header = TRUE)
+
+# Keep rows with week_index in [-window_num, window_num].
+window_num <- 5
+contributing_df <- contributing_df |>
+  filter(week_index >= -window_num & week_index <= window_num) |>
+  mutate(
+    scaled_age = scale(age),
+    scaled_age_at_commit = scale(age_at_commit),
+    log1p_count = log1p(commit_count),
+    # C is 1 when -4 < week_index <= 0 and 0 otherwise.
+    C = ifelse(week_index > -4 & week_index <= 0, 1, 0)
+  )
+
+library(lme4)
+library(optimx)
+library(lattice)
+
+# NOTE(review): glmer.nb() fits a count model, but the response here is
+# log1p(commit_count) rather than commit_count itself -- confirm this is
+# intended before interpreting the coefficients.
+all_gmodel <- glmer.nb(
+  log1p_count ~ C * week_index + scaled_age + scaled_age_at_commit +
+    (C * week_index | project_id),
+  data = contributing_df,
+  control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5)),
+  nAGQ = 0
+)
+summary(all_gmodel)
+# Written with saveRDS(), so load it with readRDS() despite the .rda extension.
+saveRDS(all_gmodel, "020425_contributing_crescendo_model.rda")
diff --git a/quartile_splits.R b/quartile_splits.R
new file mode 100644
index 0000000..f0a104b
--- /dev/null
+++ b/quartile_splits.R
@@ -0,0 +1,72 @@
+# Descriptive quartiles and medians for the CONTRIBUTING and README weekly
+# count data, readability scores, and topic distributions.
+library(dplyr)
+
+quartile_probs <- c(0.25, 0.5, 0.75)
+window_num <- 5
+
+# Read a weekly-count CSV, keep rows with week_index in
+# [-window_num, window_num], and add scaled age columns plus
+# log1p(commit_count).
+load_windowed_counts <- function(path, window_num) {
+  read.csv(path, header = TRUE) |>
+    filter(week_index >= -window_num & week_index <= window_num) |>
+    mutate(
+      scaled_age = scale(age),
+      scaled_age_at_commit = scale(age_at_commit),
+      log1p_count = log1p(commit_count)
+    )
+}
+
+# One row per project_id: mean commit_count and the new_author_emails totals
+# for rows with before_after == 0 and before_after == 1.
+summarize_projects <- function(count_df) {
+  count_df |>
+    group_by(project_id) |>
+    summarize(
+      avg_weekly_commits = mean(commit_count),
+      authors_before = sum(new_author_emails[before_after == 0]),
+      authors_after = sum(new_author_emails[before_after == 1])
+    )
+}
+
+# Quartiles of each named column of df, one column of output per input column.
+column_quartiles <- function(df, columns) {
+  vapply(df[columns], quantile, numeric(3), probs = quartile_probs)
+}
+
+project_columns <- c("avg_weekly_commits", "authors_before", "authors_after")
+readability_metrics <- c(
+  "mcalpine_eflaw", "linsear_write_formula", "flesch_reading_ease"
+)
+
+contributing_count_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
+contributing_count_df <- load_windowed_counts(
+  contributing_count_filepath, window_num
+)
+
+# NOTE(review): dividing by 365 presumably converts days to years -- confirm.
+quantile(contributing_count_df$age / 365, probs = quartile_probs)
+quantile(contributing_count_df$age_at_commit / 365, probs = quartile_probs)
+aggregate_cccd <- summarize_projects(contributing_count_df)
+column_quartiles(aggregate_cccd, project_columns)
+
+readme_count_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
+readme_count_df <- load_windowed_counts(readme_count_filepath, window_num)
+
+quantile(readme_count_df$age / 365, probs = quartile_probs)
+quantile(readme_count_df$age_at_commit / 365, probs = quartile_probs)
+aggregate_rccd <- summarize_projects(readme_count_df)
+column_quartiles(aggregate_rccd, project_columns)
+
+readme_readability <- read.csv("text_analysis/020325_README_readability.csv", header = TRUE)
+vapply(readme_readability[readability_metrics], median, numeric(1))
+
+contributing_readability <- read.csv("text_analysis/020125_CONTRIBUTING_readability.csv", header = TRUE)
+vapply(contributing_readability[readability_metrics], median, numeric(1))
+
+contributing_topic <- read.csv("text_analysis/020125_CONTRIBUTING_file_topic_distributions.csv", header = TRUE)
+column_quartiles(contributing_topic, paste0("t", 0:4))
+
+readme_topic <- read.csv("text_analysis/020325_README_file_topic_distributions.csv", header = TRUE)
+column_quartiles(readme_topic, paste0("t", 0:10))
diff --git a/text_analysis/checking_overlap.R b/text_analysis/checking_overlap.R
new file mode 100644
index 0000000..156b88a
--- /dev/null
+++ b/text_analysis/checking_overlap.R
@@ -0,0 +1,8 @@
+# Find repositories that appear in both the CONTRIBUTING and README manifests.
+library(dplyr)
+
+contributing_manifest <- read.csv("text_analysis/0203_contributing_merged_manifest.csv", header = TRUE)
+readme_manifest <- read.csv("text_analysis/0203_readme_merged_manifest.csv", header = TRUE)
+
+# Keep only rows whose repo_id occurs in both manifests.
+overlap <- inner_join(contributing_manifest, readme_manifest, by = "repo_id")
diff --git a/topic-outcome-models/topic_model_presentation.R b/topic-outcome-models/topic_model_presentation.R
new file mode 100644
index 0000000..371cc9c
--- /dev/null
+++ b/topic-outcome-models/topic_model_presentation.R
@@ -0,0 +1,25 @@
+# Print texreg coefficient tables for the saved README and CONTRIBUTING topic
+# models.
+library(texreg)
+
+readme_rdd <- readRDS("topic-outcome-models/020325_README_commit_topic_model.rda")
+contrib_rdd <- readRDS("topic-outcome-models/020325_CONTRIBUTING_commit_topic_model.rda")
+
+# NOTE(review): custom.coef.names needs one label per coefficient; the counts
+# below (11 for README, 5 for CONTRIBUTING) are assumed to match the saved
+# models -- confirm.
+texreg(
+  readme_rdd,
+  stars = NULL, digits = 3, use.packages = FALSE,
+  custom.model.names = "README",
+  custom.coef.names = paste("Topic", 1:11),
+  table = FALSE, ci.force = TRUE
+)
+
+texreg(
+  contrib_rdd,
+  stars = NULL, digits = 3, use.packages = FALSE,
+  custom.model.names = "CONTRIBUTING",
+  custom.coef.names = paste("Topic", 1:5),
+  table = FALSE, ci.force = TRUE
+)