diff --git a/.RData b/.RData index 078b83e..271cfb4 100644 Binary files a/.RData and b/.RData differ diff --git a/020725_contributing_crescendo_model.rda b/020725_contributing_crescendo_model.rda new file mode 100644 index 0000000..02675c4 Binary files /dev/null and b/020725_contributing_crescendo_model.rda differ diff --git a/intersection.R b/intersection.R new file mode 100644 index 0000000..e69de29 diff --git a/mg-govdoc-cr_24085748.out b/mg-govdoc-cr_24092572.out similarity index 77% rename from mg-govdoc-cr_24085748.out rename to mg-govdoc-cr_24092572.out index 4ae0279..80663aa 100644 --- a/mg-govdoc-cr_24085748.out +++ b/mg-govdoc-cr_24092572.out @@ -1,17 +1,17 @@ 1. SSH tunnel from your workstation using the following command: - ssh -N -L 8787:n3436:56811 mjilg@klone.hyak.uw.edu + ssh -N -L 8787:n3433:51613 mjilg@klone.hyak.uw.edu and point your web browser to http://localhost:8787 2. log in to RStudio Server using the following credentials: user: mjilg - password: +g73U+bdF4uygmNdsKEt + password: ezvnunJrjaDZsvr0nhPR When done using RStudio Server, terminate the job by: 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) 2. Issue the following command on the login node: - scancel -f 24085748 + scancel -f 24092572 diff --git a/mlm/contributing_crescendo_model.R b/mlm/contributing_crescendo_model.R index 8131c63..72188a0 100644 --- a/mlm/contributing_crescendo_model.R +++ b/mlm/contributing_crescendo_model.R @@ -2,23 +2,23 @@ library(dplyr) library(lubridate) library(rdd) -contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv" +contributing_df_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_CONTRIBUTING_weekly_count_data.csv" contributing_df = read.csv(contributing_df_filepath, header = TRUE) window_num <- 5 contributing_df <- contributing_df |> - filter(week_index >= (- window_num) & week_index <= (window_num)) |> + filter(relative_week >= (- window_num) & relative_week <= (0)) |> mutate(scaled_age = scale(age)) |> mutate(scaled_age_at_commit = scale(age_at_commit))|> mutate(log1p_count = log1p(commit_count)) |> - mutate(C = ifelse(week_index > -4 & week_index <= 0, 1,0)) + mutate(C = ifelse(relative_week > -3 & relative_week <= 0, 1,0)) library(lme4) library(optimx) library(lattice) -all_gmodel <- glmer.nb(log1p_count ~ C * week_index + scaled_age + scaled_age_at_commit + (C * week_index | project_id), +all_gmodel <- glmer.nb(log1p_count ~ C * relative_week + scaled_age + scaled_age_at_commit + (C * relative_week | project_id), control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)), nAGQ=0, data=contributing_df) summary(all_gmodel) -saveRDS(all_gmodel, "020425_contributing_crescendo_model.rda") +saveRDS(all_gmodel, "020725_contributing_crescendo_model.rda") diff --git a/quartile_splits.R b/quartile_splits.R index f0a104b..f5ae1e6 100644 --- a/quartile_splits.R +++ b/quartile_splits.R @@ -1,9 +1,9 @@ -contributing_count_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv" +contributing_count_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_CONTRIBUTING_weekly_count_data.csv" contributing_count_df = read.csv(contributing_count_filepath, header = TRUE) window_num <- 5 contributing_count_df <- contributing_count_df |> - filter(week_index >= (- window_num) & week_index <= (window_num)) |> + filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> mutate(scaled_age = scale(age)) |> mutate(scaled_age_at_commit = scale(age_at_commit))|> mutate(log1p_count = log1p(commit_count)) @@ -21,18 +21,19 @@ quantile(aggregate_cccd$avg_weekly_commits, probs = c(0.25, 0.5, 0.75)) quantile(aggregate_cccd$authors_before, probs = c(0.25, 0.5, 0.75)) quantile(aggregate_cccd$authors_after, probs = c(0.25, 0.5, 0.75)) -readme_count_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv" +readme_count_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_README_weekly_count_data.csv" readme_count_df = read.csv(readme_count_filepath, header = TRUE) window_num <- 5 readme_count_df <- readme_count_df |> - filter(week_index >= (- window_num) & week_index <= (window_num)) |> + filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> mutate(scaled_age = scale(age)) |> mutate(scaled_age_at_commit = scale(age_at_commit))|> mutate(log1p_count = log1p(commit_count)) quantile(readme_count_df$age / 365, probs = c(0.25, 0.5, 0.75)) quantile(readme_count_df$age_at_commit / 365, probs = c(0.25, 0.5, 0.75)) +sd(readme_count_df$age_at_commit / 365) aggregate_rccd <- readme_count_df |> group_by(project_id) |> summarize( @@ -44,25 +45,30 @@ quantile(aggregate_rccd$avg_weekly_commits, probs = c(0.25, 0.5, 0.75)) quantile(aggregate_rccd$authors_before, probs = c(0.25, 0.5, 0.75)) quantile(aggregate_rccd$authors_after, probs = c(0.25, 0.5, 0.75)) -readme_readability <- read.csv("text_analysis/020325_README_readability.csv", header=TRUE) +readme_readability <- read.csv("text_analysis/020725_README_readability.csv", header=TRUE) median(readme_readability$mcalpine_eflaw) median(readme_readability$linsear_write_formula) median(readme_readability$flesch_reading_ease) +median(readme_readability$reading_time) +readme_less_than_10 <- readme_readability |> + filter(reading_time < 10) -contributing_readability <- read.csv("text_analysis/020125_CONTRIBUTING_readability.csv", header=TRUE) +contributing_readability <- read.csv("text_analysis/020725_CONTRIBUTING_readability.csv", header=TRUE) median(contributing_readability$mcalpine_eflaw) median(contributing_readability$linsear_write_formula) median(contributing_readability$flesch_reading_ease) +median(contributing_readability$reading_time) +contrib_less_than_10 <- contributing_readability |> + filter(reading_time < 10) - -contributing_topic <- read.csv("text_analysis/020125_CONTRIBUTING_file_topic_distributions.csv", header=TRUE) +contributing_topic <- read.csv("text_analysis/020725_CONTRIBUTING_file_topic_distributions.csv", header=TRUE) quantile(contributing_topic$t0, probs = c(0.25, 0.5, 0.75)) quantile(contributing_topic$t1, probs = c(0.25, 0.5, 0.75)) quantile(contributing_topic$t2, probs = c(0.25, 0.5, 0.75)) quantile(contributing_topic$t3, probs = c(0.25, 0.5, 0.75)) quantile(contributing_topic$t4, probs = c(0.25, 0.5, 0.75)) -readme_topic <- read.csv("text_analysis/020325_README_file_topic_distributions.csv", header=TRUE) +readme_topic <- read.csv("text_analysis/020725_README_file_topic_distributions.csv", header=TRUE) quantile(readme_topic$t0, probs = c(0.25, 0.5, 0.75)) quantile(readme_topic$t1, probs = c(0.25, 0.5, 0.75)) quantile(readme_topic$t2, probs = c(0.25, 0.5, 0.75)) @@ -72,8 +78,6 @@ quantile(readme_topic$t5, probs = c(0.25, 0.5, 0.75)) quantile(readme_topic$t6, probs = c(0.25, 0.5, 0.75)) quantile(readme_topic$t7, probs = c(0.25, 0.5, 0.75)) quantile(readme_topic$t8, probs = c(0.25, 0.5, 0.75)) -quantile(readme_topic$t9, probs = c(0.25, 0.5, 0.75)) -quantile(readme_topic$t10, probs = c(0.25, 0.5, 0.75)) diff --git a/text_analysis/checking_overlap.R b/text_analysis/checking_overlap.R index 156b88a..1617a0c 100644 --- a/text_analysis/checking_overlap.R +++ b/text_analysis/checking_overlap.R @@ -1,4 +1,5 @@ -contributing_manifest <- read.csv("text_analysis/0203_contributing_merged_manifest.csv", header=TRUE) -readme_manifest <- read.csv("text_analysis/0203_readme_merged_manifest.csv", header=TRUE) +contributing_manifest <- read.csv("text_analysis/0207_contributing_merged_manifest.csv", header=TRUE) +readme_manifest <- read.csv("text_analysis/0207_readme_merged_manifest.csv", header=TRUE) overlap = inner_join(contributing_manifest, readme_manifest, by="repo_id") +length(overlap) diff --git a/topic-outcome-models/topic_model_presentation.R b/topic-outcome-models/topic_model_presentation.R index 371cc9c..7bd0983 100644 --- a/topic-outcome-models/topic_model_presentation.R +++ b/topic-outcome-models/topic_model_presentation.R @@ -1,11 +1,11 @@ library(texreg) -readme_rdd <- readRDS("topic-outcome-models/020325_README_commit_topic_model.rda") -contrib_rdd <- readRDS("topic-outcome-models/020325_CONTRIBUTING_commit_topic_model.rda") +readme_rdd <- readRDS("topic-outcome-models/020725_README_commit_topic_model.rda") +contrib_rdd <- readRDS("topic-outcome-models/020725_CONTRIBUTING_commit_topic_model.rda") texreg(readme_rdd, stars=NULL, digits=3, use.packages=FALSE, custom.model.names=c( 'README'), - custom.coef.names=c('Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9', 'Topic 10', 'Topic 11'), + custom.coef.names=c('Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9'), table=FALSE, ci.force = TRUE) texreg(contrib_rdd, stars=NULL, digits=3, use.packages=FALSE,