diff --git a/.RData b/.RData index ad0122b..078b83e 100644 Binary files a/.RData and b/.RData differ diff --git a/mg-govdoc-cr_24074016.out b/mg-govdoc-cr_24085748.out similarity index 77% rename from mg-govdoc-cr_24074016.out rename to mg-govdoc-cr_24085748.out index e84d9fb..4ae0279 100644 --- a/mg-govdoc-cr_24074016.out +++ b/mg-govdoc-cr_24085748.out @@ -1,17 +1,17 @@ 1. SSH tunnel from your workstation using the following command: - ssh -N -L 8787:n3434:49157 mjilg@klone.hyak.uw.edu + ssh -N -L 8787:n3436:56811 mjilg@klone.hyak.uw.edu and point your web browser to http://localhost:8787 2. log in to RStudio Server using the following credentials: user: mjilg - password: shDpem/m/RHo7HO1CuWG + password: +g73U+bdF4uygmNdsKEt When done using RStudio Server, terminate the job by: 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) 2. Issue the following command on the login node: - scancel -f 24074016 + scancel -f 24085748 diff --git a/plots/0207-blup-readability-plot.png b/plots/0207-blup-readability-plot.png new file mode 100644 index 0000000..413649e Binary files /dev/null and b/plots/0207-blup-readability-plot.png differ diff --git a/plots/cr-0207-contributing-blup.png b/plots/cr-0207-contributing-blup.png new file mode 100644 index 0000000..5cb8a9f Binary files /dev/null and b/plots/cr-0207-contributing-blup.png differ diff --git a/plots/cr-0207-wc-density.png b/plots/cr-0207-wc-density.png new file mode 100644 index 0000000..d8d2a97 Binary files /dev/null and b/plots/cr-0207-wc-density.png differ diff --git a/plots/mem_presentation.R b/plots/mem_presentation.R index f9ab6de..e4ea54b 100644 --- a/plots/mem_presentation.R +++ b/plots/mem_presentation.R @@ -1,16 +1,16 @@ library(tidyverse) library(texreg) -readme_rdd <- readRDS("mlm/models/020325_readme_model.rda") -contrib_rdd <- readRDS("mlm/models/020125_contributing_model.rda") +readme_rdd <- readRDS("mlm/models/020725_readme_model.rda") +contrib_rdd <- readRDS("mlm/models/020725_contributing_model.rda") texreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, custom.model.names=c( 'README','CONTRIBUTING'), custom.coef.names=c('(Intercept)', 'Indtroduction', 'Week (Time)', 'Project Age', 'Introduction:Week'), table=FALSE, ci.force = TRUE) -readme_groupings <- read.csv('mlm/data/0203_readme_dweek_ranefs.csv') -contrib_groupings <- read.csv('mlm/data/0201_contributing_dweek_ranefs.csv') +readme_groupings <- read.csv('mlm/data/0207_readme_dweek_ranefs.csv') +contrib_groupings <- read.csv('mlm/data/0207_contributing_dweek_ranefs.csv') subdirColors <- setNames( c('#31449c', '#4a7c85', '#c5db68') @@ -35,7 +35,7 @@ contrib_g <- contrib_groupings |> labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") contrib_g -#ggsave(filename = "plots/cr-0203-contributing-blup.png", plot = contrib_g, width = 9, height = 9, dpi = 800) +ggsave(filename = "plots/cr-0207-contributing-blup.png", plot = contrib_g, width = 9, height = 9, dpi = 800) texreg(list(readme_commits_, commits_), stars=NULL, digits=3, use.packages=FALSE, diff --git a/plots/text_presentation.R b/plots/text_presentation.R index c213488..17f8a8e 100644 --- a/plots/text_presentation.R +++ b/plots/text_presentation.R @@ -1,22 +1,24 @@ library(tidyverse) -readme_groupings <- read.csv('text_analysis/0203_readme_merged_manifest.csv') -contrib_groupings <- read.csv('text_analysis/0203_contributing_merged_manifest.csv') +readme_groupings <- read.csv('text_analysis/0207_readme_merged_manifest.csv') +contrib_groupings <- read.csv('text_analysis/0207_contributing_merged_manifest.csv') contrib_groupings$filename <- contrib_groupings$fvf_filepath readme_groupings$filename <- readme_groupings$fvf_filepath -readme_textstat <- read.csv('text_analysis/020325_README_readability.csv') -contributing_textstat <- read.csv('text_analysis/020125_CONTRIBUTING_readability.csv') +readme_textstat <- read.csv('text_analysis/020725_README_readability.csv') +contributing_textstat <- read.csv('text_analysis/020725_CONTRIBUTING_readability.csv') doctypeColors <- - setNames( c('#5da2d8', '#c7756a') - , c("CONTRIBUTING", "README")) + setNames( c('#c7756a','#5da2d8') + , c("README","CONTRIBUTING")) readme_textstat$type = "README" contributing_textstat$type = "CONTRIBUTING" -all_df = rbind(readme_textstat, contributing_textstat) +all_df = rbind(contributing_textstat,readme_textstat) +all_df$type <- factor(all_df$type, levels = c("CONTRIBUTING", "README")) + length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) + geom_density(aes(fill = as.factor(type)), color = NA, alpha=0.6, position="identity")+ scale_fill_manual(values = doctypeColors) + - xlim(-10, 500) + + xlim(-10, 600) + labs( x = "Word Count", y = "Density Across Documents", @@ -26,10 +28,10 @@ length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) + theme(legend.position = "top") length_plot_all -#ggsave(filename = "plots/cr-0203-wc-density.png", plot = length_plot_all, width = 9, height = 9, dpi = 800) +#ggsave(filename = "plots/cr-0207-wc-density.png", plot = length_plot_all, width = 9, height = 9, dpi = 800) -contributing_df <- inner_join(contributing_textstat, contrib_groupings, by="filename") -readme_df <- inner_join(readme_textstat, readme_groupings, by="filename") +contributing_df <- inner_join(contributing_textstat, contrib_groupings, by=c("filename"="new_filepath")) +readme_df <- inner_join(readme_textstat, readme_groupings, by=c("filename"="new_filepath")) subdirColors <- setNames( c('#31449c', '#4a7c85', '#c5db68') @@ -42,7 +44,7 @@ contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, gr labs(x= NULL, y= NULL, fill="RE Grouping")+ theme_bw() + theme(legend.position = "inside", - legend.position.inside = c(.90, .90), + legend.position.inside = c(.89, .92), legend.justification = c("right", "top"), legend.direction = "horizontal", legend.margin = margin(6, 6, 6, 6)) @@ -64,7 +66,7 @@ readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.facto labs(x= "Reading Time (s)", y= NULL)+ guides(fill="none", color="none")+ theme_bw() -#readme_reading_time_plot +readme_reading_time_plot readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(ranef_grouping))) + geom_density(aes(fill=as.factor(ranef_grouping)), position="fill") + @@ -73,7 +75,7 @@ readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.fac labs(x= "Flesch Reading Ease", y= "README Density")+ guides(fill="none", color="none")+ theme_bw() -#readme_reading_ease +readme_reading_ease library(gridExtra) grid.arrange(contributing_reading_ease, contributing_reading_time_plot, readme_reading_ease, readme_reading_time_plot, nrow = 2) diff --git a/topic-outcome-models/020325_CONTRIBUTING_commit_topic_model.rda b/topic-outcome-models/020325_CONTRIBUTING_commit_topic_model.rda deleted file mode 100644 index 123d23a..0000000 Binary files a/topic-outcome-models/020325_CONTRIBUTING_commit_topic_model.rda and /dev/null differ diff --git a/topic-outcome-models/020325_README_commit_topic_model.rda b/topic-outcome-models/020325_README_commit_topic_model.rda deleted file mode 100644 index dd3e05f..0000000 Binary files a/topic-outcome-models/020325_README_commit_topic_model.rda and /dev/null differ diff --git a/topic-outcome-models/020725_CONTRIBUTING_commit_topic_model.rda b/topic-outcome-models/020725_CONTRIBUTING_commit_topic_model.rda new file mode 100644 index 0000000..caee617 Binary files /dev/null and b/topic-outcome-models/020725_CONTRIBUTING_commit_topic_model.rda differ diff --git a/topic-outcome-models/020725_README_commit_topic_model.rda b/topic-outcome-models/020725_README_commit_topic_model.rda new file mode 100644 index 0000000..bc1415c Binary files /dev/null and b/topic-outcome-models/020725_README_commit_topic_model.rda differ diff --git a/topic-outcome-models/contributing_topic_outcome_model.R b/topic-outcome-models/contributing_topic_outcome_model.R index 822c843..720a3f5 100644 --- a/topic-outcome-models/contributing_topic_outcome_model.R +++ b/topic-outcome-models/contributing_topic_outcome_model.R @@ -3,15 +3,20 @@ library(lubridate) library(rdd) library(stringr) -contributing_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv" +contributing_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_CONTRIBUTING_weekly_count_data.csv" contributing_count_df = read.csv(contributing_count_data_filepath, header = TRUE) -contributing_topic_dist_filepath <- "text_analysis/020125_CONTRIBUTING_file_topic_distributions.csv" +contributing_topic_dist_filepath <- "text_analysis/020725_CONTRIBUTING_file_topic_distributions.csv" contributing_topics_df = read.csv(contributing_topic_dist_filepath, header = TRUE) +contributing_merged_manifest <- "text_analysis/0207_contributing_merged_manifest.csv" +contributing_manifest_df <- read.csv(contributing_merged_manifest, header=TRUE) + +merged_df <- inner_join(contributing_manifest_df, contributing_topics_df, by=c("new_filepath"= "filename")) + window_num <- 5 contributing_count_df <- contributing_count_df |> - filter(week_index >= (- window_num) & week_index <= (window_num)) |> + filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> mutate(scaled_age = scale(age)) |> mutate(scaled_age_at_commit = scale(age_at_commit))|> mutate(log1p_count = log1p(commit_count)) @@ -21,12 +26,7 @@ summed_data <- contributing_count_df |> group_by(project_id) |> summarise_at(vars(commit_count), list(summed_count=sum)) -contributing_topics_df <- contributing_topics_df |> - mutate(project_id = sapply(str_split(filename, "_hullabaloo_"), `[`, 1)) |> - mutate(project_id = ifelse(filename=="_vcr_vcr_CONTRIBUTING.md", "vcr_vcr", project_id)) |> - mutate(project_id = ifelse(filename=="marshmallow-code_marshmallow.git_CONTRIBUTING.rst", "marshmallow-code_marshmallow.git", project_id)) - -merged_df <- inner_join(summed_data, contributing_topics_df, by="project_id") +merged_df <- inner_join(summed_data, merged_df, by=c("project_id" = "repo_id")) merged_df$logged_commits <- log1p(merged_df$summed_count) library(MASS) @@ -34,4 +34,4 @@ library(MASS) commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4, data=merged_df) qqnorm(residuals(commit_outcome_model)) summary(commit_outcome_model) -saveRDS(commit_outcome_model, "020325_commit_topic_model.rda") +saveRDS(commit_outcome_model, "020725_commit_topic_model.rda") diff --git a/topic-outcome-models/readme_topic_outcome_model.R b/topic-outcome-models/readme_topic_outcome_model.R index 6b4100c..3551d08 100644 --- a/topic-outcome-models/readme_topic_outcome_model.R +++ b/topic-outcome-models/readme_topic_outcome_model.R @@ -3,15 +3,20 @@ library(lubridate) library(rdd) library(stringr) -readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv" +readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_README_weekly_count_data.csv" readme_count_df = read.csv(readme_count_data_filepath, header = TRUE) -readme_topic_dist_filepath <- "text_analysis/020325_README_file_topic_distributions.csv" +readme_topic_dist_filepath <- "text_analysis/020725_README_file_topic_distributions.csv" readme_topics_df = read.csv(readme_topic_dist_filepath, header = TRUE) +readme_merged_manifest <- "text_analysis/0207_readme_merged_manifest.csv" +readme_manifest_df <- read.csv(readme_merged_manifest, header=TRUE) + +merged_df <- inner_join(readme_manifest_df, readme_topics_df, by=c("new_filepath"= "filename")) + window_num <- 5 readme_count_df <- readme_count_df |> - filter(week_index >= (- window_num) & week_index <= (window_num)) |> + filter(relative_week >= (- window_num) & relative_week <= (window_num)) |> mutate(scaled_age = scale(age)) |> mutate(scaled_age_at_commit = scale(age_at_commit))|> mutate(log1p_count = log1p(commit_count)) @@ -21,19 +26,15 @@ summed_data <- readme_count_df |> group_by(project_id) |> summarise_at(vars(commit_count), list(summed_count=sum)) -readme_topics_df <- readme_topics_df |> - mutate(project_id = sapply(str_split(filename, "_hullabaloo_"), `[`, 1)) |> - mutate(project_id = ifelse(filename=="jaraco_keyrings.alt_hullabaloo_README.rst", "jaraco_keyrings.alt", project_id)) |> - mutate(project_id = ifelse(filename=="_vcr_vcr_README.md", "vcr_vcr", project_id)) - + #loss of jaraco_keyring, though jaraco keyrings.alt is represented -merged_df <- inner_join(summed_data, readme_topics_df, by="project_id") +merged_df <- inner_join(summed_data, merged_df, by=c("project_id" = "repo_id")) merged_df$logged_commits <- log1p(merged_df$summed_count) library(MASS) -commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8 + t9 + t10, data=merged_df) +commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8, data=merged_df) qqnorm(residuals(commit_outcome_model)) summary(commit_outcome_model) -saveRDS(commit_outcome_model, "020325_CONTRIBUTING_commit_topic_model.rda") +saveRDS(commit_outcome_model, "020725_README_commit_topic_model.rda")