Update outcome models and plots for the 0207 data refresh

2025-02-07 16:59:56 -08:00 · 2025-02-07 16:59:56 -08:00 · 212e68a056
commit 212e68a056
parent 4be3640858
13 changed files with 46 additions and 43 deletions
--- a/.RData
+++ b/.RData
--- a/mg-govdoc-cr_24085748.out
+++ b/mg-govdoc-cr_24085748.out
@ -1,17 +1,17 @@
 1. SSH tunnel from your workstation using the following command:

-   ssh -N -L 8787:n3434:49157 mjilg@klone.hyak.uw.edu
+   ssh -N -L 8787:n3436:56811 mjilg@klone.hyak.uw.edu

   and point your web browser to http://localhost:8787

 2. log in to RStudio Server using the following credentials:

   user: mjilg
-   password: shDpem/m/RHo7HO1CuWG
+   password: +g73U+bdF4uygmNdsKEt

 When done using RStudio Server, terminate the job by:

 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
 2. Issue the following command on the login node:

-      scancel -f 24074016
+      scancel -f 24085748
--- a/plots/0207-blup-readability-plot.png
+++ b/plots/0207-blup-readability-plot.png
--- a/plots/cr-0207-contributing-blup.png
+++ b/plots/cr-0207-contributing-blup.png
--- a/plots/cr-0207-wc-density.png
+++ b/plots/cr-0207-wc-density.png
--- a/plots/mem_presentation.R
+++ b/plots/mem_presentation.R
@ -1,16 +1,16 @@
 library(tidyverse)
 library(texreg)

-readme_rdd <- readRDS("mlm/models/020325_readme_model.rda")
-contrib_rdd <- readRDS("mlm/models/020125_contributing_model.rda")
+readme_rdd <- readRDS("mlm/models/020725_readme_model.rda")
+contrib_rdd <- readRDS("mlm/models/020725_contributing_model.rda")

 texreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, 
       custom.model.names=c( 'README','CONTRIBUTING'),
       custom.coef.names=c('(Intercept)', 'Indtroduction', 'Week (Time)', 'Project Age', 'Introduction:Week'), 
       table=FALSE, ci.force = TRUE)

-readme_groupings <- read.csv('mlm/data/0203_readme_dweek_ranefs.csv')
-contrib_groupings <- read.csv('mlm/data/0201_contributing_dweek_ranefs.csv')
+readme_groupings <- read.csv('mlm/data/0207_readme_dweek_ranefs.csv')
+contrib_groupings <- read.csv('mlm/data/0207_contributing_dweek_ranefs.csv')

 subdirColors <-
  setNames( c('#31449c', '#4a7c85', '#c5db68')
@ -35,7 +35,7 @@ contrib_g <- contrib_groupings |>
  labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping")
 contrib_g

-#ggsave(filename = "plots/cr-0203-contributing-blup.png", plot = contrib_g, width = 9, height = 9, dpi = 800)
+ggsave(filename = "plots/cr-0207-contributing-blup.png", plot = contrib_g, width = 9, height = 9, dpi = 800)


 texreg(list(readme_commits_, commits_), stars=NULL, digits=3, use.packages=FALSE, 
--- a/plots/text_presentation.R
+++ b/plots/text_presentation.R
@ -1,22 +1,24 @@
 library(tidyverse)
-readme_groupings <- read.csv('text_analysis/0203_readme_merged_manifest.csv')
-contrib_groupings <- read.csv('text_analysis/0203_contributing_merged_manifest.csv')
+readme_groupings <- read.csv('text_analysis/0207_readme_merged_manifest.csv')
+contrib_groupings <- read.csv('text_analysis/0207_contributing_merged_manifest.csv')
 contrib_groupings$filename <- contrib_groupings$fvf_filepath
 readme_groupings$filename <- readme_groupings$fvf_filepath
-readme_textstat <- read.csv('text_analysis/020325_README_readability.csv')
-contributing_textstat <- read.csv('text_analysis/020125_CONTRIBUTING_readability.csv')
+readme_textstat <- read.csv('text_analysis/020725_README_readability.csv')
+contributing_textstat <- read.csv('text_analysis/020725_CONTRIBUTING_readability.csv')


 doctypeColors <-
-  setNames( c('#5da2d8', '#c7756a')
-            , c("CONTRIBUTING", "README"))
+  setNames( c('#c7756a','#5da2d8')
+            , c("README","CONTRIBUTING"))
 readme_textstat$type = "README"
 contributing_textstat$type = "CONTRIBUTING"
-all_df = rbind(readme_textstat, contributing_textstat)
+all_df = rbind(contributing_textstat,readme_textstat)
+all_df$type <- factor(all_df$type, levels = c("CONTRIBUTING", "README"))
+
 length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) + 
  geom_density(aes(fill = as.factor(type)), color = NA, alpha=0.6, position="identity")+
  scale_fill_manual(values = doctypeColors) + 
-  xlim(-10, 500) +
+  xlim(-10, 600) +
  labs(
    x = "Word Count",
    y = "Density Across Documents",
@ -26,10 +28,10 @@ length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) +
  theme(legend.position = "top") 
 length_plot_all

-#ggsave(filename = "plots/cr-0203-wc-density.png", plot = length_plot_all, width = 9, height = 9, dpi = 800)
+#ggsave(filename = "plots/cr-0207-wc-density.png", plot = length_plot_all, width = 9, height = 9, dpi = 800)

-contributing_df <- inner_join(contributing_textstat, contrib_groupings, by="filename")
-readme_df <- inner_join(readme_textstat, readme_groupings, by="filename")
+contributing_df <- inner_join(contributing_textstat, contrib_groupings, by=c("filename"="new_filepath"))
+readme_df <- inner_join(readme_textstat, readme_groupings, by=c("filename"="new_filepath"))

 subdirColors <-
  setNames( c('#31449c', '#4a7c85', '#c5db68')
@ -42,7 +44,7 @@ contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, gr
  labs(x= NULL, y= NULL, fill="RE Grouping")+ 
  theme_bw() +
  theme(legend.position = "inside", 
-        legend.position.inside = c(.90, .90),
+        legend.position.inside = c(.89, .92),
        legend.justification = c("right", "top"),
        legend.direction = "horizontal",
        legend.margin = margin(6, 6, 6, 6))
@ -64,7 +66,7 @@ readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.facto
  labs(x= "Reading Time (s)", y= NULL)+ 
  guides(fill="none", color="none")+
  theme_bw() 
-#readme_reading_time_plot
+readme_reading_time_plot

 readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(ranef_grouping))) + 
  geom_density(aes(fill=as.factor(ranef_grouping)), position="fill") +
@ -73,7 +75,7 @@ readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.fac
  labs(x= "Flesch Reading Ease", y= "README Density")+ 
  guides(fill="none", color="none")+
  theme_bw() 
-#readme_reading_ease 
+readme_reading_ease 
 library(gridExtra)
 grid.arrange(contributing_reading_ease, contributing_reading_time_plot, readme_reading_ease, readme_reading_time_plot, nrow = 2)

--- a/topic-outcome-models/020325_CONTRIBUTING_commit_topic_model.rda
+++ b/topic-outcome-models/020325_CONTRIBUTING_commit_topic_model.rda
--- a/topic-outcome-models/020325_README_commit_topic_model.rda
+++ b/topic-outcome-models/020325_README_commit_topic_model.rda
--- a/topic-outcome-models/020725_CONTRIBUTING_commit_topic_model.rda
+++ b/topic-outcome-models/020725_CONTRIBUTING_commit_topic_model.rda
--- a/topic-outcome-models/020725_README_commit_topic_model.rda
+++ b/topic-outcome-models/020725_README_commit_topic_model.rda
--- a/topic-outcome-models/contributing_topic_outcome_model.R
+++ b/topic-outcome-models/contributing_topic_outcome_model.R
@ -3,15 +3,20 @@ library(lubridate)
 library(rdd)
 library(stringr)

-contributing_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
+contributing_count_data_filepath <-  "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_CONTRIBUTING_weekly_count_data.csv"
 contributing_count_df = read.csv(contributing_count_data_filepath, header = TRUE) 

-contributing_topic_dist_filepath <- "text_analysis/020125_CONTRIBUTING_file_topic_distributions.csv"
+contributing_topic_dist_filepath <- "text_analysis/020725_CONTRIBUTING_file_topic_distributions.csv"
 contributing_topics_df = read.csv(contributing_topic_dist_filepath, header = TRUE) 

+contributing_merged_manifest <- "text_analysis/0207_contributing_merged_manifest.csv"
+contributing_manifest_df <- read.csv(contributing_merged_manifest, header=TRUE)
+
+merged_df <- inner_join(contributing_manifest_df, contributing_topics_df, by=c("new_filepath"= "filename"))
+
 window_num <- 5
 contributing_count_df <- contributing_count_df |>
-  filter(week_index >= (- window_num) & week_index <= (window_num)) |>
+  filter(relative_week >= (- window_num) & relative_week <= (window_num)) |>
  mutate(scaled_age = scale(age)) |>
  mutate(scaled_age_at_commit = scale(age_at_commit))|>
  mutate(log1p_count = log1p(commit_count))
@ -21,12 +26,7 @@ summed_data <- contributing_count_df |>
  group_by(project_id) |>
  summarise_at(vars(commit_count), list(summed_count=sum))

-contributing_topics_df <- contributing_topics_df |>
-  mutate(project_id = sapply(str_split(filename, "_hullabaloo_"), `[`, 1)) |>
-  mutate(project_id = ifelse(filename=="_vcr_vcr_CONTRIBUTING.md", "vcr_vcr", project_id)) |>
-  mutate(project_id = ifelse(filename=="marshmallow-code_marshmallow.git_CONTRIBUTING.rst", "marshmallow-code_marshmallow.git", project_id))
-
-merged_df <- inner_join(summed_data, contributing_topics_df, by="project_id")
+merged_df <- inner_join(summed_data, merged_df, by=c("project_id" = "repo_id"))
 merged_df$logged_commits <- log1p(merged_df$summed_count)
  
 library(MASS)
@ -34,4 +34,4 @@ library(MASS)
 commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4, data=merged_df)
 qqnorm(residuals(commit_outcome_model))
 summary(commit_outcome_model)
-saveRDS(commit_outcome_model, "020325_commit_topic_model.rda")
+saveRDS(commit_outcome_model, "020725_commit_topic_model.rda")
--- a/topic-outcome-models/readme_topic_outcome_model.R
+++ b/topic-outcome-models/readme_topic_outcome_model.R
@ -3,15 +3,20 @@ library(lubridate)
 library(rdd)
 library(stringr)

-readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
+readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_README_weekly_count_data.csv"
 readme_count_df = read.csv(readme_count_data_filepath, header = TRUE) 

-readme_topic_dist_filepath <- "text_analysis/020325_README_file_topic_distributions.csv"
+readme_topic_dist_filepath <- "text_analysis/020725_README_file_topic_distributions.csv"
 readme_topics_df = read.csv(readme_topic_dist_filepath, header = TRUE) 

+readme_merged_manifest <- "text_analysis/0207_readme_merged_manifest.csv"
+readme_manifest_df <- read.csv(readme_merged_manifest, header=TRUE)
+
+merged_df <- inner_join(readme_manifest_df, readme_topics_df, by=c("new_filepath"= "filename"))
+
 window_num <- 5
 readme_count_df <- readme_count_df |>
-  filter(week_index >= (- window_num) & week_index <= (window_num)) |>
+  filter(relative_week >= (- window_num) & relative_week <= (window_num)) |>
  mutate(scaled_age = scale(age)) |>
  mutate(scaled_age_at_commit = scale(age_at_commit))|>
  mutate(log1p_count = log1p(commit_count))
@ -21,19 +26,15 @@ summed_data <- readme_count_df |>
  group_by(project_id) |>
  summarise_at(vars(commit_count), list(summed_count=sum))

-readme_topics_df <- readme_topics_df |>
-  mutate(project_id = sapply(str_split(filename, "_hullabaloo_"), `[`, 1)) |>
-  mutate(project_id = ifelse(filename=="jaraco_keyrings.alt_hullabaloo_README.rst", "jaraco_keyrings.alt", project_id)) |>
-  mutate(project_id = ifelse(filename=="_vcr_vcr_README.md", "vcr_vcr", project_id)) 
-  
+
 #loss of jaraco_keyring, though jaraco keyrings.alt is represented
-merged_df <- inner_join(summed_data, readme_topics_df, by="project_id")
+merged_df <- inner_join(summed_data, merged_df, by=c("project_id" = "repo_id"))

 merged_df$logged_commits <- log1p(merged_df$summed_count)

 library(MASS)
-commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8 + t9 + t10, data=merged_df)
+commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8, data=merged_df)
 qqnorm(residuals(commit_outcome_model))
 summary(commit_outcome_model)

-saveRDS(commit_outcome_model, "020325_CONTRIBUTING_commit_topic_model.rda")
+saveRDS(commit_outcome_model, "020725_README_commit_topic_model.rda")