1
0

updating with outcome models and plots

This commit is contained in:
Matthew Gaughan 2025-02-07 16:59:56 -08:00
parent 4be3640858
commit 212e68a056
13 changed files with 46 additions and 43 deletions

BIN
.RData

Binary file not shown.

View File

@ -1,17 +1,17 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3434:49157 mjilg@klone.hyak.uw.edu
ssh -N -L 8787:n3436:56811 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: shDpem/m/RHo7HO1CuWG
password: +g73U+bdF4uygmNdsKEt
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 24074016
scancel -f 24085748

Binary file not shown.

After

Width:  |  Height:  |  Size: 85 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 311 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 428 KiB

View File

@ -1,16 +1,16 @@
library(tidyverse)
library(texreg)
readme_rdd <- readRDS("mlm/models/020325_readme_model.rda")
contrib_rdd <- readRDS("mlm/models/020125_contributing_model.rda")
readme_rdd <- readRDS("mlm/models/020725_readme_model.rda")
contrib_rdd <- readRDS("mlm/models/020725_contributing_model.rda")
# Emit one regression table with the README and CONTRIBUTING models side by
# side. Coefficient labels are positional: they must stay in the same order as
# the model terms, and the interaction label must match the main-effect
# spelling ("Introduction") so the table reads consistently.
# NOTE(review): stars = NULL, table = FALSE and ci.force = TRUE are presumably
# there to drop significance stars, skip the floating table wrapper and report
# confidence intervals -- confirm against the texreg documentation.
texreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE,
       custom.model.names=c( 'README','CONTRIBUTING'),
       custom.coef.names=c('(Intercept)', 'Introduction', 'Week (Time)', 'Project Age', 'Introduction:Week'),
       table=FALSE, ci.force = TRUE)
readme_groupings <- read.csv('mlm/data/0203_readme_dweek_ranefs.csv')
contrib_groupings <- read.csv('mlm/data/0201_contributing_dweek_ranefs.csv')
readme_groupings <- read.csv('mlm/data/0207_readme_dweek_ranefs.csv')
contrib_groupings <- read.csv('mlm/data/0207_contributing_dweek_ranefs.csv')
subdirColors <-
setNames( c('#31449c', '#4a7c85', '#c5db68')
@ -35,7 +35,7 @@ contrib_g <- contrib_groupings |>
labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping")
contrib_g
#ggsave(filename = "plots/cr-0203-contributing-blup.png", plot = contrib_g, width = 9, height = 9, dpi = 800)
ggsave(filename = "plots/cr-0207-contributing-blup.png", plot = contrib_g, width = 9, height = 9, dpi = 800)
texreg(list(readme_commits_, commits_), stars=NULL, digits=3, use.packages=FALSE,

View File

@ -1,22 +1,24 @@
library(tidyverse)
readme_groupings <- read.csv('text_analysis/0203_readme_merged_manifest.csv')
contrib_groupings <- read.csv('text_analysis/0203_contributing_merged_manifest.csv')
readme_groupings <- read.csv('text_analysis/0207_readme_merged_manifest.csv')
contrib_groupings <- read.csv('text_analysis/0207_contributing_merged_manifest.csv')
contrib_groupings$filename <- contrib_groupings$fvf_filepath
readme_groupings$filename <- readme_groupings$fvf_filepath
readme_textstat <- read.csv('text_analysis/020325_README_readability.csv')
contributing_textstat <- read.csv('text_analysis/020125_CONTRIBUTING_readability.csv')
readme_textstat <- read.csv('text_analysis/020725_README_readability.csv')
contributing_textstat <- read.csv('text_analysis/020725_CONTRIBUTING_readability.csv')
doctypeColors <-
setNames( c('#5da2d8', '#c7756a')
, c("CONTRIBUTING", "README"))
setNames( c('#c7756a','#5da2d8')
, c("README","CONTRIBUTING"))
readme_textstat$type = "README"
contributing_textstat$type = "CONTRIBUTING"
all_df = rbind(readme_textstat, contributing_textstat)
all_df = rbind(contributing_textstat,readme_textstat)
all_df$type <- factor(all_df$type, levels = c("CONTRIBUTING", "README"))
length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) +
geom_density(aes(fill = as.factor(type)), color = NA, alpha=0.6, position="identity")+
scale_fill_manual(values = doctypeColors) +
xlim(-10, 500) +
xlim(-10, 600) +
labs(
x = "Word Count",
y = "Density Across Documents",
@ -26,10 +28,10 @@ length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) +
theme(legend.position = "top")
length_plot_all
#ggsave(filename = "plots/cr-0203-wc-density.png", plot = length_plot_all, width = 9, height = 9, dpi = 800)
#ggsave(filename = "plots/cr-0207-wc-density.png", plot = length_plot_all, width = 9, height = 9, dpi = 800)
contributing_df <- inner_join(contributing_textstat, contrib_groupings, by="filename")
readme_df <- inner_join(readme_textstat, readme_groupings, by="filename")
contributing_df <- inner_join(contributing_textstat, contrib_groupings, by=c("filename"="new_filepath"))
readme_df <- inner_join(readme_textstat, readme_groupings, by=c("filename"="new_filepath"))
subdirColors <-
setNames( c('#31449c', '#4a7c85', '#c5db68')
@ -42,7 +44,7 @@ contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, gr
labs(x= NULL, y= NULL, fill="RE Grouping")+
theme_bw() +
theme(legend.position = "inside",
legend.position.inside = c(.90, .90),
legend.position.inside = c(.89, .92),
legend.justification = c("right", "top"),
legend.direction = "horizontal",
legend.margin = margin(6, 6, 6, 6))
@ -64,7 +66,7 @@ readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.facto
labs(x= "Reading Time (s)", y= NULL)+
guides(fill="none", color="none")+
theme_bw()
#readme_reading_time_plot
readme_reading_time_plot
readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(ranef_grouping))) +
geom_density(aes(fill=as.factor(ranef_grouping)), position="fill") +
@ -73,7 +75,7 @@ readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.fac
labs(x= "Flesch Reading Ease", y= "README Density")+
guides(fill="none", color="none")+
theme_bw()
#readme_reading_ease
readme_reading_ease
library(gridExtra)
grid.arrange(contributing_reading_ease, contributing_reading_time_plot, readme_reading_ease, readme_reading_time_plot, nrow = 2)

View File

@ -3,15 +3,20 @@ library(lubridate)
library(rdd)
library(stringr)
contributing_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"
contributing_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_CONTRIBUTING_weekly_count_data.csv"
contributing_count_df = read.csv(contributing_count_data_filepath, header = TRUE)
contributing_topic_dist_filepath <- "text_analysis/020125_CONTRIBUTING_file_topic_distributions.csv"
contributing_topic_dist_filepath <- "text_analysis/020725_CONTRIBUTING_file_topic_distributions.csv"
contributing_topics_df = read.csv(contributing_topic_dist_filepath, header = TRUE)
contributing_merged_manifest <- "text_analysis/0207_contributing_merged_manifest.csv"
contributing_manifest_df <- read.csv(contributing_merged_manifest, header=TRUE)
merged_df <- inner_join(contributing_manifest_df, contributing_topics_df, by=c("new_filepath"= "filename"))
window_num <- 5
contributing_count_df <- contributing_count_df |>
filter(week_index >= (- window_num) & week_index <= (window_num)) |>
filter(relative_week >= (- window_num) & relative_week <= (window_num)) |>
mutate(scaled_age = scale(age)) |>
mutate(scaled_age_at_commit = scale(age_at_commit))|>
mutate(log1p_count = log1p(commit_count))
@ -21,12 +26,7 @@ summed_data <- contributing_count_df |>
group_by(project_id) |>
summarise_at(vars(commit_count), list(summed_count=sum))
contributing_topics_df <- contributing_topics_df |>
mutate(project_id = sapply(str_split(filename, "_hullabaloo_"), `[`, 1)) |>
mutate(project_id = ifelse(filename=="_vcr_vcr_CONTRIBUTING.md", "vcr_vcr", project_id)) |>
mutate(project_id = ifelse(filename=="marshmallow-code_marshmallow.git_CONTRIBUTING.rst", "marshmallow-code_marshmallow.git", project_id))
merged_df <- inner_join(summed_data, contributing_topics_df, by="project_id")
merged_df <- inner_join(summed_data, merged_df, by=c("project_id" = "repo_id"))
merged_df$logged_commits <- log1p(merged_df$summed_count)
library(MASS)
@ -34,4 +34,4 @@ library(MASS)
commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4, data=merged_df)
qqnorm(residuals(commit_outcome_model))
summary(commit_outcome_model)
saveRDS(commit_outcome_model, "020325_commit_topic_model.rda")
saveRDS(commit_outcome_model, "020725_commit_topic_model.rda")

View File

@ -3,15 +3,20 @@ library(lubridate)
library(rdd)
library(stringr)
readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"
readme_count_data_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/final_0207_README_weekly_count_data.csv"
readme_count_df = read.csv(readme_count_data_filepath, header = TRUE)
readme_topic_dist_filepath <- "text_analysis/020325_README_file_topic_distributions.csv"
readme_topic_dist_filepath <- "text_analysis/020725_README_file_topic_distributions.csv"
readme_topics_df = read.csv(readme_topic_dist_filepath, header = TRUE)
readme_merged_manifest <- "text_analysis/0207_readme_merged_manifest.csv"
readme_manifest_df <- read.csv(readme_merged_manifest, header=TRUE)
merged_df <- inner_join(readme_manifest_df, readme_topics_df, by=c("new_filepath"= "filename"))
window_num <- 5
readme_count_df <- readme_count_df |>
filter(week_index >= (- window_num) & week_index <= (window_num)) |>
filter(relative_week >= (- window_num) & relative_week <= (window_num)) |>
mutate(scaled_age = scale(age)) |>
mutate(scaled_age_at_commit = scale(age_at_commit))|>
mutate(log1p_count = log1p(commit_count))
@ -21,19 +26,15 @@ summed_data <- readme_count_df |>
group_by(project_id) |>
summarise_at(vars(commit_count), list(summed_count=sum))
readme_topics_df <- readme_topics_df |>
mutate(project_id = sapply(str_split(filename, "_hullabaloo_"), `[`, 1)) |>
mutate(project_id = ifelse(filename=="jaraco_keyrings.alt_hullabaloo_README.rst", "jaraco_keyrings.alt", project_id)) |>
mutate(project_id = ifelse(filename=="_vcr_vcr_README.md", "vcr_vcr", project_id))
#loss of jaraco_keyring, though jaraco keyrings.alt is represented
merged_df <- inner_join(summed_data, readme_topics_df, by="project_id")
merged_df <- inner_join(summed_data, merged_df, by=c("project_id" = "repo_id"))
merged_df$logged_commits <- log1p(merged_df$summed_count)
library(MASS)
commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8 + t9 + t10, data=merged_df)
commit_outcome_model <- glm.nb(logged_commits ~ 0 + t0 + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8, data=merged_df)
qqnorm(residuals(commit_outcome_model))
summary(commit_outcome_model)
saveRDS(commit_outcome_model, "020325_CONTRIBUTING_commit_topic_model.rda")
saveRDS(commit_outcome_model, "020725_README_commit_topic_model.rda")