diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/R/.Rhistory b/R/.Rhistory index 5426cf6..2e1d1a7 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,512 +1,512 @@ -summary(lm1) -lm1 <- glm.nb(after_contrib_new ~ reading_time, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(after_contrib_new ~ flesch_reading_ease, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -contrib_readability_df <- contrib_readability_df |> -mutate(project_name = map_chr(filename, ~ { -parts <- str_split(.x, pattern = "_")[[1]] -if (length(parts) >= 1) { -head(parts, -1) -} else { -NA_character_ -} -})) -parts[1] + parts[2] -contrib_readability_df <- contrib_readability_df |> -mutate(project_name = map_chr(filename, ~ { -parts <- str_split(.x, pattern = "_")[[1]] -if (length(parts) >= 1) { -parts[1] + parts[2] -} else { -NA_character_ -} -})) -contrib_readability_df <- contrib_readability_df |> -mutate(project_name = map_chr(filename, ~ { -parts <- str_split(.x, pattern = "_")[[1]] -if (length(parts) >= 1) { -paste(head(parts, -1), collapse="") -} else { -NA_character_ -} -})) -View(contrib_readability_df) -#libraries -library(stringr) -contrib_df <- read_csv("../final_data/deb_contrib_did.csv") -contrib_pop_df <- read_csv("../final_data/deb_contrib_pop_change.csv") -contrib_readability_df <- read_csv('../text_analysis/dwo_readability_contributing.csv') -contrib_pop_df <- contrib_pop_df |> -mutate(project_name = map_chr(upstream_vcs_link, ~ { -parts <- str_split(.x, pattern = "/")[[1]] -if (length(parts) >= 1) { -parts[length(parts)] -} else { -NA_character_ -} -})) -contrib_readability_df <- contrib_readability_df |> -mutate(project_name = map_chr(filename, ~ { -parts <- str_split(.x, pattern = "_")[[1]] -if (length(parts) >= 1) { -paste(head(parts, -1), collapse="_") -} else { -NA_character_ -} -})) -contrib_total_df <- contrib_pop_df |> -join(contrib_readability_df, by="project_name") -View(contrib_total_df) -# test regressions -library(MASS) -lm1 <- glm.nb(after_contrib_new ~ flesch_reading_ease, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(after_contrib_new ~ flesch_reading_ease + age_in_days, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -View(contrib_df) -source("~/Desktop/git/24_deb_gov/R/contrib_docChar_outcomes.R") -View(windowed_data) -View(windowed_data) -summed_data <- windowed_data |> -group_by(upstream_vcs_link) |> -summarize(total_ct_after_all = sum(ct_after_all)) -summed_data <- windowed_data |> -filter(window="ct_after_all") |> -group_by(upstream_vcs_link) |> -summarize(total_ct_after_all = sum(count)) -summed_data <- windowed_data |> -filter(window=="ct_after_all") |> -group_by(upstream_vcs_link) |> -summarize(total_ct_after_all = sum(count)) -View(summed_data) -summed_data <- windowed_data |> -filter(window=="ct_after_all") |> -group_by(upstream_vcs_link) |> -mutate(total_ct_after_all = sum(count)) -View(summed_data) -summed_data <- windowed_data |> -filter(window=="ct_after_all") |> -group_by(upstream_vcs_link) |> -summarize(total_ct_after_all = sum(count)) |> ungroup() -View(summed_data) -View(windowed_data) -summed_data <- windowed_data |> -filter(window=="ct_after_all") |> -group_by(upstream_vcs_link) |> -summarise_at(vars(count), list(name=sum)) -View(summed_data) -summed_data <- windowed_data |> -filter(D==1) |> -group_by(upstream_vcs_link) |> -summarise_at(vars(count), list(summed_count=sum)) -View(summed_data) -source("~/Desktop/git/24_deb_gov/R/contrib_docChar_outcomes.R") -contrib_total_df <- contrib_total_df|> -join(summed_data, by=upstream_vcs_link) -contrib_total_df <- contrib_pop_df |> -join(contrib_readability_df, by="project_name") -View(contrib_total_df) -contrib_total_df <- contrib_total_df|> -join(summed_data, by=upstream_vcs_link) -View(summed_data) -contrib_total_df <- contrib_total_df|> -join(summed_data, by="upstream_vcs_link") -View(contrib_total_df) -View(contrib_df) -source("~/Desktop/git/24_deb_gov/R/contrib_docChar_outcomes.R") -#outcome variable that is number of commits by number of new contributors -contrib_total_df$commit_by_contrib = contrib_total_df$summed_count * contrib_total_df$after_contrib_new -# test regressions -library(MASS) -lm1 <- glm.nb(after_contrib_new ~ flesch_reading_ease + age_in_days, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ flesch_reading_ease + age_in_days, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -View(contrib_total_df) -lm1 <- glm.nb(commit_by_contrib ~ word_count, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -contrib_total_df$scaled_outcome = scale(contrib_total_df$commit_by_contrib) -lm1 <- glm.nb(scaled_outcome ~ word_count + flesch_kincaid, data = contrib_total_df) -lm1 <- glm.nb(scaled_outcome ~ word_count + flesch_kincaid_grade, data = contrib_total_df) -contrib_total_df$logged_outcome = log1p(contrib_total_df$commit_by_contrib) -# test regressions -library(MASS) -lm1 <- glm.nb(scaled_outcome ~ word_count + flesch_kincaid_grade, data = contrib_total_df) -lm1 <- glm.nb(logged_outcome ~ word_count + flesch_kincaid_grade, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome ~ word_count + flesch_kincaid_grade + linsear_write_formula, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -contrib_total_df$scaled_outcome = scale(contrib_total_df$commit_by_contrib) -# test regressions -library(MASS) -lm1 <- lm(scaled_outcome ~ word_count + flesch_kincaid_grade + linsear_write_formula, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome ~ word_count + flesch_kincaid_grade + linsear_write_formula, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome ~ word_count + flesch_kincaid_grade + linsear_write_formula + mcalpine_eflaw, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome ~ word_count + flesch_kincaid_grade + linsear_write_formula + mcalpine_eflaw + dale_chall_readability_score, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome ~ word_count + dale_chall_readability_score, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome ~ word_count + reading_time, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ word_count + reading_time, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ word_count + flesch_kincaid_grade, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -#libraries -library(stringr) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -readme_pop_df <- read_csv("../final_data/deb_readme_pop_change.csv") -readme_readability_df <- read_csv('../text_analysis/dwo_readability_readmeuting.csv') -source("~/Desktop/git/24_deb_gov/R/readme_docChar_outcomes.R") -source("~/Desktop/git/24_deb_gov/R/readme_docChar_outcomes.R") -lm1 <- glm.nb(commit_by_readme ~ word_count + flesch_kincaid_grade, data = readme_total_df) -View(readme_readability_df) -readme_readability_df <- readme_readability_df |> -mutate(project_name = map_chr(filename, ~ { -parts <- str_split(.x, pattern = "_")[[1]] -if (length(parts) >= 1) { -paste(head(parts, -1), collapse="_") -} else { -NA_character_ -} -})) -readme_total_df <- readme_pop_df |> -join(readme_readability_df, by="project_name") -readme_total_df <- readme_total_df|> -join(summed_data, by="upstream_vcs_link") -#outcome variable that is number of commits by number of new readmeutors -readme_total_df$commit_by_readme = readme_total_df$summed_count * readme_total_df$after_readme_new -readme_total_df$logged_outcome = log(readme_total_df$commit_by_readme) -View(readme_total_df) -View(readme_total_df) -#outcome variable that is number of commits by number of new readmeutors -readme_total_df$commit_by_readme = readme_total_df$summed_count * readme_total_df$after_readme_new -View(readme_total_df) -View(readme_readability_df) -readme_pop_df[readme_pop_df['upstream_vcs_link'] == "https://github.com/agateau/yokadi/issues/new", "project_name"] = "yokadi" -View(readme_pop_df) -readme_pop_df[readme_pop_df['upstream_vcs_link'] == "https://github.com/SciRuby/rb-gsl/issues/new", "project_name"] = "rb-gsl" -source("~/Desktop/git/24_deb_gov/R/readme_docChar_outcomes.R") -readme_readability_df <- readme_readability_df |> -mutate(project_name = map_chr(filename, ~ { -parts <- str_split(.x, pattern = "_")[[1]] -if (length(parts) >= 1) { -paste(head(parts, -1), collapse="_") -} else { -NA_character_ -} -})) -readme_readability_df[readme_readability_df['filename'] == "yder_README_8md.html", "project_name"] = "yder" -readme_readability_df[readme_readability_df['filename'] == "pg_filedump.git_README.pg_filedump", "project_name"] = "pg_filedump.git" -readme_readability_df[readme_readability_df['filename'] == "openvas_UPGRADE_README", "project_name"] = "openvas" -readme_readability_df[readme_readability_df['filename'] == "hyphen.git_README_hyph_en_US.txt", "project_name"] = "hyphen.git" -readme_readability_df[readme_readability_df['filename'] == "cycle.git_README_ru.html", "project_name"] = "cycle.git" -readme_readability_df[readme_readability_df['filename'] == "diffuse.git_README_ru", "project_name"] = "diffuse.git" -readme_readability_df[readme_readability_df['filename'] == "CheMPS2_README_8md_source.html", "project_name"] = "CheMPS2" -readme_readability_df[readme_readability_df['filename'] == "sleuthkit_README_win32.txt", "project_name"] = "sleuthkit" -readme_readability_df[readme_readability_df['filename'] == "Lmod_README_lua_modulefiles.txt", "project_name"] = "Lmod" -readme_readability_df[readme_readability_df['filename'] == "engauge_debian_README_for_osx", "project_name"] = "engauge_debian" -readme_total_df <- readme_pop_df |> -join(readme_readability_df, by="project_name") -readme_total_df <- readme_total_df|> -join(summed_data, by="upstream_vcs_link") -#outcome variable that is number of commits by number of new readmeutors -readme_total_df$commit_by_readme = readme_total_df$summed_count * readme_total_df$after_readme_new -View(readme_total_df) -readme_total_df$logged_outcome = log(readme_total_df$commit_by_readme) -#outcome variable that is number of commits by number of new readmeutors -readme_total_df$commit_by_readme = readme_total_df$summed_count * readme_total_df$after_readme_new -#outcome variable that is number of commits by number of new readmeutors -readme_total_df$commit_by_contrib = readme_total_df$summed_count * readme_total_df$after_readme_new -#outcome variable that is number of commits by number of new readmeutors -readme_total_df$commit_by_contrib = NA -readme_total_df$commit_by_contrib = readme_total_df$summed_count * readme_total_df$after_readme_new -View(readme_total_df) -View(readme_total_df) -readme_total_df$commit_by_contrib = readme_total_df$summed_count * readme_total_df$after_contrib_new -lm1 <- glm.nb(commit_by_contrib ~ word_count + flesch_kincaid_grade, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -readme_total_df$logged_outcome = log(readme_total_df$commit_by_readme) -readme_total_df$logged_outcome = log(readme_total_df$commit_by_contrib) -lm1 <- glm.nb(commit_by_contrib ~ word_count + flesch_kincaid_grade, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count ~ word_count + flesch_kincaid_grade, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ word_count + flesch_kincaid_grade, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(after_contrib_new ~ word_count + flesch_kincaid_grade, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(after_contrib_new ~ word_count + reading_time, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ word_count + reading_time, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(reading_time ~ word_count , data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -View(readme_total_df) -lm1 <- glm.nb(reading_time ~ flesch_reading_ease , data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(flesch_reading_ease ~ reading_time , data = readme_total_df) -lm1 <- glm.nb(commit_by_contrib ~ reading_time , data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ reading_time + linsear_write_formula , data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -readme_total_df$commit_by_contrib = readme_total_df$summed_count * (readme_total_df$after_contrib_new + 1) -readme_total_df$logged_outcome = log(readme_total_df$commit_by_contrib) -lm1 <- glm.nb(commit_by_contrib ~ reading_time + linsear_write_formula , data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -readme_total_df$logged_outcome = log1p(readme_total_df$commit_by_contrib) -lm1 <- glm.nb(logged_outcome ~ reading_time + linsear_write_formula , data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome ~ reading_time + linsear_write_formula + flesch_reading_ease, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome ~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw + word_count, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(logged_outcome~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw + word_count, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -source("~/Desktop/git/24_deb_gov/R/contrib_docChar_outcomes.R") -lm1 <- glm.nb(logged_outcome~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw + word_count, data = contrib_total_df) -contrib_total_df$logged_outcome = log1p(contrib_total_df$commit_by_contrib) -lm1 <- glm.nb(logged_outcome ~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw + word_count, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count ~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw + word_count, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -contrib_topics_df <- read_csv("../text_analysis/contrib_file_topic_distributions.csv") +, c(0,1,2) ) +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +theme_bw() + +theme(legend.position = "top") + +labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") +contrib_g +subdirColors <- +setNames( c('#942e55', '#78c58a', '#9b6e29') +, c(0,1,2) ) +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +theme_bw() + +theme(legend.position = "top") + +labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") +contrib_g +subdirColors <- +setNames( c('#a1a596', '#557784', '#2f6382') +, c(0,1,2) ) +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +theme_bw() + +theme(legend.position = "top") + +labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") +contrib_g +subdirColors <- +setNames( c('#a3b0c9', '#101f31', '#28578d') +, c(0,1,2) ) +readme_g <- readme_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors) + +guides(fill="none", color="none")+ +theme_bw() + +labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") +readme_g +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +theme_bw() + +theme(legend.position = "top") + +labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") +contrib_g +subdirColors <- +setNames( c('#f8f06b', '#ca7780', '#a13795') +, c(0,1,2) ) +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +theme_bw() + +theme(legend.position = "top") + +labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") +contrib_g library(tidyverse) -contrib_topics_df <- read_csv("../text_analysis/contrib_file_topic_distributions.csv") -View(contrib_topics_df) -source("~/Desktop/git/24_deb_gov/R/contrib_topic_outcomes.R") -source("~/Desktop/git/24_deb_gov/R/contrib_topic_outcomes.R") -source("~/Desktop/git/24_deb_gov/R/contrib_docChar_outcomes.R") -source("~/Desktop/git/24_deb_gov/R/contrib_topic_outcomes.R") -source("~/Desktop/git/24_deb_gov/R/contrib_topic_outcomes.R") -source("~/Desktop/git/24_deb_gov/R/contrib_topic_outcomes.R") -lm1 <- glm.nb(summed_count ~ t0 + t1 + t2 + t3, data = contrib_total_df) -#running regressions -library(MASS) -lm1 <- glm.nb(summed_count ~ t0 + t1 + t2 + t3, data = contrib_total_df) -source("~/Desktop/git/24_deb_gov/R/contrib_topic_outcomes.R") -library(stringr) library(plyr) -contrib_topics_df <- read_csv("../text_analysis/contrib_file_topic_distributions.csv") -contrib_df <- read_csv("../final_data/deb_contrib_did.csv") -contrib_pop_df <- read_csv("../final_data/deb_contrib_pop_change.csv") -#get the contribution count -#some preprocessing and expansion -col_order <- c("upstream_vcs_link", "age_in_days", "first_commit", "first_commit_dt", "event_gap", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -contrib_df <- contrib_df[,col_order] -contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ") -contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ") -contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ") -contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -contrib_df = contrib_df[,!(names(contrib_df) %in% drop)] -# 2 some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -longer$count <- as.numeric(longer$count) -#longer <- longer[which(longer$observation_type == "all"),] -return(longer) -} -expanded_data <- expand_timeseries(contrib_df[1,]) -for (i in 2:nrow(contrib_df)){ -expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,])) -} -#filter out the windows of time that we're looking at -window_num <- 8 -windowed_data <- expanded_data |> -filter(week >= (27 - window_num) & week <= (27 + window_num)) |> -mutate(D = ifelse(week > 27, 1, 0)) -summed_data <- windowed_data |> -filter(D==1) |> -group_by(upstream_vcs_link) |> -summarise_at(vars(count), list(summed_count=sum)) -#concat dataframes into central data -contrib_pop_df <- contrib_pop_df |> -mutate(project_name = map_chr(upstream_vcs_link, ~ { -parts <- str_split(.x, pattern = "/")[[1]] -if (length(parts) >= 1) { -parts[length(parts)] -} else { -NA_character_ -} -})) -contrib_topic_df <- contrib_topic_df |> -mutate(project_name = map_chr(filename, ~ { -parts <- str_split(.x, pattern = "_")[[1]] -if (length(parts) >= 1) { -paste(head(parts, -1), collapse="_") -} else { -NA_character_ -} -})) -contrib_topics_df <- contrib_topics_df |> -mutate(project_name = map_chr(filename, ~ { -parts <- str_split(.x, pattern = "_")[[1]] -if (length(parts) >= 1) { -paste(head(parts, -1), collapse="_") -} else { -NA_character_ -} -})) -contrib_total_df <- contrib_pop_df |> -join(contrib_topics_df, by="project_name") -contrib_total_df <- contrib_total_df|> -join(summed_data, by="upstream_vcs_link") -#outcome variable that is number of commits by number of new contributors -contrib_total_df$commit_by_contrib = contrib_total_df$summed_count * contrib_total_df$after_contrib_new -contrib_total_df$logged_outcome = log1p(contrib_total_df$commit_by_contrib) -#running regressions -library(MASS) -lm1 <- glm.nb(summed_count ~ t0 + t1 + t2 + t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -View(contrib_total_df) -lm1 <- glm.nb(summed_count ~ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count ~ t3 + t2 + t1, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count ~ t3 + t2 + t1 + t0, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count ~ t0 + t1 + t2 + t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- lm(summed_count ~ t0 + t1 + t2 + t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count ~ t0 + t1 + t2 + t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count ~ t1 + t2 + t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(summed_count ~ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t1 + t2 +t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t2, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t1, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -source("~/Desktop/git/24_deb_gov/R/readme_topic_outcomes.R") -#outcome variable that is number of commits by number of new readmeutors -readme_total_df$commit_by_contrib = readme_total_df$summed_count *readme_total_df$after_contrib_new -readme_total_df$logged_outcome = log1p(readme_total_df$commit_by_readme) -#running regressions -library(MASS) -lm1 <- glm.nb(commit_by_readme ~ t3, data = readme_total_df) -lm1 <- glm.nb(commit_by_contrib ~ t3, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t1, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t7, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0+t1+t7, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0+t1+t2+t7, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0+t1+t2+t7+t3, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0+t1+t2+t7+t3 + t4, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0+t1+t2+t7+t3 +t4 + t5 + t6, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0+t1+t2+t7+t3 +t4 + t5, data = readme_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -source("~/Desktop/git/24_deb_gov/R/contrib_topic_outcomes.R") -lm1 <- glm.nb(commit_by_contrib ~ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0+ t1+ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t0+ t1+ t2+ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t1+ t2+ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t1+ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) -lm1 <- glm.nb(commit_by_contrib ~ t2+ t3, data = contrib_total_df) -qqnorm(residuals(lm1)) -summary(lm1) +library(gridExtra) +library(ggpubr) +# script for the analysis of document readability metrics +# readability metrics will be studied controlled by their length +# gaughan@u.northwestern.edu +# loading in the data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../text_analysis/dwo_readability_readme.csv") +contributing_df <- read_csv("../text_analysis/dwo_readability_contributing.csv") +# establishing the color scheme +subdirColors <- +setNames( c('#f8f06b', '#ca7780', '#a13795') +, levels(contributing_df$subdir) ) +readmeSubdirColors <- +setNames( c('#ca7780', '#a13795') +, levels(readme_df$subdir) ) +#plotting linsear scoring +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.35, position="identity") + +xlim(-30, 30) + +theme_bw() +#plotting readme reading ease +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + +scale_fill_manual(values = readmeSubdirColors) + +xlim(-5, 90) + +labs(x= "Flesch Reading Ease", y= "README Density")+ +guides(fill="none", color="none")+ +theme_bw() +readme_reading_ease +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), alpha=0.8, position="fill") + +scale_fill_manual(values = readmeSubdirColors) + +xlim(-5, 90) + +labs(x= "Reading Time (s)", y= NULL)+ +guides(fill="none", color="none")+ +theme_bw() +readme_reading_time_plot +readme_reading_time_no_group <- ggplot(readme_df, aes(x=reading_time)) + +geom_histogram(fill='forestgreen') + +xlim(-5, 190) + +ylab("Count of README Files") + +xlab("Reading Time (s)") + +ggtitle("Reading Time for README files from FLOSS Projects (n=2280)")+ +guides(fill="none", color="none")+ +theme_bw() +readme_reading_time_no_group +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + +scale_fill_manual(values = readmeSubdirColors) + +xlim(-5, 90) + +labs(x= "Linsear Write Score", y= NULL)+ +guides(fill="none", color="none")+ +theme_bw() +readme_linsear_plot +readme_mcalpine_eflaw <- ggplot(readme_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + +scale_fill_manual(values = readmeSubdirColors) + +xlim(-5, 90) + +labs(x= "McAlpine EFLAW", y= NULL)+ +guides(fill="none", color="none")+ +theme_bw() +#theme(axis.title.y=element_blank()) +#plot of reading_ease +#readme_df <- readme_df |> +# mutate(coef_grouping <- as.factor(subdir)) +#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df) +#summary(test_lm) +aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median) +# plotting contributing linsear writing formula +contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + +scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +labs(x= NULL, y= NULL, fill="RE Grouping")+ +xlim(-5, 90) + +theme_bw() + +guides(fill="none", color="none") +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + +xlim(-5, 90) + +labs(x= NULL, y= NULL, fill="RE Grouping")+ +theme_bw() + +theme(legend.position = "inside", +legend.position.inside = c(.93, .93), +legend.justification = c("right", "top"), +legend.direction = "horizontal", +legend.margin = margin(6, 6, 6, 6)) +# plotting contributing mcalpine eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + +xlim(-5, 90) + +labs(x= NULL, y= NULL, fill="RE Grouping")+ +theme_bw() + +theme(legend.position = "inside", +legend.position.inside = c(.93, .93), +legend.justification = c("right", "top"), +legend.direction = "vertical", +legend.margin = margin(6, 6, 6, 6)) +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + +scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +labs(x= NULL, y="CONTRIBUTING Density", fill="RE Grouping")+ +xlim(-5, 90) + +theme_bw() + +guides(fill="none", color="none") +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_linsear_plot,contributing_mcalpine_eflaw, readme_reading_ease, readme_linsear_plot, readme_mcalpine_eflaw, nrow = 2) +readme_df$type = "README" +contributing_df$type = "CONTRIBUTING" +all_df = rbind(readme_df, contributing_df) +length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) + +geom_density(aes(fill = as.factor(type)), color = NA, alpha=0.4, position="identity")+ +xlim(-10, 500) + +labs( +x = "Word Count", +y = "Density Across Documents", +fill="Document Type" +) + +theme_bw() + +theme(legend.position = "top") +length_plot_all +grid.arrange(contributing_reading_ease, contributing_linsear_plot,contributing_mcalpine_eflaw, readme_reading_ease, readme_linsear_plot, readme_mcalpine_eflaw, nrow = 2) +length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) + +geom_density(aes(fill = as.factor(type)), color = NA, alpha=0.4, position="identity")+ +xlim(-10, 500) + +labs( +x = "Word Count", +y = "Density Across Documents", +fill="Document Type" +) + +theme_bw() + +theme(legend.position = "top") +length_plot_all +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +geom_density(aes(fill=as.factor(subdir)), position="fill") + +xlim(-5, 90) + +labs(x= NULL, y= NULL, fill="RE Grouping")+ +theme_bw() + +theme(legend.position = "inside", +legend.position.inside = c(.93, .93), +legend.justification = c("right", "top"), +legend.direction = "horizontal", +legend.margin = margin(6, 6, 6, 6)) +contributing_reading_time_plot +grid.arrange(contributing_reading_ease, contributing_reading_time_plot, readme_reading_ease, readme_reading_time_plot, nrow = 2) +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), position="fill") + +scale_fill_manual(values = readmeSubdirColors) + +xlim(-5, 90) + +labs(x= "Reading Time (s)", y= NULL)+ +guides(fill="none", color="none")+ +theme_bw() +#plotting readme reading ease +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), position="fill") + +scale_fill_manual(values = readmeSubdirColors) + +xlim(-5, 90) + +labs(x= "Flesch Reading Ease", y= "README Density")+ +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), position="fill") + +scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +labs(x= NULL, y="CONTRIBUTING Density", fill="RE Grouping")+ +xlim(-5, 90) + +theme_bw() + +guides(fill="none", color="none") +grid.arrange(contributing_reading_ease, contributing_reading_time_plot, readme_reading_ease, readme_reading_time_plot, nrow = 2) +# establishing the color scheme +subdirColors <- +setNames( c('#68293c', '#ffcf67', '#91d8f0') +, levels(contributing_df$subdir) ) +readmeSubdirColors <- +setNames( c('#ffcf67', '#91d8f0') +, levels(readme_df$subdir) ) +#plotting readme reading ease +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), position="fill") + +scale_fill_manual(values = readmeSubdirColors) + +xlim(-5, 90) + +labs(x= "Flesch Reading Ease", y= "README Density")+ +guides(fill="none", color="none")+ +theme_bw() +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), position="fill") + +scale_fill_manual(values = readmeSubdirColors) + +xlim(-5, 90) + +labs(x= "Reading Time (s)", y= NULL)+ +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +geom_density(aes(fill=as.factor(subdir)), position="fill") + +xlim(-5, 90) + +labs(x= NULL, y= NULL, fill="RE Grouping")+ +theme_bw() + +theme(legend.position = "inside", +legend.position.inside = c(.93, .93), +legend.justification = c("right", "top"), +legend.direction = "horizontal", +legend.margin = margin(6, 6, 6, 6)) +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(fill=as.factor(subdir)), position="fill") + +scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +labs(x= NULL, y="CONTRIBUTING Density", fill="RE Grouping")+ +xlim(-5, 90) + +theme_bw() + +guides(fill="none", color="none") +grid.arrange(contributing_reading_ease, contributing_reading_time_plot, readme_reading_ease, readme_reading_time_plot, nrow = 2) +source("~/Desktop/git/24_deb_gov/R/documentReadabilityAnalysis.R") +subdirColors <- +setNames( c('#31449c', '#4a7c85', '#c5db68') +, c(0,1,2) ) +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +theme_bw() + +theme(legend.position = "top") + +labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") +contrib_g +contrib_groupings <- read.csv('../final_data/deb_contrib_interaction_groupings.csv') +subdirColors <- +setNames( c('#31449c', '#4a7c85', '#c5db68') +, c(0,1,2) ) +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + +theme_bw() + +theme(legend.position = "top") + +labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") +contrib_g +source("~/Desktop/git/24_deb_gov/R/gam_plot_documents.R") +doctypeColors <- +setNames( c('#4a7c85', '#c5db68') +, factor(all_actions_data$document_type)) +View(all_actions_data) +doctypeColors <- +setNames( c('#4a7c85', '#c5db68') +, c("CONTRIBUTING", "README")) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_fill_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +doctypeColors <- +setNames( c('#ffcf67', '#91d8f0') +, c("CONTRIBUTING", "README")) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +doctypeColors <- +setNames( c('#7d1b16', '#263b90') +, c("CONTRIBUTING", "README")) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +doctypeColors <- +setNames( c('#995223', '#2464ad') +, c("CONTRIBUTING", "README")) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +doctypeColors <- +setNames( c('#ba6b44', '#5d7fbd') +, c("CONTRIBUTING", "README")) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +doctypeColors <- +setNames( c('#5da2d8', '#c7756a') +, c("CONTRIBUTING", "README")) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=count, color=factor(document_type))) + +scale_y_log1p() + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), +labels = round(c(expm1(0), expm1(0.5), expm1(1.0), exp,1(1.5)), 1)) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), +labels = round(c(expm1(0), expm1(0.5), expm1(1.0), exp,1(1.5)), 1)) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), +labels = round(c(expm1(0), expm1(0.5), expm1(1.0), expm1(1.5)), 1)) + +labs(x="Weekly Offset", y="Log Transformed Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), +labels = round(c(expm1(0), expm1(0.5), expm1(1.0), expm1(1.5)), 1)) + +labs(x="Weekly Offset", y="Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), +labels = round(c(expm1(0), expm1(0.5), expm1(1.0), expm1(1.5)), 1)) + +labs(x="Weekly Offset", y="Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +labs(x="Weekly Offset", y="Commit Count", color="Document Type") + +scale_color_manual(values = doctypeColors) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +source("~/Desktop/git/24_deb_gov/R/documentReadabilityAnalysis.R") +length_plot_all +length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) + +geom_density(aes(fill = as.factor(type)), color = NA, alpha=0.5, position="identity")+ +scale_fill_manual(values = doctypeColors) + +xlim(-10, 500) + +labs( +x = "Word Count", +y = "Density Across Documents", +fill="Document Type" +) + +theme_bw() + +theme(legend.position = "top") +length_plot_all +length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) + +geom_density(aes(fill = as.factor(type)), color = NA, alpha=0.6, position="identity")+ +scale_fill_manual(values = doctypeColors) + +xlim(-10, 500) + +labs( +x = "Word Count", +y = "Density Across Documents", +fill="Document Type" +) + +theme_bw() + +theme(legend.position = "top") +length_plot_all diff --git a/R/072524_contirbuting_ranef_plot.png b/R/072524_contirbuting_ranef_plot.png new file mode 100644 index 0000000..1c9348e Binary files /dev/null and b/R/072524_contirbuting_ranef_plot.png differ diff --git a/R/072524_contributing_ranef_plot.png b/R/072524_contributing_ranef_plot.png new file mode 100644 index 0000000..1c9348e Binary files /dev/null and b/R/072524_contributing_ranef_plot.png differ diff --git a/R/072524_gam_document introduction.png b/R/072524_gam_document introduction.png new file mode 100644 index 0000000..e254a24 Binary files /dev/null and b/R/072524_gam_document introduction.png differ diff --git a/R/072524_readability_density.png b/R/072524_readability_density.png new file mode 100644 index 0000000..d38f0a2 Binary files /dev/null and b/R/072524_readability_density.png differ diff --git a/R/0725_topic_commitoutcome_contrib.rda b/R/0725_topic_commitoutcome_contrib.rda new file mode 100644 index 0000000..c08890b Binary files /dev/null and b/R/0725_topic_commitoutcome_contrib.rda differ diff --git a/R/0725_topic_commitoutcome_readme.rda b/R/0725_topic_commitoutcome_readme.rda new file mode 100644 index 0000000..1efbd95 Binary files /dev/null and b/R/0725_topic_commitoutcome_readme.rda differ diff --git a/R/0725_topic_contriboutcome_contrib.rda b/R/0725_topic_contriboutcome_contrib.rda new file mode 100644 index 0000000..f852940 Binary files /dev/null and b/R/0725_topic_contriboutcome_contrib.rda differ diff --git a/R/0725_topic_contriboutcome_readme.rda b/R/0725_topic_contriboutcome_readme.rda new file mode 100644 index 0000000..3d472ad Binary files /dev/null and b/R/0725_topic_contriboutcome_readme.rda differ diff --git a/R/073024_fossy_plot.png b/R/073024_fossy_plot.png new file mode 100644 index 0000000..1933493 Binary files /dev/null and b/R/073024_fossy_plot.png differ diff --git a/R/0731_topic_commitoutcome_contrib.rda b/R/0731_topic_commitoutcome_contrib.rda new file mode 100644 index 0000000..97a5ff4 Binary files /dev/null and b/R/0731_topic_commitoutcome_contrib.rda differ diff --git a/R/0731_topic_contriboutcome_contrib.rda b/R/0731_topic_contriboutcome_contrib.rda new file mode 100644 index 0000000..1881d7e Binary files /dev/null and b/R/0731_topic_contriboutcome_contrib.rda differ diff --git a/R/082424_contributing_ranef_plot.png b/R/082424_contributing_ranef_plot.png new file mode 100644 index 0000000..e1cfa91 Binary files /dev/null and b/R/082424_contributing_ranef_plot.png differ diff --git a/R/082424_doctype_wordcount.png b/R/082424_doctype_wordcount.png new file mode 100644 index 0000000..742e95b Binary files /dev/null and b/R/082424_doctype_wordcount.png differ diff --git a/R/082424_gam_document_introduction.png b/R/082424_gam_document_introduction.png new file mode 100644 index 0000000..37ecf5b Binary files /dev/null and b/R/082424_gam_document_introduction.png differ diff --git a/R/082424_readability_density.png b/R/082424_readability_density.png new file mode 100644 index 0000000..acb9177 Binary files /dev/null and b/R/082424_readability_density.png differ diff --git a/R/FOSSY_contributing_doc_intro.png b/R/FOSSY_contributing_doc_intro.png new file mode 100644 index 0000000..7e36bb6 Binary files /dev/null and b/R/FOSSY_contributing_doc_intro.png differ diff --git a/R/FOSSY_presentation_README_rt.png b/R/FOSSY_presentation_README_rt.png new file mode 100644 index 0000000..82d2dbd Binary files /dev/null and b/R/FOSSY_presentation_README_rt.png differ diff --git a/R/R.Rproj b/R/R.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/R/R.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/R/contribRDDAnalysis.R b/R/contribRDDAnalysis.R index d934e1c..1d9671e 100644 --- a/R/contribRDDAnalysis.R +++ b/R/contribRDDAnalysis.R @@ -45,9 +45,15 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg") all_actions_data$logged_count <- log(all_actions_data$count) all_actions_data$log1p_count <- log1p(all_actions_data$count) #EDA +sd(all_actions_data$count) +grouped_averages <- aggregate(all_actions_data$count, list(all_actions_data$upstream_vcs_link), mean) +quantile(grouped_averages$x) +quantile(all_actions_data$before_auth_new) +quantile(all_actions_data$after_auth_new) + range(all_actions_data$log1p_count) # 0.000000 6.745236 mean(all_actions_data$log1p_count) # 1.200043 -var(all_actions_data$log1p_count) # 1.753764 +sd(all_actions_data$log1p_count) median(all_actions_data$log1p_count) # 0.6931472 # now for merge mrg_actions_data$logged_count <- log(mrg_actions_data$count) @@ -58,12 +64,12 @@ library(optimx) library(lattice) #model print("fitting model") -all_gmodel <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + scaled_event_gap + (D * week_offset | upstream_vcs_link), - control=glmerControl(optimizer="bobyqa", - optCtrl=list(maxfun=2e5)), nAGQ=0, data=all_actions_data) +#all_gmodel <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + scaled_event_gap + (D * week_offset | upstream_vcs_link), +# control=glmerControl(optimizer="bobyqa", +# optCtrl=list(maxfun=2e5)), nAGQ=0, data=all_actions_data) #all_gmodel <- readRDS("0711_contrib_all.rda") summary(all_gmodel) -saveRDS(all_gmodel, "0711_contrib_all_01.rda") +#saveRDS(all_gmodel, "0711_contrib_all_01.rda") all_residuals <- residuals(all_gmodel) qqnorm(all_residuals) diff --git a/R/contrib_docChar_outcomes.R b/R/contrib_docChar_outcomes.R index d58ff33..1d9630d 100644 --- a/R/contrib_docChar_outcomes.R +++ b/R/contrib_docChar_outcomes.R @@ -1,5 +1,7 @@ #libraries -library(stringr) +library(readr) +library(tidyverse) +library(plyr) contrib_df <- read_csv("../final_data/deb_contrib_did.csv") contrib_pop_df <- read_csv("../final_data/deb_contrib_pop_change.csv") contrib_readability_df <- read_csv('../text_analysis/dwo_readability_contributing.csv') @@ -68,10 +70,12 @@ contrib_total_df <- contrib_pop_df |> contrib_total_df <- contrib_total_df|> join(summed_data, by="upstream_vcs_link") #outcome variable that is number of commits by number of new contributors -contrib_total_df$commit_by_contrib = contrib_total_df$summed_count * contrib_total_df$after_contrib_new +contrib_total_df$commit_by_contrib = contrib_total_df$summed_count + contrib_total_df$after_contrib_new * 2 contrib_total_df$logged_outcome = log1p(contrib_total_df$commit_by_contrib) +contrib_total_df$logged_contribs = log1p(contrib_total_df$after_contrib_new) +contrib_total_df$logged_commits = log1p(contrib_total_df$summed_count) # test regressions library(MASS) -lm1 <- glm.nb(summed_count ~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw + word_count, data = contrib_total_df) +lm1 <- glm.nb(logged_contribs ~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw + word_count, data = contrib_total_df) qqnorm(residuals(lm1)) summary(lm1) diff --git a/R/contrib_topic_outcomes.R b/R/contrib_topic_outcomes.R index fdc0c97..9b78b98 100644 --- a/R/contrib_topic_outcomes.R +++ b/R/contrib_topic_outcomes.R @@ -2,9 +2,12 @@ library(stringr) library(plyr) contrib_topics_df <- read_csv("../text_analysis/contrib_file_topic_distributions.csv") +colMeans(subset(contrib_topics_df, select = -filename)) contrib_df <- read_csv("../final_data/deb_contrib_did.csv") contrib_pop_df <- read_csv("../final_data/deb_contrib_pop_change.csv") +median(contrib_df$age_in_days) + #get the contribution count #some preprocessing and expansion col_order <- c("upstream_vcs_link", "age_in_days", "first_commit", "first_commit_dt", "event_gap", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") @@ -73,8 +76,19 @@ contrib_total_df <- contrib_total_df|> #outcome variable that is number of commits by number of new contributors contrib_total_df$commit_by_contrib = contrib_total_df$summed_count * contrib_total_df$after_contrib_new contrib_total_df$logged_outcome = log1p(contrib_total_df$commit_by_contrib) +contrib_total_df$logged_contrib = log1p(contrib_total_df$after_contrib_new) +contrib_total_df$logged_commits = log1p(contrib_total_df$summed_count) #running regressions library(MASS) -lm1 <- glm.nb(commit_by_contrib ~ t2+ t3, data = contrib_total_df) +contrib_ <- glm.nb(logged_contrib ~ t0 + t1 + t2 + t3, data = contrib_total_df) +commits_ <- glm.nb(logged_commits ~ t0 + t1 + t2 + t3, data = contrib_total_df) qqnorm(residuals(lm1)) -summary(lm1) +summary(contrib_) +summary(commits_) +texreg(list(contrib_, commits_), stars=NULL, digits=3, use.packages=FALSE, + custom.model.names=c( 'Contributions','Commits'), + custom.coef.names=c('(Intercept)', 'Topic 1', 'Topic 2', 'Topic 3'), + table=FALSE, ci.force = TRUE) + +saveRDS(commits_, "0731_topic_commitoutcome_contrib.rda") +saveRDS(contrib_, "0731_topic_contriboutcome_contrib.rda") diff --git a/R/delta_analysis.R b/R/delta_analysis.R index 98a09ea..60b3dfa 100644 --- a/R/delta_analysis.R +++ b/R/delta_analysis.R @@ -1,10 +1,22 @@ library(tidyverse) contrib_df <- read_csv("../final_data/deb_contrib_did.csv") readme_df <- read_csv("../final_data/deb_readme_did.csv") +median(readme_df$age_in_days) +sd(readme_df$age_in_days) + +median(contrib_df$age_in_days) +sd(contrib_df$age_in_days) + contrib_df <- contrib_df |> filter(event_gap >= 0) readme_df <- readme_df |> filter(event_gap >= 0) + +quantile(readme_df$age_in_days) +quantile(contrib_df$age_in_days) + +quantile(readme_df$event_gap) +quantile(contrib_df$event_gap) hist(readme_df$event_gap) median(readme_df$event_gap) diff --git a/R/documentReadabilityAnalysis.R b/R/documentReadabilityAnalysis.R index 47622f9..47fa3bb 100644 --- a/R/documentReadabilityAnalysis.R +++ b/R/documentReadabilityAnalysis.R @@ -10,6 +10,8 @@ try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) readme_df <- read_csv("../text_analysis/dwo_readability_readme.csv") contributing_df <- read_csv("../text_analysis/dwo_readability_contributing.csv") head(readme_df) +quantile(readme_df$reading_time) +quantile(contributing_df$reading_time) aggregate(readme_df[, 3:10], list(readme_df$subdir), median) aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median) #getting basic stats for the readme readability @@ -19,45 +21,61 @@ median(readme_df$mcalpine_eflaw) median(readme_df$reading_time) # establishing the color scheme subdirColors <- - setNames( c('firebrick1', 'forestgreen', 'cornflowerblue') + setNames( c('#31449c', '#4a7c85', '#c5db68') , levels(contributing_df$subdir) ) +readmeSubdirColors <- + setNames( c('#4a7c85', '#c5db68') + , levels(readme_df$subdir) ) #plotting linsear scoring readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + + geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.35, position="identity") + xlim(-30, 30) + theme_bw() #plotting readme reading ease readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + - scale_color_manual(values = subdirColors) + + geom_density(aes(fill=as.factor(subdir)), position="fill") + + scale_fill_manual(values = readmeSubdirColors) + xlim(-5, 90) + - ylab("readme density") + + labs(x= "Flesch Reading Ease", y= "README Density")+ guides(fill="none", color="none")+ theme_bw() readme_reading_ease #plotting readme reading time readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + - scale_color_manual(values = subdirColors) + + geom_density(aes(fill=as.factor(subdir)), position="fill") + + scale_fill_manual(values = readmeSubdirColors) + xlim(-5, 90) + - ylab("readme density") + + labs(x= "Reading Time (s)", y= NULL)+ + guides(fill="none", color="none")+ + theme_bw() +readme_reading_time_plot +readme_reading_time_no_group <- ggplot(readme_df, aes(x=reading_time)) + + geom_histogram(fill='forestgreen') + + xlim(-5, 190) + + ylab("Count of README Files") + + xlab("Reading Time (s)") + + ggtitle("Reading Time for README files from FLOSS Projects (n=2280)")+ guides(fill="none", color="none")+ theme_bw() +readme_reading_time_no_group + readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + - scale_color_manual(values = subdirColors) + - xlim(-5, 30) + - ylab("readme density") + + geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + + scale_fill_manual(values = readmeSubdirColors) + + xlim(-5, 90) + + labs(x= "Linsear Write Score", y= NULL)+ guides(fill="none", color="none")+ theme_bw() +readme_linsear_plot readme_mcalpine_eflaw <- ggplot(readme_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + - scale_color_manual(values = subdirColors) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + - xlim(-5, 60) + + geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + + scale_fill_manual(values = readmeSubdirColors) + + xlim(-5, 90) + + labs(x= "McAlpine EFLAW", y= NULL)+ guides(fill="none", color="none")+ - theme_bw() + theme_bw() #theme(axis.title.y=element_blank()) #plot of reading_ease #readme_df <- readme_df |> @@ -72,33 +90,64 @@ median(contributing_df$reading_time) median(contributing_df$linsear_write_formula) # plotting contributing linsear writing formula contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + - scale_color_manual(values = subdirColors) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + - xlim(-5, 30) + - guides(fill="none", color="none")+ - theme_bw() -# plotting contributing reading time -contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + - scale_color_manual(values = subdirColors) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + - xlim(-5, 90) + - ylab("contributing density") + - guides(fill="none", color="none")+ - theme_bw() -# plotting contributing mcalpine eflaw -contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + - scale_color_manual(values = subdirColors) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + - xlim(-5, 60) + - guides(fill="none", color="none")+ - theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + - geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + - scale_color_manual(values = subdirColors) + - ylab("contributing density") + + geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + + scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + + labs(x= NULL, y= NULL, fill="RE Grouping")+ xlim(-5, 90) + theme_bw() + - theme(legend.position = "top") + guides(fill="none", color="none") +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + + scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + + geom_density(aes(fill=as.factor(subdir)), position="fill") + + xlim(-5, 90) + + labs(x= NULL, y= NULL, fill="RE Grouping")+ + theme_bw() + + theme(legend.position = "inside", + legend.position.inside = c(.93, .93), + legend.justification = c("right", "top"), + legend.direction = "horizontal", + legend.margin = margin(6, 6, 6, 6)) +contributing_reading_time_plot +# plotting contributing mcalpine eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + + scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + + geom_density(aes(fill=as.factor(subdir)), alpha=0.35, position="identity") + + xlim(-5, 90) + + labs(x= NULL, y= NULL, fill="RE Grouping")+ + theme_bw() + + theme(legend.position = "inside", + legend.position.inside = c(.93, .93), + legend.justification = c("right", "top"), + legend.direction = "vertical", + legend.margin = margin(6, 6, 6, 6)) +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + + geom_density(aes(fill=as.factor(subdir)), position="fill") + + scale_fill_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + + labs(x= NULL, y="CONTRIBUTING Density", fill="RE Grouping")+ + xlim(-5, 90) + + theme_bw() + + guides(fill="none", color="none") contributing_reading_ease -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw, readme_reading_ease, readme_reading_time_plot, readme_linsear_plot, readme_mcalpine_eflaw, nrow = 2) +grid.arrange(contributing_reading_ease, contributing_reading_time_plot, readme_reading_ease, readme_reading_time_plot, nrow = 2) + +doctypeColors <- + setNames( c('#5da2d8', '#c7756a') + , c("CONTRIBUTING", "README")) +readme_df$type = "README" +contributing_df$type = "CONTRIBUTING" +all_df = rbind(readme_df, contributing_df) +length_plot_all <- ggplot(all_df, aes(x=word_count, group=as.factor(type))) + + geom_density(aes(fill = as.factor(type)), color = NA, alpha=0.6, position="identity")+ + scale_fill_manual(values = doctypeColors) + + xlim(-10, 500) + + labs( + x = "Word Count", + y = "Density Across Documents", + fill="Document Type" + ) + + theme_bw() + + theme(legend.position = "top") +length_plot_all +#grid.arrange(contributing_reading_time_plot, readme_reading_time_plot, nrow = 1) diff --git a/R/document_word_count.png b/R/document_word_count.png new file mode 100644 index 0000000..88d6176 Binary files /dev/null and b/R/document_word_count.png differ diff --git a/R/fossy24_plot.png b/R/fossy24_plot.png new file mode 100644 index 0000000..dbbaed8 Binary files /dev/null and b/R/fossy24_plot.png differ diff --git a/R/gam_plot_documents.R b/R/gam_plot_documents.R index 380b2d5..e7593eb 100644 --- a/R/gam_plot_documents.R +++ b/R/gam_plot_documents.R @@ -50,21 +50,52 @@ windowed_readme_data <- expanded_readme_data |> windowed_contrib_data$week_offset <- windowed_contrib_data$week - 27 all_actions_contrib_data <- windowed_contrib_data[which(windowed_contrib_data$observation_type == "all"),] -all_actions_contrib_data$document_type <- rep("contributing", length(all_actions_contrib_data$count)) +all_actions_contrib_data$document_type <- rep("CONTRIBUTING", length(all_actions_contrib_data$count)) windowed_readme_data$week_offset <- windowed_readme_data$week - 27 all_actions_readme_data <- windowed_readme_data[which(windowed_readme_data$observation_type == "all"),] -all_actions_readme_data$document_type <- rep("readme", length(all_actions_readme_data$count)) +all_actions_readme_data$document_type <- rep("README", length(all_actions_readme_data$count)) all_actions_data <- rbind(all_actions_contrib_data, all_actions_readme_data) all_actions_data$log1p_count <- log1p(all_actions_data$count) +library(scales) +expm1_trans <- trans_new( + name = 'expm1', + transform = function(x) expm1(x), + inverse = function(x) log1p(x) +) + +doctypeColors <- + setNames( c('#5da2d8', '#c7756a') + , c("CONTRIBUTING", "README")) + time_plot <- all_actions_data |> - ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + + ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + + labs(x="Weekly Offset", y="Commit Count", color="Document Type") + + scale_color_manual(values = doctypeColors) + geom_smooth() + geom_vline(xintercept = 0)+ theme_bw() + theme(legend.position = "top") time_plot +#code to change the axes + +#scale_y_continuous(breaks = c(0, 0.5, 1.0, 1.5), +# labels = round(c(expm1(0), expm1(0.5), expm1(1.0), expm1(1.5)), 1)) + + #looking at event gap mean(all_actions_readme_data$event_gap) sd(all_actions_readme_data$event_gap) mean(all_actions_contrib_data$event_gap) sd(all_actions_contrib_data$event_gap) + +#all_actions_contrib_data$log1p_count <- log1p(all_actions_contrib_data$count) +#contrib_time_plot <- all_actions_contrib_data |> +# ggplot(aes(x=week_offset, y=log1p_count)) + +# geom_smooth(color=forestgreen) + +# geom_vline(xintercept = 0)+ +# annotate("text", x=3, y=1, label="CONTRIBUTING.md Publication", angle=0)+ +# theme_bw() + +# ylab("Log Transformed Count of Contributions") + +# xlab("Offset Weeks") + +# theme(legend.position = "top") +#contrib_time_plot + diff --git a/R/model_presentation.R b/R/model_presentation.R index 1aaecf7..5da839b 100644 --- a/R/model_presentation.R +++ b/R/model_presentation.R @@ -11,10 +11,10 @@ texreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, table=FALSE, ci.force = TRUE) readme_groupings <- read.csv('../final_data/deb_readme_interaction_groupings.csv') -contrib_groupings <- read.csv('../final_data/0711_contrib_inter_groupings.csv') +contrib_groupings <- read.csv('../final_data/deb_contrib_interaction_groupings.csv') subdirColors <- - setNames( c('firebrick1', 'forestgreen', 'cornflowerblue') + setNames( c('#31449c', '#4a7c85', '#c5db68') , c(0,1,2) ) readme_g <- readme_groupings |> @@ -22,16 +22,18 @@ readme_g <- readme_groupings |> geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + scale_color_manual(values = subdirColors) + guides(fill="none", color="none")+ - theme_bw() + theme_bw() + + labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") readme_g contrib_g <- contrib_groupings |> ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + - scale_color_manual(values = subdirColors) + + scale_color_manual(values = subdirColors, labels=c('CI < 0', '0 in CI', '0 < CI')) + theme_bw() + - theme(legend.position = "top") + theme(legend.position = "top") + + labs(x="RE Coefficient Rank", y="RE Coefficient Estimate", color="Estimate Grouping") contrib_g library(gridExtra) diff --git a/R/readmeRDDAnalysis.R b/R/readmeRDDAnalysis.R index bd9b6a7..c5a3d65 100644 --- a/R/readmeRDDAnalysis.R +++ b/R/readmeRDDAnalysis.R @@ -47,6 +47,10 @@ mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg") all_actions_data$logged_count <- log(all_actions_data$count) all_actions_data$log1p_count <- log1p(all_actions_data$count) range(all_actions_data$log1p_count) + +grouped_averages <- aggregate(all_actions_data$count, list(all_actions_data$upstream_vcs_link), mean) +quantile(grouped_averages$x) + # 3 rdd in lmer analysis # rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design # lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc @@ -57,14 +61,16 @@ library(lattice) #some more EDA to go between Poisson and neg binomial var(all_actions_data$log1p_count) # 1.125429 mean (all_actions_data$log1p_count) # 0.6426873 +sd(all_actions_data$log1p_count) median(all_actions_data$log1p_count) #0 var(all_actions_data$count) # 268.4449 mean (all_actions_data$count) # 3.757298 +sd (all_actions_data$count) median(all_actions_data$count) # 0 print("fitting model") #all_log1p_gmodel <- glmer.nb(log1p_count ~ D * week_offset+ scaled_project_age + scaled_event_gap + (D * week_offset | upstream_vcs_link), data=all_actions_data, nAGQ=1, control=glmerControl(optimizer="bobyqa", # optCtrl=list(maxfun=1e5))) -all_log1p_gmodel <- readRDS("final_models/0624_readme_all_rdd.rda") +#all_log1p_gmodel <- readRDS("final_models/0624_readme_all_rdd.rda") summary(all_log1p_gmodel) print("model fit") #I grouped the ranef D effects on 0624 diff --git a/R/readme_docChar_outcomes.R b/R/readme_docChar_outcomes.R index bf2e4b8..e418f5e 100644 --- a/R/readme_docChar_outcomes.R +++ b/R/readme_docChar_outcomes.R @@ -1,5 +1,6 @@ #libraries library(stringr) +library(tidyverse) readme_df <- read_csv("../final_data/deb_readme_did.csv") readme_pop_df <- read_csv("../final_data/deb_readme_pop_change.csv") readme_readability_df <- read_csv('../text_analysis/dwo_readability_readme.csv') @@ -94,9 +95,11 @@ readme_total_df <- readme_total_df|> readme_total_df$commit_by_contrib = NA readme_total_df$commit_by_contrib = readme_total_df$summed_count * (readme_total_df$after_contrib_new + 1) readme_total_df$logged_outcome = log1p(readme_total_df$commit_by_contrib) +readme_total_df$logged_contribs = log1p(readme_total_df$after_contrib_new) +readme_total_df$logged_commits= log1p(readme_total_df$summed_count) # test regressions library(MASS) -lm1 <- glm.nb(logged_outcome~ reading_time + linsear_write_formula + flesch_reading_ease + mcalpine_eflaw + word_count, data = readme_total_df) +lm1 <- glm.nb(logged_commits~ word_count, data = readme_total_df) qqnorm(residuals(lm1)) summary(lm1) diff --git a/R/readme_topic_outcomes.R b/R/readme_topic_outcomes.R index 1d1882d..67099df 100644 --- a/R/readme_topic_outcomes.R +++ b/R/readme_topic_outcomes.R @@ -1,7 +1,8 @@ library(stringr) -library(plyr) +library(tidyverse) readme_topics_df <- read_csv("../text_analysis/readme_file_topic_distributions.csv") +colMeans(subset(readme_topics_df, select = -filename)) readme_df <- read_csv("../final_data/deb_readme_did.csv") readme_pop_df <- read_csv("../final_data/deb_readme_pop_change.csv") @@ -72,9 +73,21 @@ readme_total_df <- readme_total_df|> #outcome variable that is number of commits by number of new readmeutors readme_total_df$commit_by_contrib = readme_total_df$summed_count *readme_total_df$after_contrib_new -readme_total_df$logged_outcome = log1p(readme_total_df$commit_by_readme) +readme_total_df$logged_outcome = log1p(readme_total_df$commit_by_contrib) +readme_total_df$logged_contrib = log1p(readme_total_df$after_contrib_new) +readme_total_df$logged_commits = log1p(readme_total_df$summed_count) #running regressions library(MASS) -lm1 <- glm.nb(commit_by_contrib ~ t0+t1+t2+t7+t3 +t4 + t5, data = readme_total_df) +lm1 <- glm.nb(logged_contrib~ t0+t1+t2+t7+t3 +t6 + t5, data = readme_total_df) qqnorm(residuals(lm1)) summary(lm1) +#saveRDS(lm1, "0725_topic_contriboutcome_readme.rda") +contrib_ <- glm.nb(logged_contrib~ t0+t1+t2+t3+ t5 +t6 +t7, data = readme_total_df) +commit_ <- glm.nb(logged_commits~ t0+t1+t2+t3+ t5 +t6 +t7, data = readme_total_df) + +library(texreg) + +texreg(list(contrib_, commit_), stars=NULL, digits=3, use.packages=FALSE, + custom.model.names=c( 'Contributions','Commits'), + custom.coef.names=c('(Intercept)', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 6', 'Topic 7', 'Topic 8'), + table=FALSE, ci.force = TRUE) diff --git a/R/topic_distributions.R b/R/topic_distributions.R new file mode 100644 index 0000000..3bb2ce6 --- /dev/null +++ b/R/topic_distributions.R @@ -0,0 +1,42 @@ +library(stringr) +library(tidyverse) +readme_topics_df <- read_csv("../text_analysis/readme_file_topic_distributions.csv") +contrib_topics_df <- read_csv("../text_analysis/contrib_file_topic_distributions.csv") +quantile(contrib_topics_df$t0) +confint(contrib_topics_df$t0) +sd(contrib_topics_df$t0) +quantile(contrib_topics_df$t1) +mean(contrib_topics_df$t1) +sd(contrib_topics_df$t1) +quantile(contrib_topics_df$t2) +mean(contrib_topics_df$t2) +sd(contrib_topics_df$t2) +quantile(contrib_topics_df$t3) +mean(contrib_topics_df$t3) +sd(contrib_topics_df$t3) +quantile(readme_topics_df$t0) +mean(readme_topics_df$t0) +sd(readme_topics_df$t0) +quantile(readme_topics_df$t1) +mean(readme_topics_df$t1) +sd(readme_topics_df$t1) +quantile(readme_topics_df$t2) +mean(readme_topics_df$t2) +sd(readme_topics_df$t2) +quantile(readme_topics_df$t3) +mean(readme_topics_df$t3) +sd(readme_topics_df$t3) +quantile(readme_topics_df$t4) +mean(readme_topics_df$t4) +sd(readme_topics_df$t4) +quantile(readme_topics_df$t5) +mean(readme_topics_df$t5) +sd(readme_topics_df$t5) +quantile(readme_topics_df$t6) +mean(readme_topics_df$t6) +sd(readme_topics_df$t6) +quantile(readme_topics_df$t7) +mean(readme_topics_df$t7) +sd(readme_topics_df$t7) + + diff --git a/main.py b/main.py index e6465f8..3b7b8e5 100644 --- a/main.py +++ b/main.py @@ -103,17 +103,3 @@ if __name__ == "__main__": -''' -if largest_object[repo_path]["gha_obj"]['milestone_count'] == 0: - #del largest_object[repo_path] - #return - #this is to ensure that projects which don't use milestones are counted - largest_object[repo_path]["gha_obj"]['milestone_count'] = 0.1 -largest_object[repo_path]['new_mmt'] = compute_new_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators']) -#print('New MMT: ' + str(largest_object[repo_path]['new_mmt'])) -largest_object[repo_path]['old_mmt'] = compute_old_mmt(largest_object[repo_path]["perceval_obj"]['contributors'], largest_object[repo_path]["perceval_obj"]['collaborators']) -#print('Old MMT: ' + str(largest_object[repo_path]['old_mmt'])) -#new mmt formality score -largest_object[repo_path]['new_formality'] = compute_formality_score(largest_object[repo_path]['new_mmt'], largest_object[repo_path]["gha_obj"]['milestone_count'], largest_object[repo_path]["perceval_obj"]['age_of_project']) -print(largest_object[repo_path]['new_formality']) -'''