updates to some of the R scripts

mjgaughan 2024-06-19 19:40:22 -04:00
parent b48a684185
commit 379a8929a5
7 changed files with 6287 additions and 3518 deletions

BIN R/.RData (binary file not shown)

@@ -1,330 +1,450 @@
  mutate(crescendo_limit = ifelse(week_offset < (-4), 0, 1))|>
cor.test(crescendo_limit, count)
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- windowed_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-2), 0, 1))
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- windowed_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the presce
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
theme_bw()
test_glmer_ranef_D |>
  ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
  geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
  theme_bw()
summary(all_gmodel)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
test_glmer_ranef_D <- test_condvals[which(test_condvals$term == "D"),]
has_zero <- function(estimate, low, high){
  return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
}
test_glmer_ranef_D <- test_glmer_ranef_D |>
  mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
  mutate(rank = rank(estimate))
test_glmer_ranef_D |>
  ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
  geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
  theme_bw()
View(test_glmer_ranef_D)
View(test_condvals)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
View(test_condvals)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = Poisson)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = poisson)
summary(all_gmodel)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family = poisson)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
test_glmer_ranef_D <- test_condvals[which(test_condvals$term == "D"),]
has_zero <- function(estimate, low, high){
return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
}
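A note on the helper, since it drives the grouping in the plots below: my reading of has_zero, illustrated with hypothetical interval values.
# has_zero classifies a confidence interval by its position relative to zero:
# has_zero(0, -0.5, 0.5)   # 1: the interval spans zero
# has_zero(0, -0.9, -0.1)  # 0: the interval sits entirely below zero
# has_zero(0, 0.2, 0.9)    # 2: the interval sits entirely above zero
# (the estimate argument is accepted but unused)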
test_glmer_ranef_D <- test_glmer_ranef_D |>
mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
mutate(rank = rank(estimate))
test_glmer_ranef_D |>
ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
theme_bw()
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = poisson)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
test_glmer_ranef_D <- test_condvals[which(test_condvals$term == "D"),]
has_zero <- function(estimate, low, high){
return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
}
test_glmer_ranef_D <- test_glmer_ranef_D |>
mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
mutate(rank = rank(estimate))
test_glmer_ranef_D |>
ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
theme_bw()
variance(all_actions_data$log1p_count)
var(all_actions_data$log1p_count)
mean (all_actions_data$log1p_count)
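Since var() and mean() are being compared just above, a dispersion check along these lines (a sketch, not in the original) makes the motivation for the negative binomial models explicit.
# For a Poisson outcome the variance should be close to the mean, so a
# ratio well above 1 signals overdispersion and favors glmer.nb below.
var(all_actions_data$count) / mean(all_actions_data$count)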
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link),data=all_actions_data)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link),
control=glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun=2e5)), data=all_actions_data)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link),
control=glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun=2e5)), data=all_actions_data)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
test_glmer_ranef_D <- test_condvals[which(test_condvals$term == "D"),]
has_zero <- function(estimate, low, high){
return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
}
test_glmer_ranef_D <- test_glmer_ranef_D |>
mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
mutate(rank = rank(estimate))
test_glmer_ranef_D |>
ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
theme_bw()
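For orientation (my annotation, not the author's): this ranked linerange figure is a caterpillar plot of the per-project conditional random slopes for D.
# Reading the plot: each vertical segment is one project's conditional CI for
# the D slope, ranked by estimate; color encodes has_zero's grouping
# (0 = entirely negative, 1 = spans zero, 2 = entirely positive).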
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
#all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link),
# control=glmerControl(optimizer="bobyqa",
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link), data=all_actions_data)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
#all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link),
# control=glmerControl(optimizer="bobyqa",
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link), data=all_actions_data, verbose=TRUE)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
#all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link),
# control=glmerControl(optimizer="bobyqa",
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link), data=all_actions_data)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
#all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link),
# control=glmerControl(optimizer="bobyqa",
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data)
library(tidyverse)
library(plyr)
library(stringr)
#get the contrib data instead
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
#load in data
contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")
#some expansion needs to happen for each project
contrib_df <- read_csv("../final_data/deb_contrib_did.csv")
#some preprocessing and expansion
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
contrib_df <- contrib_df[,col_order]
contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ")
contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ")
contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ")
contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
contrib_df = contrib_df[,!(names(contrib_df) %in% drop)]
# 2 some expansion needs to happen for each project
expand_timeseries <- function(project_row) {
  longer <- project_row |>
    pivot_longer(cols = ends_with("new"),
                 names_to = "window",
                 values_to = "count") |>
    unnest(count) |>
    mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
    mutate(is_collab = as.numeric(str_detect(window, "collab")))
  return(longer)
}
expand_timeseries <- function(project_row) {
  longer <- project_row |>
    pivot_longer(cols = starts_with("ct"),
                 names_to = "window",
                 values_to = "count") |>
    unnest(count)
  longer$observation_type <- gsub("^.*_", "", longer$window)
  longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
  longer$count <- as.numeric(longer$count)
  #longer <- longer[which(longer$observation_type == "all"),]
  return(longer)
}
expanded_readme_data <- expand_timeseries(readme_df[1,])
for (i in 2:nrow(readme_df)){
  expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,]))
}
expanded_contrib_data <- expand_timeseries(contrib_df[1,])
for (i in 2:nrow(contrib_df)){
  expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
}
expanded_data <- expand_timeseries(contrib_df[1,])
for (i in 2:nrow(contrib_df)){
  expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,]))
}
expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
expanded_readme_data$logcount <- log(expanded_readme_data$count)
expanded_contrib_data$logcount <- log(expanded_contrib_data$count)
#breaking out the types of population counts
collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),]
contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),]
collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),]
contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),]
#import models
library(lme4)
library(optimx)
collab_readme_model <- lmer(log1pcount ~ after_doc + (1| upstream_vcs_link), data=collab_pop_readme, REML=FALSE)
collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (1| upstream_vcs_link), data=collab_pop_readme)
summary(collab_readme_model)
crm_residuals <- residuals(collab_readme_model)
qqnorm(crm_residuals)
collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_readme)
summary(collab_readme_model)
crm_residuals <- residuals(collab_readme_model)
qqnorm(crm_residuals)
saveRDS(collab_readme_model, "0510_pop_rm_collab.rda")
contrib_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_readme)
summary(contrib_readme_model)
saveRDS(contrib_readme_model, "0510_pop_rm_contrib.rda")
collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_contrib)
summary(collab_contrib_model)
saveRDS(collab_contrib_model, "0510_pop_contrib_collab.rda")
contrib_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_contrib)
summary(contrib_contrib_model)
saveRDS(contrib_contrib_model, "0510_pop_contrib_contrib.rda")
summary(collab_readme_model)
summary(contrib_readme_model)
qqnorm(crm_residuals)
conrm_residuals <- residuals(contrib_readme_model)
qqnorm(conrm_residuals)
summary(collab_contrib_model)
summary(contrib_contrib_model)
#filter out the windows of time that we're looking at
window_num <- 8
windowed_data <- expanded_data |>
  filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
  mutate(D = ifelse(week > 27, 1, 0))
#scale the age numbers and calculate the week offset here
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
windowed_data$week_offset <- windowed_data$week - 27
#break out the different types of commit actions
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#logging
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
# now for merge
mrg_actions_data$logged_count <- log(mrg_actions_data$count)
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
# this is the file with the lmer multi-level rddAnalysis
library(tidyverse)
library(plyr)
# 0 loading the readme data in
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../final_data/deb_readme_did.csv")
# 1 preprocessing
#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
readme_df <- readme_df[,col_order]
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
readme_df = readme_df[,!(names(readme_df) %in% drop)]
# 2 some expansion needs to happen for each project
expand_timeseries <- function(project_row) {
longer <- project_row |>
pivot_longer(cols = starts_with("ct"),
names_to = "window",
values_to = "count") |>
unnest(count)
longer$observation_type <- gsub("^.*_", "", longer$window)
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
longer$count <- as.numeric(longer$count)
#longer <- longer[which(longer$observation_type == "all"),]
return(longer)
}
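As a sketch of what expand_timeseries returns, here is a toy one-row input with hypothetical values; the ct_* list-columns unnest to one row per week, tagged by observation_type.
toy_row <- tibble(upstream_vcs_link = "example_repo", age_of_project = 1000,
                  ct_before_all = list(c("1", "2")), ct_after_all = list(c("3", "4")))
expand_timeseries(toy_row)
# -> four rows: count 1,2,3,4; observation_type "all"; week 0,1,2,3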
expanded_data <- expand_timeseries(readme_df[1,])
for (i in 2:nrow(readme_df)){
expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,]))
}
#filter out the windows of time that we're looking at
window_num <- 8
windowed_data <- expanded_data |>
filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
mutate(D = ifelse(week > 27, 1, 0))
#scale the age numbers
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
windowed_data$week_offset <- windowed_data$week - 27
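Spelling out the window arithmetic (an annotation, matching the code above):
# window_num = 8 keeps weeks 19 through 35 around the event at week 27;
# D is 0 through week 27 and 1 from week 28 on; week_offset recenters the
# series so the event week is 0, giving offsets -8 through +8.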
#break out the different types of commit actions that are studied
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#log the dependent
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
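One clarifying note on crescendo_limit (my reading of the cutoffs used here and below):
# With the cutoff at -1, weeks -8..-2 get crescendo_limit = 0 and weeks -1..0
# get 1, so the test asks whether counts rise in the last two weeks before
# publication; the -3 variant below widens that final bucket to four weeks.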
lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
crescendow_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
summary(crescendo_huh)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
summary(crescendo_huh)
library(tidyverse)
library(plyr)
#get the contrib data instead
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
contrib_df <- read_csv("../final_data/deb_contrib_did.csv")
#some preprocessing and expansion
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
contrib_df <- contrib_df[,col_order]
contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ")
contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ")
contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ")
contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
contrib_df = contrib_df[,!(names(contrib_df) %in% drop)]
# 2 some expansion needs to happen for each project
expand_timeseries <- function(project_row) {
longer <- project_row |>
pivot_longer(cols = starts_with("ct"),
names_to = "window",
values_to = "count") |>
unnest(count)
longer$observation_type <- gsub("^.*_", "", longer$window)
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
longer$count <- as.numeric(longer$count)
#longer <- longer[which(longer$observation_type == "all"),]
return(longer)
}
expanded_data <- expand_timeseries(contrib_df[1,])
for (i in 2:nrow(contrib_df)){
expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,]))
}
#filter out the windows of time that we're looking at
window_num <- 8
windowed_data <- expanded_data |>
filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
mutate(D = ifelse(week > 27, 1, 0))
#scale the age numbers and calculate the week offset here
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
windowed_data$week_offset <- windowed_data$week - 27
#break out the different types of commit actions
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#logging
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
# now for merge
mrg_actions_data$logged_count <- log(mrg_actions_data$count)
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
summary(crescendo_huh)
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
summary(crescendo_huh)
# this is the file with the lmer multi-level rddAnalysis
library(tidyverse)
library(plyr)
# 0 loading the readme data in
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../final_data/deb_readme_did.csv")
# 1 preprocessing
#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
readme_df <- readme_df[,col_order]
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
readme_df = readme_df[,!(names(readme_df) %in% drop)]
# 2 some expansion needs to happen for each project
expand_timeseries <- function(project_row) {
longer <- project_row |>
pivot_longer(cols = starts_with("ct"),
names_to = "window",
values_to = "count") |>
unnest(count)
longer$observation_type <- gsub("^.*_", "", longer$window)
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
longer$count <- as.numeric(longer$count)
#longer <- longer[which(longer$observation_type == "all"),]
return(longer)
}
expanded_data <- expand_timeseries(readme_df[1,])
for (i in 2:nrow(readme_df)){
expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,]))
}
#filter out the windows of time that we're looking at
window_num <- 8
windowed_data <- expanded_data |>
filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
mutate(D = ifelse(week > 27, 1, 0))
#scale the age numbers
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
windowed_data$week_offset <- windowed_data$week - 27
#break out the different types of commit actions that are studied
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#log the dependent
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
summary(crescendo_huh)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
summary(crescendo_huh)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
summary(crescendo_huh)
library(tidyverse)
library(plyr)
# script for the analysis of document readability metrics
# readability metrics will be studied controlled by their length
# gaughan@u.northwestern.edu
# loading in the data
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
head(readme_df)
readme_df <- readme_df |>
mutate(coef_grouping <- as.factor(subdir))
cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease)
readme_df <- readme_df |>
mutate(coef_grouping <- as.factor(subdir))
cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease)
cor(readme_df$coef_grouping, readme_df$flesch_reading_ease)
readme_df <- readme_df |>
mutate(coef_grouping <- as.factor(subdir))
test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df)
readme_df <- readme_df |>
mutate(coef_grouping <- as.factor(subdir))
test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df)
test_lm <- lm(flesch_reading_ease ~ subdir,data=readme_df)
summary(test_lm)
test_lm <- lm(flesch_reading_ease ~ as.factor(subdir),data=readme_df)
summary(test_lm)
head(readme_df)
test_lm <- lm(flesch_reading_ease ~ char_count + as.factor(subdir),data=readme_df)
summary(test_lm)
head(readme_df)
test_lm <- lm(linsear_write_formula ~ char_count + as.factor(subdir),data=readme_df)
summary(test_lm)
head(readme_df)
test_lm <- lm(mcalpine_eflaw ~ char_count + as.factor(subdir),data=readme_df)
summary(test_lm)
test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
summary(test_lm)
aggregate(readme_df[, 3:11], list(readme_df$subdir), mean)
aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), mean)
library(tidyverse)
library(plyr)
# script for the analysis of document readability metrics
# readability metrics will be studied controlled by their length
# gaughan@u.northwestern.edu
# loading in the data
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
head(readme_df)
aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
aggregate(readme_df[, 3:10], list(readme_df$subdir), median)
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)
rm(list=ls())
set.seed(424242)
library(readr)
library(ggplot2)
expanded_readme_data |>
  ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) +
  geom_point() + geom_jitter()
expanded_readme_data |>
  ggplot(aes(x = after_doc, y = count, col = as.factor(is_collab))) +
  geom_point() + geom_jitter()
expanded_readme_data |>
  ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) +
  geom_point() + geom_jitter()
#primary analysis for cross-sectional community metrics
library(tidyverse)
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)
#age_vector <- overall_data$age_of_project/365
#quantile(age_vector)
overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
#the basic stuff for the overall data
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)
#some new variables around age
#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
#table(overall_data$new.age)
#overall_data$new.age.factor <- as.factor(overall_data$new.age)
overall_data$scaled_age <- scale(overall_data$age_of_project)
#model
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
summary(mmtmodel1)
qqnorm(residuals(mmtmodel1))
summary(mmtmodel1)
octo_data$scaled_age <- scale(octo_data$age_of_project)
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
# below this is the analysis for the octo data
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
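For reference, a worked instance of the mmt formula (a sketch with hypothetical counts):
# mmt weights collaborators double: with 3 collaborators and 9 contributors,
# ((3 * 2) + 9) / (9 + 3) = 1.25; the value runs from 1 (contributors only)
# to 2 (collaborators only), and issue_mmt/wiki_mmt follow the same pattern.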
g4 <- ggplot(octo_data)
g4
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
summary(octo_mmtmodel1)
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
summary(issue_mmtmodel1)
sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + scaled_age, data=octo_data)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age, data=octo_data)
summary(wiki_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
qqnorm(residuals(wiki_mmtmodel1))
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
library(texreg) #my little "lib"
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'scaled_age', 'Wiki'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
summary(octo_mmtmodel1)
summary(wiki_mmtmodel1)
#left skewed data, need to transform
sum(is.na(octo_data$wiki_mmt))
#left skewed data, need to transform
sum(is.na(octo_data$issue_mmt))
#left skewed data, need to transform
sum(is.na(octo_data$mmt))
test_frame <- na.omit(octo_data)
#left skewed data, need to transform
sum(is.na(octo_data$issue_contrib_count))
#left skewed data, need to transform
sum(is.na(octo_data$wiki_contrib_count))
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
#left skewed data, need to transform
typeof(octo_data$wiki_contrib_count)
View(octo_data)
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$api_contrib_count + octo_data$file_contrib_count + octo_data$wiki_contrib_count)) / (octo_data$api_contrib_count + octo_data$file_contrib_count + octo_data$wiki_contrib_count + octo_data$issue_contrib_count)
sum(is.na(octo_data$issue_mmt))
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
sum(is.na(octo_data$issue_mmt))
sum(octo_data$total_contrib == 0)
#clean octo data
octo_data <- filter(octo_data, total_contrib == 0)
sum(octo_data$total_contrib == 0)
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
#clean octo data
octo_data <- filter(octo_data, total_contrib != 0)
octo_data$scaled_age <- scale(octo_data$age_of_project)
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)
#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
#right skewed data, need to transform
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
hist(octo_data$wiki_mmt)
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
qqnorm(residuals(issue_mmtmodel1))
sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + scaled_age, data=octo_data)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age, data=octo_data)
summary(wiki_mmtmodel1)
qqnorm(residuals(wiki_mmtmodel1))
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'scaled_age', 'Wiki'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'Issue MMT', 'Wiki MMT'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
qqnorm(residuals(wiki_mmtmodel1))
View(octo_data)
#TODO: find the overlap between projects with octo data and projects with readmes or contributings
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
octo_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link))
View(octo_data)
octo_data <- octo_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
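A small illustration of the indicator construction above (hypothetical links): %in% returns a logical vector and as.numeric() recodes it to 0/1.
as.numeric(c("repoA", "repoB") %in% c("repoB", "repoC"))  # returns 0 1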
View(octo_data)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$has_readme)
cor.test(octo_data$mmt, octo_data$has_contributing)
cor.test(octo_data$mmt, octo_data$has_contrib)
issues_expansion <- lm(issue_mmt ~ has_readme + scaled_age, data=octo_data)
summary(issues_expansion)
issues_expansion <- lm(issue_mmt ~ has_contrib + scaled_age, data=octo_data)
summary(issues_expansion)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$scaled_age)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$scaled_age)
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contributing, data=octo_data)
octo_data <- octo_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contributing, data=octo_data)
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(wiki_mmtmodel1)
#getting some of the information in about whether projects have specific files
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
octo_data <- octo_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
overall_data <- overall_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(wiki_mmtmodel1)
qqnorm(residuals(wiki_mmtmodel1))
#these next three are looking at mmt as an outcome of other factors
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
summary(mmt_outcome_model)
library(texreg) #my little "lib"
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
       custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
       custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
summary(govdoc_issuesmmt)
View(octo_data)
octo_cleaned <- octo_data[octo_data$issue_mmt != NaN]
octo_cleaned <- octo_data[!is.nan(octo_data$issue_mmt),]
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
qqnorm(residuals(wiki_mmtmodel1))
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$issue_mmt)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$wiki_mmt)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$has_readme)
cor.test(octo_data$has_readme, octo_data$has_contrib)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
summary(wiki_mmtmodel1)
write.csv(octo_cleaned,"cleaned_octo.csv", row.names = FALSE)
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
       custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
       custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
rm(list=ls())
set.seed(424242)
library(readr)
library(ggplot2)
library(tidyverse)
@@ -335,178 +455,58 @@ readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types =
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
#the basic stuff for the overall data
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
#model
overall_data$scaled_age <- scale(overall_data$age_of_project)
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
summary(mmtmodel1)
qqnorm(residuals(mmtmodel1))
#clean octo data
octo_data <- filter(octo_data, total_contrib != 0)
#some new variables around age
#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
#table(overall_data$new.age)
#overall_data$new.age.factor <- as.factor(overall_data$new.age)
overall_data$scaled_age <- scale(overall_data$age_of_project)
#model
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
octo_data$scaled_age <- scale(octo_data$age_of_project)
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)
#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
#right skewed data, need to transform
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
#getting some of the information in about whether projects have specific files
#find the overlap between projects with octo data and projects with readmes or contributings
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
octo_data <- octo_data |>
  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)
mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(wiki_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
overall_data <- overall_data |>
  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(wiki_mmtmodel1)
library(texreg) #my little "lib"
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
       custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
       custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
#now large MMT model taking into account having contributing or README
mmtmodel2 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=overall_data)
summary(mmtmodel2)
qqnorm(residuals(mmtmodel2))
summary(mmtmodel2)
all_mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = overall_data)
summary(all_mmt_outcome_model)
#pulling in the group data for the ranef coefficients
rm_grouping <- read_csv('../051224_readme_grouped.csv',show_col_types = FALSE)
#pulling in the group data for the ranef coefficients
rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
View(contrib_grouping)
View(rm_grouping)
View(readme_did_roster)
grouped_rm <- left_join(rm_grouping, overall_data, by = c("level","upstream_vcs_link"))
rm_grouping <- rm_grouping |>
  rename(upstream_vcs_link = level)
View(rm_grouping)
grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link")
View(grouped_rm)
contrib_grouping <- contrib_grouping |>
  rename(upstream_vcs_link = level)
grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link")
View(grouped_rm)
#analyses
cor.test(grouped_rm$mmt, grouped_rm$ranef_grouping)
cor.test(grouped_contrib$mmt, grouped_contrib$ranef_grouping)
#analyses
cor.test(grouped_rm$underproduction_mean, grouped_rm$ranef_grouping)
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$ranef_grouping)
#analyses
cor.test(grouped_rm$underproduction_mean, grouped_rm$estimate)
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$estimate)
View(grouped_rm)
#test with linear model
grouping_model <- lm(underproduction_mean ~ estimate + scaled_age, data=grouped_rm)
summary(grouping_model)
#test with linear model
grouping_model <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_rm)
summary(grouping_model)
#test with linear model
grouping_model <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_rm)
summary(grouping_model)
grouping_model_contrib <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
#test with linear model
grouping_model_rm <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_rm)
summary(grouping_model_rm)
grouping_model_contrib <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
#test with linear model
grouping_model_rm <- glm.nb(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_rm)
#pulling in the group data for the ranef coefficients
rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
rm_grouping <- rm_grouping |>
rename(upstream_vcs_link = level)|>
mutate(factored_group = as.factor(ranef_grouping))
contrib_grouping <- contrib_grouping |>
rename(upstream_vcs_link = level) |>
mutate(factored_group = as.factor(ranef_grouping))
grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link")
grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link")
#analyses
cor.test(grouped_rm$underproduction_mean, grouped_rm$factored_group)
#test with linear model
grouping_model_rm <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_rm)
summary(grouping_model_rm)
grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
summary(grouping_model_rm)
grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
qqnorm(residuals(grouping_model_rm))
qqnorm(residuals(grouping_model_contrib))
rm_did <- read_csv('../final_data/deb_readme_did.csv',show_col_types = FALSE)
contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE)
grouped_rm <- left_join(grouped_rm, rm_did, by="upstream_vcs_link")
grouped_contrib <- left_join(grouped_contrib, contrib_did, by="upstream_vcs_link")
#calculate in terms of July 6, 2020
typeof(event_date)
#calculate in terms of July 6, 2020
typeof(grouped_rm$event_date)
#calculate in terms of July 6, 2020
typeof(as.Date(grouped_rm$event_date))
how_long_has_file <- as.Date("2020-07-06") - as.Date(grouped_rm$event_date)
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date))
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days")
#calculate in terms of July 6, 2020
grouped_rm$event_date
#calculate in terms of July 6, 2020
dates <- as.POSIXct(grouped_rm$event_date,tz="UTC")
dates
typeof(dates)
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days")
#calculate in terms of July 6, 2020
dtparts = t(as.data.frame(strsplit(grouped_rm$event_date,' ')))
#calculate in terms of July 6, 2020
dtparts = t(as.data.frame(strsplit(grouped_rm$event_date,' ')))
#calculate in terms of July 6, 2020
dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' ')))
View(dtparts)
thetimes = chron(dates=dtparts[,1],times=dtparts[,2],
format=c('y-m-d','h:m:s'))
thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))
#calculate in terms of July 6, 2020
library(chron)
dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' ')))
thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))
typeof(thetimes)
grouped_rm <- grouped_rm |>
mutate(formatted_event_time = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))) |>
mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
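A base-R equivalent of the chron conversion above, as a sketch (assuming event_date strings like "2019-03-14 12:34:56"):
# event_time <- as.POSIXct(grouped_rm$event_date, tz = "UTC")
# grouped_rm$event_delta <- as.numeric(difftime(as.POSIXct("2020-07-06", tz = "UTC"),
#                                               event_time, units = "days"))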
View(grouped_rm)
#test with linear model
grouping_model_rm <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_rm)
summary(grouping_model_rm)
#now doing it for the contrib_data
contrib_dtparts = t(as.data.frame(strsplit(as.character(grouped_contrib$event_date),' ')))
grouped_contrib <- grouped_contrib |>
mutate(formatted_event_time = chron(dates=contrib_dtparts[,1],times=contrib_dtparts[,2], format=c('y-m-d','h:m:s'))) |>
mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
summary(grouping_model_rm)
qqnorm(residuals(grouping_model_rm))
grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
qqnorm(residuals(grouping_model_contrib))
qqnorm(residuals(grouping_model_rm))
qqnorm(residuals(grouping_model_contrib))
issues_expansion <- lm(issue_mmt ~ as.factor(has_contrib) + scaled_age, data=octo_data)
summary(issues_expansion)
govdoc_mmt <- lm(mmt ~ as.factor(has_contrib) + scaled_age, data=octo_data)
summary(govdoc_mmt)
govdoc_mmt <- lm(mmt ~ as.factor(has_readme) + scaled_age, data=octo_data)
summary(govdoc_mmt)
govdoc_issuesmmt <- lm(issue_mmt ~ as.factor(has_readme) + scaled_age, data=octo_data)
summary(govdoc_issuesmmt)
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
summary(mmt_outcome_model)
all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = overall_data)
summary(all_mmt_outcome_model)
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
summary(govdoc_issuesmmt)


@ -10,7 +10,6 @@ overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALS
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)

@@ -61,7 +60,6 @@ overall_data <- overall_data |>
  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)

@@ -74,15 +72,19 @@ wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme +
summary(wiki_mmtmodel1)
qqnorm(residuals(wiki_mmtmodel1))
#these next three are looking at mmt as an outcome of other factors
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
summary(mmt_outcome_model)
#now large MMT model taking into account having contributing or README
mmtmodel2 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=overall_data)
model_summarized <- summary(mmtmodel2)
coef <- coef(mmtmodel2)
qqnorm(residuals(mmtmodel2))
overall_data$predict <- overall_data$mmt * model_summarized$coefficients[2,1] + model_summarized$coefficients[1,1]
overall_data$SE <- predict.lm(mmtmodel2, se.fit = TRUE, level = 0.95)$se.fit
p <- ggplot(data=overall_data, aes(x=mmt, y=underproduction_mean)) +
  geom_ribbon(aes(x=mmt, ymin= predict-SE, ymax = predict+SE), fill = "green") +
  geom_abline(intercept=coef[1], slope = coef[2], color = "red") +
  theme_bw()
p
all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = overall_data)
summary(all_mmt_outcome_model)
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
summary(govdoc_issuesmmt)
library(texreg) #my little "lib"
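One caution on the prediction block above (my observation, not a change to the diff): mmtmodel2 also contains scaled_age, has_readme, and has_contrib, so the hand-built predict column tracks the intercept and mmt term only; predict.lm() returns fitted values for the full model.
# Full-model alternative (sketch):
# pred <- predict.lm(mmtmodel2, se.fit = TRUE)
# overall_data$fitted_full <- pred$fit
# overall_data$SE_full <- pred$se.fit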


@@ -8,9 +8,9 @@ try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
head(readme_df)
aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
aggregate(readme_df[, 3:10], list(readme_df$subdir), median)
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), mean)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)

BIN R/draft_large_lm_plot.png (new file, 51 KiB; binary file not shown)

File diff suppressed because it is too large

File diff suppressed because it is too large