## --- GLMMs of weekly action counts around the documentation event (D) ---
## Consolidated from an interactive session. Earlier attempts in the history
## used `family = Poisson` (capital P, which errors -- the family object is
## `poisson`), omitted the family entirely, and duplicated the ranef
## tidy/rank/plot pipeline and the has_zero() helper three times over.

library(lme4)
library(broom.mixed)
library(dplyr)
library(ggplot2)

# Classify a confidence interval relative to zero:
#   1 = CI spans zero, 0 = CI entirely below zero, 2 = lower bound >= 0.
# NOTE(review): `estimate` is accepted for call-site symmetry but is unused.
has_zero <- function(estimate, low, high) {
  ifelse(low < 0, ifelse(high > 0, 1, 0), 2)
}

# Tidy the by-project random effects for D, rank them by estimate, and plot
# each project's CI colored by its has_zero() grouping.
plot_ranef_D <- function(model) {
  condvals <- broom.mixed::tidy(model, effects = "ran_vals", conf.int = TRUE)
  ranef_D <- condvals[which(condvals$term == "D"), ]
  ranef_D <- ranef_D |>
    mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
    mutate(rank = rank(estimate))
  ranef_D |>
    ggplot(aes(x = rank, y = estimate, col = as.factor(ranef_grouping))) +
    geom_linerange(aes(ymin = conf.low, ymax = conf.high)) +
    theme_bw()
}

# Poisson GLMM. The random slope is reduced to D only: the fuller
# (D * I(week_offset) | upstream_vcs_link) specification had convergence
# trouble in the session.
all_gmodel <- glmer(
  count ~ D * I(week_offset) + scaled_project_age + (D | upstream_vcs_link),
  data = all_actions_data,
  family = poisson
)
summary(all_gmodel)
plot_ranef_D(all_gmodel)

# Overdispersion check: variance far above the mean suggests a negative
# binomial model. (`variance()` is not a base R function -- var() is correct.)
var(all_actions_data$log1p_count)
mean(all_actions_data$log1p_count)

# Negative-binomial GLMM; bobyqa with a raised evaluation cap helps it
# converge on this data.
all_gmodel <- glmer.nb(
  count ~ D * I(week_offset) + scaled_project_age + (D | upstream_vcs_link),
  control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5)),
  data = all_actions_data
)
summary(all_gmodel)
plot_ranef_D(all_gmodel)
## --- Population-change analysis ---
## Expand each project's wide before/after window columns into a long
## timeseries, then fit a before/after mixed model per population type.

library(tidyverse)
library(plyr)    # NOTE(review): loaded after tidyverse, so plyr masks some
                 # dplyr verbs (e.g. summarise); beware when extending.
library(stringr)
library(lme4)
library(optimx)

# Resolve paths relative to this script when run inside RStudio.
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))

# Load in data.
contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")

# Expand one project row: each `*new` count column becomes a row tagged with
# whether the window is after the doc event and whether it counts collaborators.
expand_timeseries <- function(project_row) {
  project_row |>
    pivot_longer(cols = ends_with("new"),
                 names_to = "window",
                 values_to = "count") |>
    unnest(count) |>
    mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
    mutate(is_collab = as.numeric(str_detect(window, "collab")))
}

# Expand every project. (The original grew the frame with rbind() inside a
# loop, which is O(n^2) in rows; building a list and binding once is
# behaviorally equivalent and linear.)
expand_all <- function(df) {
  do.call(rbind, lapply(seq_len(nrow(df)), function(i) expand_timeseries(df[i, ])))
}
expanded_readme_data <- expand_all(readme_df)
expanded_contrib_data <- expand_all(contrib_df)

# Transformed counts. NOTE(review): log(0) is -Inf, so `logcount` is only
# usable on strictly positive counts -- `log1pcount` is the safe variant.
expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
expanded_readme_data$logcount <- log(expanded_readme_data$count)
expanded_contrib_data$logcount <- log(expanded_contrib_data$count)

# Breaking out the types of population counts.
collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1), ]
contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0), ]
collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1), ]
contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0), ]

# Fit, summarize, QQ-check, and persist one before/after model per population.
# NOTE(review): glmer.nb is a count-data family but the response here is
# log1p(count), which is non-integer -- confirm this is intended.
fit_pop_model <- function(data, out_path) {
  model <- glmer.nb(log1pcount ~ after_doc + (after_doc | upstream_vcs_link),
                    data = data)
  print(summary(model))
  qqnorm(residuals(model))
  saveRDS(model, out_path)
  model
}
collab_readme_model <- fit_pop_model(collab_pop_readme, "0510_pop_rm_collab.rda")
contrib_readme_model <- fit_pop_model(contrib_pop_readme, "0510_pop_rm_contrib.rda")
collab_contrib_model <- fit_pop_model(collab_pop_contrib, "0510_pop_contrib_collab.rda")
contrib_contrib_model <- fit_pop_model(contrib_pop_contrib, "0510_pop_contrib_contrib.rda")
library(ggplot2)
## --- Exploratory plots, then primary cross-sectional community metrics ---

# Jittered scatter of counts by before/after, colored by population type.
# (The session repeated the first plot and layered geom_point() under
# geom_jitter(), which double-draws every point; one jitter layer suffices.)
expanded_readme_data |>
  ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) +
  geom_jitter()
expanded_readme_data |>
  ggplot(aes(x = after_doc, y = count, col = as.factor(is_collab))) +
  geom_jitter()

# Primary analysis for cross-sectional community metrics.
overall_data <- read_csv('../final_data/deb_full_data.csv', show_col_types = FALSE)
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)

# MMT weights collaborators twice relative to contributors.
overall_data$mmt <- (((overall_data$collaborators * 2) + overall_data$contributors) /
  (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)

# Age buckets; break points came from quantile(age_of_project / 365).
overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
overall_data$scaled_age <- scale(overall_data$age_of_project)

mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data = overall_data)
summary(mmtmodel1)
qqnorm(residuals(mmtmodel1))

# Octo (GitHub API) data: same MMT plus issue- and wiki-specific variants.
octo_data$scaled_age <- scale(octo_data$age_of_project)
octo_data$mmt <- (((octo_data$collaborators * 2) + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators))
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) +
  (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
# (Dropped `g4 <- ggplot(octo_data); g4` from the session -- with no aes or
# geom it only rendered an empty panel.)
## --- Octo-data models with documentation indicators ---
## Consolidated from the session's trial-and-error. Removed: a
## `sqrt_issue_mmtmodel1` fit on a `sqrt_issue_mmt` column that was never
## created (errors), formulas/cor.tests referencing a non-existent
## `has_contributing` column (errors), a `filter(octo_data, total_contrib == 0)`
## that kept exactly the rows that should be dropped (later corrected to `!=`),
## a `texreg()` call made before `library(texreg)`, a wrong
## '../051224_readme_grouped.csv' path, and a join on the wrong key before the
## `level` column was renamed. The working final forms are kept below.

library(texreg)

# Clean octo data: MMT is undefined when total_contrib == 0 (division by
# zero), so re-load and remove those projects before recomputing the metrics.
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
octo_data <- filter(octo_data, total_contrib != 0)
octo_data$scaled_age <- scale(octo_data$age_of_project)
octo_data$mmt <- (((octo_data$collaborators * 2) + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators))
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
# right skewed data, need to transform
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) +
  (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)

# Overlap between octo projects and the README / CONTRIBUTING DiD rosters.
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
octo_data <- octo_data |>
  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))

# Underproduction models, controlling for age and documentation presence.
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib,
                     data = octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib,
                      data = octo_data)
summary(issue_mmtmodel1)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib,
                     data = octo_data)
summary(wiki_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
qqnorm(residuals(wiki_mmtmodel1))

texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1),
       stars = NULL, digits = 2,
       custom.model.names = c('M1: MMT', 'M2: issue contrib.', 'M3: wiki contrib.'),
       custom.coef.names = c('(Intercept)', 'MMT', 'scaled_age', 'has readme',
                             'has contrib', 'Issue MMT', 'Wiki MMT'),
       use.packages = FALSE, table = FALSE, ci.force = TRUE)

# Correlations among the metrics and the documentation indicators.
cor.test(octo_data$mmt, octo_data$issue_mmt)
cor.test(octo_data$mmt, octo_data$wiki_mmt)
cor.test(octo_data$mmt, octo_data$has_readme)
cor.test(octo_data$has_readme, octo_data$has_contrib)

# MMT metrics as outcomes of documentation presence.
mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(wiki_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
issues_expansion <- lm(issue_mmt ~ has_readme + scaled_age, data = octo_data)
summary(issues_expansion)
issues_expansion <- lm(issue_mmt ~ has_contrib + scaled_age, data = octo_data)
summary(issues_expansion)

# Same documentation indicators on the overall data.
overall_data <- overall_data |>
  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
all_mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = overall_data)
summary(all_mmt_outcome_model)

# Pulling in the group data for the ranef coefficients. The files sit next to
# this script (not ../); `level` holds the upstream VCS link and is renamed so
# the join key matches overall_data.
rm_grouping <- read_csv('051224_readme_grouped.csv', show_col_types = FALSE) |>
  rename(upstream_vcs_link = level)
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE) |>
  rename(upstream_vcs_link = level)
grouped_rm <- left_join(rm_grouping, overall_data, by = "upstream_vcs_link")
grouped_contrib <- left_join(contrib_grouping, overall_data, by = "upstream_vcs_link")

# Analyses: do the groupings / estimates track underproduction?
cor.test(grouped_rm$underproduction_mean, grouped_rm$ranef_grouping)
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$ranef_grouping)
cor.test(grouped_rm$underproduction_mean, grouped_rm$estimate)
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$estimate)

# Test with linear models.
grouping_model_rm <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data = grouped_rm)
summary(grouping_model_rm)
grouping_model_contrib <- lm(underproduction_mean ~ estimate + mmt + scaled_age,
                             data = grouped_contrib)
summary(grouping_model_contrib)
grouping_model <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data = grouped_rm)
summary(grouping_model)
grouping_model_contrib <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age,
                             data = grouped_contrib)
summary(grouping_model_contrib)
## --- Factored groupings, time-since-event, and doc-presence models ---
## Consolidated from the session. Removed: `typeof(event_date)` on an object
## that does not exist (errors), `cor.test` on a factor column (errors), a
## `glm.nb()` attempt on `underproduction_mean` (a continuous response, not a
## count -- MASS::glm.nb rejects it), three abandoned date-parsing attempts,
## and a chron() call containing a stray console-continuation `+` that is a
## syntax error in a script.

summary(grouping_model_contrib)

# Re-derive the groupings with the ranef grouping as a factor.
rm_grouping <- read_csv('051224_readme_grouped.csv', show_col_types = FALSE) |>
  rename(upstream_vcs_link = level) |>
  mutate(factored_group = as.factor(ranef_grouping))
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE) |>
  rename(upstream_vcs_link = level) |>
  mutate(factored_group = as.factor(ranef_grouping))
grouped_rm <- left_join(rm_grouping, overall_data, by = "upstream_vcs_link")
grouped_contrib <- left_join(contrib_grouping, overall_data, by = "upstream_vcs_link")

# Test with linear models using the factored grouping.
grouping_model_rm <- lm(underproduction_mean ~ factored_group + mmt + scaled_age,
                        data = grouped_rm)
summary(grouping_model_rm)
grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age,
                             data = grouped_contrib)
summary(grouping_model_contrib)
qqnorm(residuals(grouping_model_rm))
qqnorm(residuals(grouping_model_contrib))

# Attach event dates and express them in days before the 2020-07-06 cutoff.
rm_did <- read_csv('../final_data/deb_readme_did.csv', show_col_types = FALSE)
contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE)
grouped_rm <- left_join(grouped_rm, rm_did, by = "upstream_vcs_link")
grouped_contrib <- left_join(grouped_contrib, contrib_did, by = "upstream_vcs_link")

library(chron)

# event_date arrives as "YYYY-MM-DD HH:MM:SS" text; split date and time parts
# and parse with chron, then take the difference from the cutoff in days.
add_event_delta <- function(df) {
  dtparts <- t(as.data.frame(strsplit(as.character(df$event_date), ' ')))
  df |>
    mutate(formatted_event_time = chron(dates = dtparts[, 1], times = dtparts[, 2],
                                        format = c('y-m-d', 'h:m:s'))) |>
    mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time,
                                  units = "days"))
}
grouped_rm <- add_event_delta(grouped_rm)
grouped_contrib <- add_event_delta(grouped_contrib)

# Interaction of time-since-event with the grouping.
grouping_model_rm <- lm(underproduction_mean ~ event_delta * factored_group + mmt + scaled_age,
                        data = grouped_rm)
summary(grouping_model_rm)
qqnorm(residuals(grouping_model_rm))
grouping_model_contrib <- lm(underproduction_mean ~ event_delta * factored_group + mmt + scaled_age,
                             data = grouped_contrib)
summary(grouping_model_contrib)
qqnorm(residuals(grouping_model_contrib))

# Does having documentation predict the MMT metrics?
issues_expansion <- lm(issue_mmt ~ as.factor(has_contrib) + scaled_age, data = octo_data)
summary(issues_expansion)
govdoc_mmt <- lm(mmt ~ as.factor(has_contrib) + scaled_age, data = octo_data)
summary(govdoc_mmt)
govdoc_mmt <- lm(mmt ~ as.factor(has_readme) + scaled_age, data = octo_data)
summary(govdoc_mmt)
govdoc_issuesmmt <- lm(issue_mmt ~ as.factor(has_readme) + scaled_age, data = octo_data)
summary(govdoc_issuesmmt)
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib),
                        data = octo_data)
summary(mmt_outcome_model)
all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib),
                            data = overall_data)
summary(all_mmt_outcome_model)
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib),
                       data = octo_data)
summary(govdoc_issuesmmt)