From 38e845ec90c107a2417df276d6240cadc28ec412 Mon Sep 17 00:00:00 2001 From: mjgaughan Date: Sun, 12 May 2024 16:17:52 -0500 Subject: [PATCH] updated cross-sectional analyses --- R/.Rhistory | 816 +++++++++++++++++++++++------------------------ R/GovRiskPower.R | 53 ++- 2 files changed, 459 insertions(+), 410 deletions(-) diff --git a/R/.Rhistory b/R/.Rhistory index 8b90ad9..c51bcfc 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,412 +1,4 @@ theme_bw() -wo_df_ranef |> -ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) + -geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) + -theme_bw() -# this is the file with the lmer multi-level rddAnalysis -library(tidyverse) -library(plyr) -# 0 loading the readme data in -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -# 1 preprocessing -#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") -readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") -readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") -readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -readme_df = readme_df[,!(names(readme_df) %in% drop)] -# 2 some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -longer$count <- as.numeric(longer$count) -#longer <- longer[which(longer$observation_type == "all"),] -return(longer) -} -expanded_data <- expand_timeseries(readme_df[1,]) -for (i in 2:nrow(readme_df)){ -expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,])) -} -#filter out the windows of time that we're looking at -window_num <- 8 -windowed_data <- expanded_data |> -filter(week >= (27 - window_num) & week <= (27 + window_num)) |> -mutate(D = ifelse(week > 27, 1, 0)) -#scale the age numbers -windowed_data$scaled_project_age <- scale(windowed_data$age_of_project) -windowed_data$week_offset <- windowed_data$week - 27 -#separate out the cleaning d -all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] -mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),] -#find some EDA to identify which types of models might be the best for this -hist(log(all_actions_data$count)) -all_actions_data$logged_count <- log(all_actions_data$count) -all_actions_data$log1p_count <- log1p(all_actions_data$count) -# 3 rdd in lmer analysis -# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design -# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc -library(lme4) -# 
https://www.bristol.ac.uk/cmm/learning/videos/random-intercepts.html#exvar -library(optimx) -library(lattice) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl( -optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -summary(all_model) -#identifying the quartiles of effect for D -all_model_ranef <- ranef(all_model, condVar=TRUE) -dotplot(all_model_ranef) -df_ranefs <- as.data.frame(all_model_ranef) -D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),] -#below this groups the ranefs -has_zero <- function(condval, condsd){ -bounds <- condsd * 1.96 -return(ifelse(((condval - bounds) < 0),ifelse(((condval + bounds) > 0), 1, 0), 2)) -} -df_ranefs <- df_ranefs |> -mutate(ranef_grouping = has_zero(condval, condsd)) |> -mutate(rank = rank(condval)) -D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),] -hist(D_df_ranef$ranef_grouping) -D_df_ranef |> -ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) + -geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) + -geom_bw() -#plot the ranefs -library(ggplot2) -D_df_ranef |> -ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) + -geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) + -geom_bw() -D_df_ranef |> -ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) + -geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) + -theme_bw() -#identifying the quartiles of effect for D -all_model_ranef <- ranef(all_model, condVar=TRUE) -dotplot(all_model_ranef) -df_ranefs <- as.data.frame(all_model_ranef) -#below this groups the ranefs -has_zero <- function(condval, condsd){ -bounds <- condsd * 1.96 -return(ifelse(((condval - bounds) < 0),ifelse(((condval + bounds) > 0), 1, 0), 2)) -} -df_ranefs <- df_ranefs |> -mutate(ranef_grouping = has_zero(condval, condsd)) |> -mutate(rank = rank(condval)) -D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),] -D_df_ranef |> -ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) + -geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) + -theme_bw() -D_df_ranefs <- D_df_ranefs |> -mutate(rank = rank(condval)) -D_df_ranef <- D_df_ranef |> -mutate(rank = rank(condval)) -D_df_ranef |> -ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) + -geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) + -theme_bw() -#identifying the quartiles of effect for D -all_model_blup <- blup(all_model) -all_model_ranef <- ranef(all_model) -View(all_model_ranef) -df_ranefs <- as.data.frame(all_model_ranef) -dotplot(all_model_ranef) -#identifying the quartiles of effect for D -all_model_coef <- coef(all_model) -View(all_model_coef) -D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),] -D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),] -View(D_df_ranef) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl( -optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -all_model_ranef <- ranef(all_model) -df_ranefs <- as.data.frame(all_model_ranef) -D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),] -View(D_df_ranef) -#identifying the quartiles of effect for D -all_model_variances <- postVar(all_model) -#identifying the quartiles of effect for D -all_model_variances <- vcov(all_model, 
condVar=TRUE) -View(all_model_variances) -print(all_model_variances) -View(all_model_variances) -conditional_variances_random <- lapply(all_model_variances, diag) -dotplot(conditional_variances_random) -dotplot(conditional_variances_random, -col = "blue", -pch = 19, -main = "Conditional Variances of Random Effects", -xlab = "Conditional Variance", -ylab = "Random Effect", -scales = list(x = list(log = TRUE)), -auto.key = list(space = "right")) -#identifying the quartiles of effect for D -all_model_variances <- vcov(all_model, full=TRUE, condVar=TRUE) -View(all_model_variances) -summary(all_model) -#identifying the quartiles of effect for D -all_model_variances <- vcov(all_model, full=TRUE, condVar=TRUE) -View(all_model_variances) -#identifying the quartiles of effect for D -all_model_variances <- varCorr(all_model) -#identifying the quartiles of effect for D -all_model_variances <- VarCorr(all_model) -View(all_model_variances) -View(conditional_variances_random) -View(all_model_variances) -attr(VarCorr(all_model)$upstream_vcs_link, "stddevs")^2 -values <- attr(VarCorr(all_model)$upstream_vcs_link, "stddevs")^2 -#identifying the quartiles of effect for D -all_model_variances <- vcov(all_model) -View(all_model_variances) -print(all_model_variances) -all_model_ranef <- ranef(all_model)$upstream_vcs_link -View(all_model_ranef) -all_model_ranef <- cov(ranef(all_model)) -random_effects <- ranef(all_model) -random_effects_variances <- lapply(random_effects$upstream_vcs_link, function(x) { -variances <- var(x$D:I(week_offset)) -return(variances) -}) -variances <- var(x$D) -summary_of_all <- summary(all_model) -#identifying the quartiles of effect for D -variance_components <- summary_of_all$varcor -View(variance_components) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl( -optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -#identifying the quartiles of effect for D -varcorr_of_all <- VarCorr(all_model) -View(varcorr_of_all) -print(varcorr_of_all) -all_coefficients <- coef(all_model) -all_standard_errors <- sqrt(diag(vcov(all_model))) -all_conf_intervals <- cbind(coefficients - 1.96 * standard_errors, -coefficients + 1.96 * standard_errors) -all_conf_intervals <- cbind(all_coefficients - 1.96 * all_standard_errors, -all_coefficients + 1.96 * all_standard_errors) -View(all_coefficients) -View(conditional_variances_random) -View(all_coefficients) -#identifying the quartiles of effect for D -confint(all_model) -all_coefficients <- coef(all_model) -all_standard_errors <- sqrt(diag(vcov(all_model)))[3] -all_standard_errors <- sqrt(diag(vcov(all_model))) -all_standard_errors <- sqrt(diag(vcov(all_model)))[4] -all_standard_errors <- sqrt(diag(vcov(all_model)))[5] -all_standard_errors <- sqrt(diag(vcov(all_model)))[6] -all_standard_errors <- sqrt(diag(vcov(all_model)))[1] -#identifying the quartiles of effect for D -all_model_ranef <- ranef(all_model, condVar=TRUE) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -all_model_ranef <- ranef(all_model, condVar = FALSE) -View(all_model_ranef) -View(all_model_ranef_condvar) -dotplot(all_model_ranef) -dotplot(all_model_ranef_condvar) -View(all_model_ranef_condvar) -all_model_ranef_condvar[["upstream_vcs_link"]][["D"]] -View(all_model_ranef) -all_model_ranef_condvar$upstream_vcs_link -all_model_ranef_condvar$upstream_vcs_link$D -conditional_variances <- 
diag(vcov(model)$upstream_vcs_link$D) -conditional_variances <- diag(vcov(all_model)$upstream_vcs_link$D) -conditional_variances <- diag(vcov(all_model)) -conditional_variances <- vcov(all_model) -View(conditional_variances) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- var(ranef(all_model, condVar = TRUE)) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- var(ranef(all_model, condVar = TRUE)$upstream_vcs_link$D) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)$upstream_vcs_link$D -#identifying the quartiles of effect for D -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -View(all_model_ranef_condvar) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -View(all_model_ranef_condvar) -attr(all_model_ranef_condvar$upstream_vcs_link$D, "condVar") -attr(all_model_ranef_condvar$upstream_vcs_link, "condVar") -df_ranefs <- as.data.frame(all_model_ranef_condvar) -View(df_ranefs) -View(all_model_ranef_condvar) -#all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl( -# optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -attr(all_model_ranef_condvar$upstream_vcs_link, "condVar") -#all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl( -# optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=TRUE) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -attr(all_model_ranef_condvar$upstream_vcs_link, "condVar") -df_ranefs <- as.data.frame(all_model_ranef_condvar) -View(df_ranefs) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -View(all_model_ranef_condvar) -all_model_ranef <- ranef(all_model, condVar = FALSE) -View(all_model_ranef_condvar) -View(all_model_ranef_condvar[["upstream_vcs_link"]]) -all_model_ranef_condvar[["upstream_vcs_link"]][["D"]] -View(all_model_ranef) -df_rn_no_cv <- as.data.frame(all_model_ranef) -View(df_rn_no_cv) -View(df_ranefs) -attr(all_model_ranef_condvar$upstream_vcs_link, "postVar") -attr(all_model_ranef_condvar$upstream_vcs_link$D, "postVar") -attr(all_model_ranef_condvar$upstream_vcs_link, "postVar") -attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")[[4]] -attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")[[3]] -attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")[[2]] -attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")[4] -attr(all_model_ranef_condvar$upstream_vcs_link, "postVar") -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl( -optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -isSingular(all_model) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (week_offset| upstream_vcs_link), 
data=all_actions_data, REML=FALSE, control = lmerControl( -optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (week_offset| upstream_vcs_link), data=all_actions_data, REML=FALSE) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (I:(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D| upstream_vcs_link), data=all_actions_data, REML=FALSE) -summary_of_all <- summary(all_model) -summary(all_model) -#identifying the quartiles of effect for D -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -attr(all_model_ranef_condvar$upstream_vcs_link, "postVar") -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl( -optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -# this is the file with the lmer multi-level rddAnalysis -library(tidyverse) -library(plyr) -# 0 loading the readme data in -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -# 1 preprocessing -#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") -readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") -readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") -readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -readme_df = readme_df[,!(names(readme_df) %in% drop)] -# 2 some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -longer$count <- as.numeric(longer$count) -#longer <- longer[which(longer$observation_type == "all"),] -return(longer) -} -expanded_data <- expand_timeseries(readme_df[1,]) -for (i in 2:nrow(readme_df)){ -expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,])) -} -#filter out the windows of time that we're looking at -window_num <- 8 -windowed_data <- expanded_data |> -filter(week >= (27 - window_num) & week <= (27 + window_num)) |> -mutate(D = ifelse(week > 27, 1, 0)) -#scale the age numbers -windowed_data$scaled_project_age <- scale(windowed_data$age_of_project) -windowed_data$week_offset <- windowed_data$week - 27 -#separate out the cleaning d -all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] -mrg_actions_data <- 
windowed_data[which(windowed_data$observation_type == "mrg"),] -all_actions_data$log1p_count <- log1p(all_actions_data$count) -# 3 rdd in lmer analysis -# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design -# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc -library(lme4) -# https://www.bristol.ac.uk/cmm/learning/videos/random-intercepts.html#exvar -library(optimx) -library(lattice) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl( -optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -#identifying the quartiles of effect for D -mmcm = coef(all_model)$upstream_vcs_link[, 1] -vcov.vals = as.data.frame(VarCorr(all_model)) -View(vcov.vals) -#identifying the quartiles of effect for D -mmcm = coef(all_model)$upstream_vcs_link -View(mmcm) -summary(all_model)$coef[,2] -View(mmcm) -variance_components <- VarCorr(all_model) -group_variance <- attr(variance_components$upstream_vcs_link, "stddev")^2 -View(mmcm) -fixef(all()) -fixef(all_model -summary(all_model)$coef[,2] -fixef(all_model) -fixed_impacts = fixef(all_model) -dotplot(all_model_ranef_condvar) -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -dotplot(all_model_ranef_condvar) -broom.mixed::tidy(all_model, effects = "ran_vals", conf.int = TRUE) -test <- broom.mixed::tidy(all_model, effects = "ran_vals", conf.int = TRUE) -View(test) -all_gmodel <- glmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = Gamma) -all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = Gamma) -all_gmodel <- glmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family=poisson) -all_gmodel <- glmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family=poisson) -all_gmodel <- glmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family=binomial) -all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family=binomial) -df_ranefs <- as.data.frame(all_model_ranef_condvar) -all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family=binomial) -all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (1 | upstream_vcs_link), data=all_actions_data, family=poisson) -all_model_ranef_condvar <- ranef(all_gmodel, condVar = TRUE) -all_model_ranef_condvar <- ranef(all_model, condVar = TRUE) -all_gmodel_ranef_condvar <- ranef(all_gmodel, condVar = TRUE) -View(all_gmodel_ranef_condvar) -test <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE) -View(test) -all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)+ scaled_project_age | upstream_vcs_link), data=all_actions_data) -test <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE) -View(test) -summary(all_gmodel) -all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=TRUE, control = lmerControl( -optimizer ='optimx', optCtrl=list(method='L-BFGS-B'))) -test <- broom.mixed::tidy(all_model, effects = "ran_vals", conf.int = TRUE) -View(test) 
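The long stretch above tries several routes to the conditional variances of the per-project random effects for D (postVar attributes, vcov(), VarCorr()) before settling on broom.mixed::tidy() with effects = "ran_vals". A minimal consolidated sketch of that final step, assuming the all_model fit from above and the same zero-crossing coding as the has_zero() helper:

library(broom.mixed)
library(dplyr)
library(ggplot2)

# Per-project random effects of D with 95% conditional intervals.
ranef_D <- broom.mixed::tidy(all_model, effects = "ran_vals", conf.int = TRUE) |>
  filter(term == "D") |>
  mutate(
    # Same coding as has_zero(): 1 = interval crosses zero,
    # 0 = entirely below zero, 2 = entirely above zero.
    ranef_grouping = case_when(
      conf.low < 0 & conf.high > 0 ~ 1,
      conf.high <= 0               ~ 0,
      TRUE                         ~ 2
    ),
    rank = rank(estimate)
  )

ranef_D |>
  ggplot(aes(x = rank, y = estimate, col = as.factor(ranef_grouping))) +
  geom_linerange(aes(ymin = conf.low, ymax = conf.high)) +
  theme_bw()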
-test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE) -View(test_condvals) -test_glmer_ranef_D <- test_condvals [which(test_condvals $term == "D"),] -View(test_glmer_ranef_D) -test_glmer_ranef_D <- test_condvals [which(test_condvals $term == "D"),] -has_zero <- function(estimate, low, high){ -return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2)) -} -test_glmer_ranef_D <- test_glmer_ranef_D |> -mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |> -mutate(rank = rank(estimate)) -test_glmer_ranef_D |> -ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) + -geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + -theme_bw() test_glmer_ranef_D |> ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + @@ -510,3 +102,411 @@ all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(w # control=glmerControl(optimizer="bobyqa", # optCtrl=list(maxfun=2e5)), data=all_actions_data) all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data) +library(tidyverse) +library(plyr) +library(stringr) +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +#load in data +contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") +readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") +#some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = ends_with("new"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +mutate(after_doc = as.numeric(str_detect(window, "after"))) |> +mutate(is_collab = as.numeric(str_detect(window, "collab"))) +return(longer) +} +expanded_readme_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) +} +expanded_contrib_data <- expand_timeseries(contrib_df[1,]) +for (i in 2:nrow(contrib_df)){ +expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) +} +expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) +expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) +expanded_readme_data$logcount <- log(expanded_readme_data$count) +expanded_contrib_data$logcount <- log(expanded_contrib_data$count) +#breaking out the types of population counts +collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] +contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] +collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] +contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] +#import models +library(lme4) +library(optimx) +collab_readme_model <- lmer(log1pcount ~ after_doc + (1| upstream_vcs_link), data=collab_pop_readme, REML=FALSE) +collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (1| upstream_vcs_link), data=collab_pop_readme) +summary(collab_readme_model) +crm_residuals <- residuals(collab_readme_model) +qqnorm(crm_residuals) +collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_readme) +summary(collab_readme_model) +crm_residuals <- residuals(collab_readme_model) +qqnorm(crm_residuals) +saveRDS(collab_readme_model, "0510_pop_rm_collab.rda") +contrib_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| 
upstream_vcs_link), data=contrib_pop_readme) +summary(contrib_readme_model) +saveRDS(contrib_readme_model, "0510_pop_rm_contrib.rda") +collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_contrib) +summary(collab_contrib_model) +saveRDS(collab_contrib_model, "0510_pop_contrib_collab.rda") +contrib_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_contrib) +summary(contrib_contrib_model) +saveRDS(contrib_contrib_model, "0510_pop_contrib_contrib.rda") +summary(collab_readme_model) +summary(contrib_readme_model) +qqnorm(crm_residuals) +conrm_residuals <- residuals(contrib_readme_model) +qqnorm(conrm_residuals) +summary(collab_contrib_model) +summary(contrib_contrib_model) +library(ggplot2) +expanded_readme_data |> +ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) + +geom_point() + geom_jitter() +expanded_readme_data |> +ggplot(aes(x = after_doc, y = count, col = as.factor(is_collab))) + +geom_point() + geom_jitter() +expanded_readme_data |> +ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) + +geom_point() + geom_jitter() +#primary analysis for cross-sectional community metrics +overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE) +octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) +readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE) +contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE) +overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) +mean(overall_data$mmt) +hist(overall_data$mmt, probability = TRUE) +#age_vector <- overall_data$age_of_project/365 +#quantile(age_vector) +overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) +table(overall_data$new.age) +overall_data$new.age.factor <- as.factor(overall_data$new.age) +overall_data$scaled_age <- scale(overall_data$age_of_project) +mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data) +summary(mmtmodel1) +qqnorm(residuals(mmtmodel1)) +summary(mmtmodel1) +octo_data$scaled_age <- scale(octo_data$age_of_project) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) +g4 <- ggplot(octo_data) +g4 +#below are the models for the octo data, there should be analysis for each one +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data) +summary(octo_mmtmodel1) +#below are the models for the octo data, there should be analysis for each one +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=octo_data) +summary(octo_mmtmodel1) +issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data) +summary(issue_mmtmodel1) +sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + scaled_age, data=octo_data) +wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age, data=octo_data) +summary(wiki_mmtmodel1) +qqnorm(residuals(issue_mmtmodel1)) 
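The mmt measure computed above weights collaborators twice and contributors once, then divides by the total head count, so it runs from 1 (contributors only) to 2 (collaborators only); issue_mmt and wiki_mmt follow the same pattern with issue or wiki contributors weighted against total_contrib. A small illustrative helper (compute_mmt is not a function in the original scripts) restating that arithmetic:

# Illustrative restatement of the mmt formula used above.
compute_mmt <- function(collaborators, contributors) {
  (2 * collaborators + contributors) / (collaborators + contributors)
}

compute_mmt(collaborators = 0,  contributors = 10)  # 1.0  (contributors only)
compute_mmt(collaborators = 3,  contributors = 7)   # 1.3
compute_mmt(collaborators = 10, contributors = 0)   # 2.0  (collaborators only)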
+qqnorm(residuals(wiki_mmtmodel1)) +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ), +custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +library(texreg) #my little "lib" +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ), +custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ), +custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'scaled_age', 'Wiki'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +summary(octo_mmtmodel1) +summary(wiki_mmtmodel1) +#left skewed data, need to transform +sum(is.na(octo_data$wiki_mmt)) +#left skewed data, need to transform +sum(is.na(octo_data$issue_mmt)) +#left skewed data, need to transform +sum(is.na(octo_data$mmt)) +test_frame <- na.omit(octo_data) +#left skewed data, need to transform +sum(is.na(octo_data$issue_contrib_count)) +#left skewed data, need to transform +sum(is.na(octo_data$wiki_contrib_count)) +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +#left skewed data, need to transform +typeof(octo_data$wiki_contrib_count) +View(octo_data) +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$api_contrib_count + octo_data$file_contrib_count + octo_data$wiki_contrib_count)) / (octo_data$api_contrib_count + octo_data$file_contrib_count + octo_data$wiki_contrib_count + octo_data$issue_contrib_count) +sum(is.na(octo_data$issue_mmt)) +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +sum(is.na(octo_data$issue_mmt)) +sum(octo_data$total_contrib == 0) +#clean octo data +octo_data <- filter(octo_data, total_contrib == 0) +sum(octo_data$total_contrib == 0) +octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) +#clean octo data +octo_data <- filter(octo_data, total_contrib != 0) +octo_data$scaled_age <- scale(octo_data$age_of_project) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +#right skewed data, need to transform +octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) +#below are the models for the octo data, there should be analysis for each one +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=octo_data) +summary(octo_mmtmodel1) +issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data) +issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data) +qqnorm(residuals(issue_mmtmodel1)) +sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + scaled_age, data=octo_data) +wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age, data=octo_data) 
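The NA chase above bottoms out in projects with total_contrib == 0, which the corrected filter(total_contrib != 0) drops before the ratios are formed (issue_mmt algebraically reduces to 1 + issue_contrib_count / total_contrib). A compact sketch of that cleaning step, assuming the same column names as above:

library(readr)
library(dplyr)

octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) |>
  filter(total_contrib != 0) |>   # zero-contributor rows turn the ratios into NaN
  mutate(
    scaled_age = scale(age_of_project),
    mmt        = (2 * collaborators + contributors) / (collaborators + contributors),
    issue_mmt  = (2 * issue_contrib_count + (total_contrib - issue_contrib_count)) / total_contrib,
    wiki_mmt   = (2 * wiki_contrib_count  + (total_contrib - wiki_contrib_count))  / total_contrib
  )

stopifnot(!anyNA(octo_data$issue_mmt))  # the check done above with sum(is.na(...))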
+summary(wiki_mmtmodel1) +qqnorm(residuals(wiki_mmtmodel1)) +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ), +custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'scaled_age', 'Wiki'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ), +custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'Issue MMT', 'Wiki MMT'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +qqnorm(residuals(wiki_mmtmodel1)) +View(octo_data) +#TODO: find the overlap between projects with octo data and projects with readmes or contributings +readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE) +contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE) +octo_data |> +mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) +View(octo_data) +octo_data <- octo_data |> +mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> +mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) +View(octo_data) +#below here is the analysis for the readme.md data +cor.test(octo_data$mmt, octo_data$has_readme) +cor.test(octo_data$mmt, octo_data$has_contributing) +cor.test(octo_data$mmt, octo_data$has_contrib) +issues_expansion <- lm(issue_mmt ~ has_readme + scaled_age, data=octo_data) +summary(issues_expansion) +issues_expansion <- lm(issue_mmt ~ has_contrib + scaled_age, data=octo_data) +summary(issues_expansion) +#below here is the analysis for the readme.md data +cor.test(octo_data$mmt, octo_data$scaled_age) +#below here is the analysis for the readme.md data +cor.test(octo_data$mmt, octo_data$scaled_age) +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contributing, data=octo_data) +octo_data <- octo_data |> +mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> +mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contributing, data=octo_data) +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data) +summary(octo_mmtmodel1) +issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data) +summary(issue_mmtmodel1) +wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data) +summary(wiki_mmtmodel1) +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' 
), +custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +qqnorm(residuals(issue_mmtmodel1)) +qqnorm(residuals(wiki_mmtmodel1)) +#below here is the analysis for the readme.md data +cor.test(octo_data$mmt, octo_data$issue_mmt) +#below here is the analysis for the readme.md data +cor.test(octo_data$mmt, octo_data$wiki_mmt) +#below here is the analysis for the readme.md data +cor.test(octo_data$mmt, octo_data$has_readme) +cor.test(octo_data$has_readme, octo_data$has_contrib) +library(readr) +library(ggplot2) +library(tidyverse) +#primary analysis for cross-sectional community metrics +overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE) +octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) +readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE) +contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE) +overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) +mean(overall_data$mmt) +hist(overall_data$mmt, probability = TRUE) +mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data) +overall_data$scaled_age <- scale(overall_data$age_of_project) +mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data) +summary(mmtmodel1) +#clean octo data +octo_data <- filter(octo_data, total_contrib != 0) +octo_data$scaled_age <- scale(octo_data$age_of_project) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +#right skewed data, need to transform +octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data) +#find the overlap between projects with octo data and projects with readmes or contributings +readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE) +contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE) +octo_data <- octo_data |> +mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> +mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data) +summary(octo_mmtmodel1) +mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data) +summary(mmt_outcome_model) +mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data) +summary(mmt_outcome_model) +mmt_outcome_model <- lm(wiki_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data) +summary(mmt_outcome_model) +mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data) +summary(mmt_outcome_model) +mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data) +summary(mmt_outcome_model) +overall_data <- overall_data |> +mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> 
+mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) +all_mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = overall_data) +summary(all_mmt_outcome_model) +#pulling in the group data for the ranef coefficients +rm_grouping <- read_csv('../051224_readme_grouped.csv',show_col_types = FALSE) +#pulling in the group data for the ranef coefficients +rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE) +contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE) +View(contrib_grouping) +View(rm_grouping) +View(readme_did_roster) +grouped_rm <- left_join(rm_grouping, overall_data, by = c("level","upstream_vcs_link")) +rm_grouping <- rm_grouping |> +rename(upstream_vcs_link = level) +View(rm_grouping) +grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link") +View(grouped_rm) +contrib_grouping <- contrib_grouping |> +rename(upstream_vcs_link = level) +grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link") +View(grouped_rm) +#analyses +cor.test(grouped_rm$mmt, grouped_rm$ranef_grouping) +cor.test(grouped_contrib$mmt, grouped_contrib$ranef_grouping) +#analyses +cor.test(grouped_rm$underproduction_mean, grouped_rm$ranef_grouping) +cor.test(grouped_contrib$underproduction_mean, grouped_contrib$ranef_grouping) +#analyses +cor.test(grouped_rm$underproduction_mean, grouped_rm$estimate) +cor.test(grouped_contrib$underproduction_mean, grouped_contrib$estimate) +View(grouped_rm) +#test with linear model +grouping_model <- lm(underproduction_mean ~ estimate + scaled_age, data=grouped_rm) +summary(grouping_model) +#test with linear model +grouping_model <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_rm) +summary(grouping_model) +#test with linear model +grouping_model <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_rm) +summary(grouping_model) +grouping_model_contrib <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_contrib) +summary(grouping_model_contrib) +#test with linear model +grouping_model_rm <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_rm) +summary(grouping_model_rm) +grouping_model_contrib <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_contrib) +summary(grouping_model_contrib) +#test with linear model +grouping_model_rm <- glm.nb(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_rm) +#pulling in the group data for the ranef coefficients +rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE) +contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE) +rm_grouping <- rm_grouping |> +rename(upstream_vcs_link = level)|> +mutate(factored_group = as.factor(ranef_grouping)) +contrib_grouping <- contrib_grouping |> +rename(upstream_vcs_link = level) |> +mutate(factored_group = as.factor(ranef_grouping)) +grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link") +grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link") +#analyses +cor.test(grouped_rm$underproduction_mean, grouped_rm$factored_group) +#test with linear model +grouping_model_rm <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_rm) +summary(grouping_model_rm) +grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_contrib) +summary(grouping_model_contrib) 
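The grouping merge above is re-run a few times before it settles; a consolidated sketch of that step, assuming the 051224_readme_grouped.csv export carries the level, estimate, and ranef_grouping columns used above. cor.test() needs numeric vectors, so the numeric ranef_grouping coding goes to the correlation while the lm() uses the factor:

library(readr)
library(dplyr)

rm_grouping <- read_csv('051224_readme_grouped.csv', show_col_types = FALSE) |>
  rename(upstream_vcs_link = level) |>
  mutate(factored_group = as.factor(ranef_grouping))

grouped_rm <- left_join(rm_grouping, overall_data, by = "upstream_vcs_link")

# Correlation on the numeric coding (a factor errors out in cor.test).
cor.test(grouped_rm$underproduction_mean, grouped_rm$ranef_grouping)

# Regression on the factor coding, adjusting for mmt and project age.
grouping_model_rm <- lm(underproduction_mean ~ factored_group + mmt + scaled_age,
                        data = grouped_rm)
summary(grouping_model_rm)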
+summary(grouping_model_rm) +grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_contrib) +summary(grouping_model_contrib) +qqnorm(residuals(grouping_model_rm)) +qqnorm(residuals(grouping_model_contrib)) +rm_did <- read_csv('../final_data/deb_readme_did.csv',show_col_types = FALSE) +contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE) +grouped_rm <- left_join(grouped_rm, rm_did, by="upstream_vcs_link") +grouped_contrib <- left_join(grouped_contrib, contrib_did, by="upstream_vcs_link") +#calculate in terms of July 6, 2020 +typeof(event_date) +#calculate in terms of July 6, 2020 +typeof(grouped_rm$event_date) +#calculate in terms of July 6, 2020 +typeof(as.Date(grouped_rm$event_date)) +how_long_has_file <- as.Date("2020-07-06") - as.Date(grouped_rm$event_date) +how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date)) +how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days") +#calculate in terms of July 6, 2020 +grouped_rm$event_date +#calculate in terms of July 6, 2020 +dates <- as.POSIXct(grouped_rm$event_date,tz="UTC") +dates +typeof(dates) +how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days") +#calculate in terms of July 6, 2020 +dtparts = t(as.data.frame(strsplit(grouped_rm$event_date,' '))) +#calculate in terms of July 6, 2020 +dtparts = t(as.data.frame(strsplit(grouped_rm$event_date,' '))) +#calculate in terms of July 6, 2020 +dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' '))) +View(dtparts) +thetimes = chron(dates=dtparts[,1],times=dtparts[,2], ++ format=c('y-m-d','h:m:s')) +thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s')) +#calculate in terms of July 6, 2020 +library(chron) +dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' '))) +thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s')) +typeof(thetimes) +grouped_rm <- grouped_rm |> +mutate(formatted_event_time = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))) |> +mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days")) +View(grouped_rm) +#test with linear model +grouping_model_rm <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_rm) +summary(grouping_model_rm) +#now doing it for the contrib_data +contrib_dtparts = t(as.data.frame(strsplit(as.character(grouped_contrib$event_date),' '))) +grouped_contrib <- grouped_contrib |> +mutate(formatted_event_time = chron(dates=contrib_dtparts[,1],times=contrib_dtparts[,2], format=c('y-m-d','h:m:s'))) |> +mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days")) +grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib) +summary(grouping_model_contrib) +summary(grouping_model_rm) +qqnorm(residuals(grouping_model_rm)) +grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib) +summary(grouping_model_contrib) +qqnorm(residuals(grouping_model_contrib)) +qqnorm(residuals(grouping_model_rm)) +qqnorm(residuals(grouping_model_contrib)) +issues_expansion <- lm(issue_mmt ~ as.factor(has_contrib) + scaled_age, data=octo_data) +summary(issues_expansion) +govdoc_mmt <- lm(mmt ~ as.factor(has_contrib) + scaled_age, data=octo_data) +summary(govdoc_mmt) 
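Before the remaining governance-document models below, one note on the date handling above: the chron detour is only needed if the time of day matters. Since event_delta is used in whole days, a hedged base-R alternative with as.Date() gives the same day-level difference (assuming event_date parses as a date or date-time, as it does when read with read_csv above):

library(dplyr)

grouped_rm <- grouped_rm |>
  mutate(event_delta = as.numeric(difftime(as.Date("2020-07-06"),
                                           as.Date(event_date),
                                           units = "days")))

grouped_contrib <- grouped_contrib |>
  mutate(event_delta = as.numeric(difftime(as.Date("2020-07-06"),
                                           as.Date(event_date),
                                           units = "days")))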
+govdoc_mmt <- lm(mmt ~ as.factor(has_readme) + scaled_age, data=octo_data) +summary(govdoc_mmt) +govdoc_issuesmmt <- lm(issue_mmt ~ as.factor(has_readme) + scaled_age, data=octo_data) +summary(govdoc_issuesmmt) +mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data) +summary(mmt_outcome_model) +all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = overall_data) +summary(all_mmt_outcome_model) +govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data) +summary(govdoc_issuesmmt) diff --git a/R/GovRiskPower.R b/R/GovRiskPower.R index 5f148af..9b94c7d 100644 --- a/R/GovRiskPower.R +++ b/R/GovRiskPower.R @@ -81,6 +81,15 @@ wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + summary(wiki_mmtmodel1) qqnorm(residuals(wiki_mmtmodel1)) +mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data) +summary(mmt_outcome_model) + +all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = overall_data) +summary(all_mmt_outcome_model) + +govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data) +summary(govdoc_issuesmmt) + library(texreg) #my little "lib" texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, @@ -93,9 +102,49 @@ contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_typ octo_data <- octo_data |> mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) +overall_data <- overall_data |> + mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> + mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) #below here is the analysis for the readme.md data cor.test(octo_data$mmt, octo_data$has_readme) cor.test(octo_data$mmt, octo_data$has_contrib) cor.test(octo_data$has_readme, octo_data$has_contrib) -issues_expansion <- lm(issue_mmt ~ has_contrib + scaled_age, data=octo_data) -summary(issues_expansion) +#pulling in the group data for the ranef coefficients +rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE) +contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE) +rm_grouping <- rm_grouping |> + rename(upstream_vcs_link = level)|> + mutate(factored_group = as.factor(ranef_grouping)) +contrib_grouping <- contrib_grouping |> + rename(upstream_vcs_link = level) |> + mutate(factored_group = as.factor(ranef_grouping)) +grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link") +grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link") +rm_did <- read_csv('../final_data/deb_readme_did.csv',show_col_types = FALSE) +contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE) +grouped_rm <- left_join(grouped_rm, rm_did, by="upstream_vcs_link") +grouped_contrib <- left_join(grouped_contrib, contrib_did, by="upstream_vcs_link") +#calculate in terms of July 6, 2020 +library(chron) +dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' '))) +thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s')) +typeof(thetimes) +how_long_has_file <- difftime(as.Date("2020-07-06"), 
as.Date(grouped_rm$event_date), units = "days")
+grouped_rm <- grouped_rm |>
+ mutate(formatted_event_time = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))) |>
+ mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
+#now doing it for the contrib_data
+contrib_dtparts = t(as.data.frame(strsplit(as.character(grouped_contrib$event_date),' ')))
+grouped_contrib <- grouped_contrib |>
+ mutate(formatted_event_time = chron(dates=contrib_dtparts[,1],times=contrib_dtparts[,2], format=c('y-m-d','h:m:s'))) |>
+ mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
+#analyses (cor.test requires numeric vectors, so use the numeric ranef_grouping coding)
+cor.test(grouped_rm$underproduction_mean, grouped_rm$ranef_grouping)
+cor.test(grouped_contrib$underproduction_mean, grouped_contrib$ranef_grouping)
+#test with linear model
+grouping_model_rm <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_rm)
+summary(grouping_model_rm)
+qqnorm(residuals(grouping_model_rm))
+grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib)
+summary(grouping_model_contrib)
+qqnorm(residuals(grouping_model_contrib))
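For reporting, the grouping models could be exported with the same texreg() pattern used for the MMT models earlier in the script; a sketch under that assumption (the custom model names here are illustrative):

library(texreg)

texreg(list(grouping_model_rm, grouping_model_contrib), stars = NULL, digits = 2,
       custom.model.names = c('M1: README grouping', 'M2: CONTRIBUTING grouping'),
       use.packages = FALSE, table = FALSE, ci.force = TRUE)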