updated cross-sectional analyses
This commit is contained in:
parent
fde9437843
commit
38e845ec90
816
R/.Rhistory
816
R/.Rhistory
@ -1,412 +1,4 @@
|
|||||||
theme_bw()
|
theme_bw()
|
||||||
wo_df_ranef |>
|
|
||||||
ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
|
|
||||||
geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) +
|
|
||||||
theme_bw()
|
|
||||||
# this is the file with the lmer multi-level rddAnalysis
|
|
||||||
library(tidyverse)
|
|
||||||
library(plyr)
|
|
||||||
# 0 loading the readme data in
|
|
||||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
|
||||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
|
||||||
# 1 preprocessing
|
|
||||||
#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
|
||||||
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
|
||||||
readme_df <- readme_df[,col_order]
|
|
||||||
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
|
|
||||||
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
|
|
||||||
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
|
|
||||||
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
|
|
||||||
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
|
||||||
readme_df = readme_df[,!(names(readme_df) %in% drop)]
|
|
||||||
# 2 some expansion needs to happens for each project
|
|
||||||
expand_timeseries <- function(project_row) {
|
|
||||||
longer <- project_row |>
|
|
||||||
pivot_longer(cols = starts_with("ct"),
|
|
||||||
names_to = "window",
|
|
||||||
values_to = "count") |>
|
|
||||||
unnest(count)
|
|
||||||
longer$observation_type <- gsub("^.*_", "", longer$window)
|
|
||||||
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
|
|
||||||
longer$count <- as.numeric(longer$count)
|
|
||||||
#longer <- longer[which(longer$observation_type == "all"),]
|
|
||||||
return(longer)
|
|
||||||
}
|
|
||||||
expanded_data <- expand_timeseries(readme_df[1,])
|
|
||||||
for (i in 2:nrow(readme_df)){
|
|
||||||
expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,]))
|
|
||||||
}
|
|
||||||
#filter out the windows of time that we're looking at
|
|
||||||
window_num <- 8
|
|
||||||
windowed_data <- expanded_data |>
|
|
||||||
filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
|
|
||||||
mutate(D = ifelse(week > 27, 1, 0))
|
|
||||||
#scale the age numbers
|
|
||||||
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
|
|
||||||
windowed_data$week_offset <- windowed_data$week - 27
|
|
||||||
#separate out the cleaning d
|
|
||||||
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
|
|
||||||
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
|
|
||||||
#find some EDA to identify which types of models might be the best for this
|
|
||||||
hist(log(all_actions_data$count))
|
|
||||||
all_actions_data$logged_count <- log(all_actions_data$count)
|
|
||||||
all_actions_data$log1p_count <- log1p(all_actions_data$count)
|
|
||||||
# 3 rdd in lmer analysis
|
|
||||||
# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
|
|
||||||
# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
|
|
||||||
library(lme4)
|
|
||||||
# https://www.bristol.ac.uk/cmm/learning/videos/random-intercepts.html#exvar
|
|
||||||
library(optimx)
|
|
||||||
library(lattice)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
summary(all_model)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef <- ranef(all_model, condVar=TRUE)
|
|
||||||
dotplot(all_model_ranef)
|
|
||||||
df_ranefs <- as.data.frame(all_model_ranef)
|
|
||||||
D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),]
|
|
||||||
#below this groups the ranefs
|
|
||||||
has_zero <- function(condval, condsd){
|
|
||||||
bounds <- condsd * 1.96
|
|
||||||
return(ifelse(((condval - bounds) < 0),ifelse(((condval + bounds) > 0), 1, 0), 2))
|
|
||||||
}
|
|
||||||
df_ranefs <- df_ranefs |>
|
|
||||||
mutate(ranef_grouping = has_zero(condval, condsd)) |>
|
|
||||||
mutate(rank = rank(condval))
|
|
||||||
D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),]
|
|
||||||
hist(D_df_ranef$ranef_grouping)
|
|
||||||
D_df_ranef |>
|
|
||||||
ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
|
|
||||||
geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) +
|
|
||||||
geom_bw()
|
|
||||||
#plot the ranefs
|
|
||||||
library(ggplot2)
|
|
||||||
D_df_ranef |>
|
|
||||||
ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
|
|
||||||
geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) +
|
|
||||||
geom_bw()
|
|
||||||
D_df_ranef |>
|
|
||||||
ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
|
|
||||||
geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) +
|
|
||||||
theme_bw()
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef <- ranef(all_model, condVar=TRUE)
|
|
||||||
dotplot(all_model_ranef)
|
|
||||||
df_ranefs <- as.data.frame(all_model_ranef)
|
|
||||||
#below this groups the ranefs
|
|
||||||
has_zero <- function(condval, condsd){
|
|
||||||
bounds <- condsd * 1.96
|
|
||||||
return(ifelse(((condval - bounds) < 0),ifelse(((condval + bounds) > 0), 1, 0), 2))
|
|
||||||
}
|
|
||||||
df_ranefs <- df_ranefs |>
|
|
||||||
mutate(ranef_grouping = has_zero(condval, condsd)) |>
|
|
||||||
mutate(rank = rank(condval))
|
|
||||||
D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),]
|
|
||||||
D_df_ranef |>
|
|
||||||
ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
|
|
||||||
geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) +
|
|
||||||
theme_bw()
|
|
||||||
D_df_ranefs <- D_df_ranefs |>
|
|
||||||
mutate(rank = rank(condval))
|
|
||||||
D_df_ranef <- D_df_ranef |>
|
|
||||||
mutate(rank = rank(condval))
|
|
||||||
D_df_ranef |>
|
|
||||||
ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
|
|
||||||
geom_linerange(aes(ymin= condval - (1.96 * condsd), ymax= condval + (1.96 * condsd))) +
|
|
||||||
theme_bw()
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_blup <- blup(all_model)
|
|
||||||
all_model_ranef <- ranef(all_model)
|
|
||||||
View(all_model_ranef)
|
|
||||||
df_ranefs <- as.data.frame(all_model_ranef)
|
|
||||||
dotplot(all_model_ranef)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_coef <- coef(all_model)
|
|
||||||
View(all_model_coef)
|
|
||||||
D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),]
|
|
||||||
D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),]
|
|
||||||
View(D_df_ranef)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
all_model_ranef <- ranef(all_model)
|
|
||||||
df_ranefs <- as.data.frame(all_model_ranef)
|
|
||||||
D_df_ranef <- df_ranefs[which(df_ranefs$term == "D"),]
|
|
||||||
View(D_df_ranef)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_variances <- postVar(all_model)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_variances <- vcov(all_model, condVar=TRUE)
|
|
||||||
View(all_model_variances)
|
|
||||||
print(all_model_variances)
|
|
||||||
View(all_model_variances)
|
|
||||||
conditional_variances_random <- lapply(all_model_variances, diag)
|
|
||||||
dotplot(conditional_variances_random)
|
|
||||||
dotplot(conditional_variances_random,
|
|
||||||
col = "blue",
|
|
||||||
pch = 19,
|
|
||||||
main = "Conditional Variances of Random Effects",
|
|
||||||
xlab = "Conditional Variance",
|
|
||||||
ylab = "Random Effect",
|
|
||||||
scales = list(x = list(log = TRUE)),
|
|
||||||
auto.key = list(space = "right"))
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_variances <- vcov(all_model, full=TRUE, condVar=TRUE)
|
|
||||||
View(all_model_variances)
|
|
||||||
summary(all_model)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_variances <- vcov(all_model, full=TRUE, condVar=TRUE)
|
|
||||||
View(all_model_variances)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_variances <- varCorr(all_model)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_variances <- VarCorr(all_model)
|
|
||||||
View(all_model_variances)
|
|
||||||
View(conditional_variances_random)
|
|
||||||
View(all_model_variances)
|
|
||||||
attr(VarCorr(all_model)$upstream_vcs_link, "stddevs")^2
|
|
||||||
values <- attr(VarCorr(all_model)$upstream_vcs_link, "stddevs")^2
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_variances <- vcov(all_model)
|
|
||||||
View(all_model_variances)
|
|
||||||
print(all_model_variances)
|
|
||||||
all_model_ranef <- ranef(all_model)$upstream_vcs_link
|
|
||||||
View(all_model_ranef)
|
|
||||||
all_model_ranef <- cov(ranef(all_model))
|
|
||||||
random_effects <- ranef(all_model)
|
|
||||||
random_effects_variances <- lapply(random_effects$upstream_vcs_link, function(x) {
|
|
||||||
variances <- var(x$D:I(week_offset))
|
|
||||||
return(variances)
|
|
||||||
})
|
|
||||||
variances <- var(x$D)
|
|
||||||
summary_of_all <- summary(all_model)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
variance_components <- summary_of_all$varcor
|
|
||||||
View(variance_components)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
varcorr_of_all <- VarCorr(all_model)
|
|
||||||
View(varcorr_of_all)
|
|
||||||
print(varcorr_of_all)
|
|
||||||
all_coefficients <- coef(all_model)
|
|
||||||
all_standard_errors <- sqrt(diag(vcov(all_model)))
|
|
||||||
all_conf_intervals <- cbind(coefficients - 1.96 * standard_errors,
|
|
||||||
coefficients + 1.96 * standard_errors)
|
|
||||||
all_conf_intervals <- cbind(all_coefficients - 1.96 * all_standard_errors,
|
|
||||||
all_coefficients + 1.96 * all_standard_errors)
|
|
||||||
View(all_coefficients)
|
|
||||||
View(conditional_variances_random)
|
|
||||||
View(all_coefficients)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
confint(all_model)
|
|
||||||
all_coefficients <- coef(all_model)
|
|
||||||
all_standard_errors <- sqrt(diag(vcov(all_model)))[3]
|
|
||||||
all_standard_errors <- sqrt(diag(vcov(all_model)))
|
|
||||||
all_standard_errors <- sqrt(diag(vcov(all_model)))[4]
|
|
||||||
all_standard_errors <- sqrt(diag(vcov(all_model)))[5]
|
|
||||||
all_standard_errors <- sqrt(diag(vcov(all_model)))[6]
|
|
||||||
all_standard_errors <- sqrt(diag(vcov(all_model)))[1]
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef <- ranef(all_model, condVar=TRUE)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
all_model_ranef <- ranef(all_model, condVar = FALSE)
|
|
||||||
View(all_model_ranef)
|
|
||||||
View(all_model_ranef_condvar)
|
|
||||||
dotplot(all_model_ranef)
|
|
||||||
dotplot(all_model_ranef_condvar)
|
|
||||||
View(all_model_ranef_condvar)
|
|
||||||
all_model_ranef_condvar[["upstream_vcs_link"]][["D"]]
|
|
||||||
View(all_model_ranef)
|
|
||||||
all_model_ranef_condvar$upstream_vcs_link
|
|
||||||
all_model_ranef_condvar$upstream_vcs_link$D
|
|
||||||
conditional_variances <- diag(vcov(model)$upstream_vcs_link$D)
|
|
||||||
conditional_variances <- diag(vcov(all_model)$upstream_vcs_link$D)
|
|
||||||
conditional_variances <- diag(vcov(all_model))
|
|
||||||
conditional_variances <- vcov(all_model)
|
|
||||||
View(conditional_variances)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- var(ranef(all_model, condVar = TRUE))
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- var(ranef(all_model, condVar = TRUE)$upstream_vcs_link$D)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)$upstream_vcs_link$D
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
View(all_model_ranef_condvar)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
View(all_model_ranef_condvar)
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link$D, "condVar")
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "condVar")
|
|
||||||
df_ranefs <- as.data.frame(all_model_ranef_condvar)
|
|
||||||
View(df_ranefs)
|
|
||||||
View(all_model_ranef_condvar)
|
|
||||||
#all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
# optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "condVar")
|
|
||||||
#all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
# optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=TRUE)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "condVar")
|
|
||||||
df_ranefs <- as.data.frame(all_model_ranef_condvar)
|
|
||||||
View(df_ranefs)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
View(all_model_ranef_condvar)
|
|
||||||
all_model_ranef <- ranef(all_model, condVar = FALSE)
|
|
||||||
View(all_model_ranef_condvar)
|
|
||||||
View(all_model_ranef_condvar[["upstream_vcs_link"]])
|
|
||||||
all_model_ranef_condvar[["upstream_vcs_link"]][["D"]]
|
|
||||||
View(all_model_ranef)
|
|
||||||
df_rn_no_cv <- as.data.frame(all_model_ranef)
|
|
||||||
View(df_rn_no_cv)
|
|
||||||
View(df_ranefs)
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link$D, "postVar")
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")[[4]]
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")[[3]]
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")[[2]]
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")[4]
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
isSingular(all_model)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (week_offset| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (week_offset| upstream_vcs_link), data=all_actions_data, REML=FALSE)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (I:(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D| upstream_vcs_link), data=all_actions_data, REML=FALSE)
|
|
||||||
summary_of_all <- summary(all_model)
|
|
||||||
summary(all_model)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
attr(all_model_ranef_condvar$upstream_vcs_link, "postVar")
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
# this is the file with the lmer multi-level rddAnalysis
|
|
||||||
library(tidyverse)
|
|
||||||
library(plyr)
|
|
||||||
# 0 loading the readme data in
|
|
||||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
|
||||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
|
||||||
# 1 preprocessing
|
|
||||||
#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
|
||||||
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
|
||||||
readme_df <- readme_df[,col_order]
|
|
||||||
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
|
|
||||||
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
|
|
||||||
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
|
|
||||||
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
|
|
||||||
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
|
||||||
readme_df = readme_df[,!(names(readme_df) %in% drop)]
|
|
||||||
# 2 some expansion needs to happens for each project
|
|
||||||
expand_timeseries <- function(project_row) {
|
|
||||||
longer <- project_row |>
|
|
||||||
pivot_longer(cols = starts_with("ct"),
|
|
||||||
names_to = "window",
|
|
||||||
values_to = "count") |>
|
|
||||||
unnest(count)
|
|
||||||
longer$observation_type <- gsub("^.*_", "", longer$window)
|
|
||||||
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
|
|
||||||
longer$count <- as.numeric(longer$count)
|
|
||||||
#longer <- longer[which(longer$observation_type == "all"),]
|
|
||||||
return(longer)
|
|
||||||
}
|
|
||||||
expanded_data <- expand_timeseries(readme_df[1,])
|
|
||||||
for (i in 2:nrow(readme_df)){
|
|
||||||
expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,]))
|
|
||||||
}
|
|
||||||
#filter out the windows of time that we're looking at
|
|
||||||
window_num <- 8
|
|
||||||
windowed_data <- expanded_data |>
|
|
||||||
filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
|
|
||||||
mutate(D = ifelse(week > 27, 1, 0))
|
|
||||||
#scale the age numbers
|
|
||||||
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
|
|
||||||
windowed_data$week_offset <- windowed_data$week - 27
|
|
||||||
#separate out the cleaning d
|
|
||||||
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
|
|
||||||
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
|
|
||||||
all_actions_data$log1p_count <- log1p(all_actions_data$count)
|
|
||||||
# 3 rdd in lmer analysis
|
|
||||||
# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
|
|
||||||
# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
|
|
||||||
library(lme4)
|
|
||||||
# https://www.bristol.ac.uk/cmm/learning/videos/random-intercepts.html#exvar
|
|
||||||
library(optimx)
|
|
||||||
library(lattice)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=FALSE, control = lmerControl(
|
|
||||||
optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
mmcm = coef(all_model)$upstream_vcs_link[, 1]
|
|
||||||
vcov.vals = as.data.frame(VarCorr(all_model))
|
|
||||||
View(vcov.vals)
|
|
||||||
#identifying the quartiles of effect for D
|
|
||||||
mmcm = coef(all_model)$upstream_vcs_link
|
|
||||||
View(mmcm)
|
|
||||||
summary(all_model)$coef[,2]
|
|
||||||
View(mmcm)
|
|
||||||
variance_components <- VarCorr(all_model)
|
|
||||||
group_variance <- attr(variance_components$upstream_vcs_link, "stddev")^2
|
|
||||||
View(mmcm)
|
|
||||||
fixef(all())
|
|
||||||
fixef(all_model
|
|
||||||
summary(all_model)$coef[,2]
|
|
||||||
fixef(all_model)
|
|
||||||
fixed_impacts = fixef(all_model)
|
|
||||||
dotplot(all_model_ranef_condvar)
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
dotplot(all_model_ranef_condvar)
|
|
||||||
broom.mixed::tidy(all_model, effects = "ran_vals", conf.int = TRUE)
|
|
||||||
test <- broom.mixed::tidy(all_model, effects = "ran_vals", conf.int = TRUE)
|
|
||||||
View(test)
|
|
||||||
all_gmodel <- glmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = Gamma)
|
|
||||||
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = Gamma)
|
|
||||||
all_gmodel <- glmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family=poisson)
|
|
||||||
all_gmodel <- glmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family=poisson)
|
|
||||||
all_gmodel <- glmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family=binomial)
|
|
||||||
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family=binomial)
|
|
||||||
df_ranefs <- as.data.frame(all_model_ranef_condvar)
|
|
||||||
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family=binomial)
|
|
||||||
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (1 | upstream_vcs_link), data=all_actions_data, family=poisson)
|
|
||||||
all_model_ranef_condvar <- ranef(all_gmodel, condVar = TRUE)
|
|
||||||
all_model_ranef_condvar <- ranef(all_model, condVar = TRUE)
|
|
||||||
all_gmodel_ranef_condvar <- ranef(all_gmodel, condVar = TRUE)
|
|
||||||
View(all_gmodel_ranef_condvar)
|
|
||||||
test <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
|
|
||||||
View(test)
|
|
||||||
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)+ scaled_project_age | upstream_vcs_link), data=all_actions_data)
|
|
||||||
test <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
|
|
||||||
View(test)
|
|
||||||
summary(all_gmodel)
|
|
||||||
all_model <- lmer(log1p_count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, REML=TRUE, control = lmerControl(
|
|
||||||
optimizer ='optimx', optCtrl=list(method='L-BFGS-B')))
|
|
||||||
test <- broom.mixed::tidy(all_model, effects = "ran_vals", conf.int = TRUE)
|
|
||||||
View(test)
|
|
||||||
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
|
|
||||||
View(test_condvals)
|
|
||||||
test_glmer_ranef_D <- test_condvals [which(test_condvals $term == "D"),]
|
|
||||||
View(test_glmer_ranef_D)
|
|
||||||
test_glmer_ranef_D <- test_condvals [which(test_condvals $term == "D"),]
|
|
||||||
has_zero <- function(estimate, low, high){
|
|
||||||
return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
|
|
||||||
}
|
|
||||||
test_glmer_ranef_D <- test_glmer_ranef_D |>
|
|
||||||
mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
|
|
||||||
mutate(rank = rank(estimate))
|
|
||||||
test_glmer_ranef_D |>
|
|
||||||
ggplot(aes(x=rank, y=condval, col = as.factor(ranef_grouping))) +
|
|
||||||
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
|
|
||||||
theme_bw()
|
|
||||||
test_glmer_ranef_D |>
|
test_glmer_ranef_D |>
|
||||||
ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
|
ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
|
||||||
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
|
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
|
||||||
@ -510,3 +102,411 @@ all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(w
|
|||||||
# control=glmerControl(optimizer="bobyqa",
|
# control=glmerControl(optimizer="bobyqa",
|
||||||
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
|
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
|
||||||
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data)
|
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data)
|
||||||
|
library(tidyverse)
|
||||||
|
library(plyr)
|
||||||
|
library(stringr)
|
||||||
|
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||||
|
#load in data
|
||||||
|
contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
|
||||||
|
readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")
|
||||||
|
#some expansion needs to happens for each project
|
||||||
|
expand_timeseries <- function(project_row) {
|
||||||
|
longer <- project_row |>
|
||||||
|
pivot_longer(cols = ends_with("new"),
|
||||||
|
names_to = "window",
|
||||||
|
values_to = "count") |>
|
||||||
|
unnest(count) |>
|
||||||
|
mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
|
||||||
|
mutate(is_collab = as.numeric(str_detect(window, "collab")))
|
||||||
|
return(longer)
|
||||||
|
}
|
||||||
|
expanded_readme_data <- expand_timeseries(readme_df[1,])
|
||||||
|
for (i in 2:nrow(readme_df)){
|
||||||
|
expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,]))
|
||||||
|
}
|
||||||
|
expanded_contrib_data <- expand_timeseries(contrib_df[1,])
|
||||||
|
for (i in 2:nrow(contrib_df)){
|
||||||
|
expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
|
||||||
|
}
|
||||||
|
expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
|
||||||
|
expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
|
||||||
|
expanded_readme_data$logcount <- log(expanded_readme_data$count)
|
||||||
|
expanded_contrib_data$logcount <- log(expanded_contrib_data$count)
|
||||||
|
#breaking out the types of population counts
|
||||||
|
collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),]
|
||||||
|
contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),]
|
||||||
|
collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),]
|
||||||
|
contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),]
|
||||||
|
#import models
|
||||||
|
library(lme4)
|
||||||
|
library(optimx)
|
||||||
|
collab_readme_model <- lmer(log1pcount ~ after_doc + (1| upstream_vcs_link), data=collab_pop_readme, REML=FALSE)
|
||||||
|
collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (1| upstream_vcs_link), data=collab_pop_readme)
|
||||||
|
summary(collab_readme_model)
|
||||||
|
crm_residuals <- residuals(collab_readme_model)
|
||||||
|
qqnorm(crm_residuals)
|
||||||
|
collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_readme)
|
||||||
|
summary(collab_readme_model)
|
||||||
|
crm_residuals <- residuals(collab_readme_model)
|
||||||
|
qqnorm(crm_residuals)
|
||||||
|
saveRDS(collab_readme_model, "0510_pop_rm_collab.rda")
|
||||||
|
contrib_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_readme)
|
||||||
|
summary(contrib_readme_model)
|
||||||
|
saveRDS(contrib_readme_model, "0510_pop_rm_contrib.rda")
|
||||||
|
collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_contrib)
|
||||||
|
summary(collab_contrib_model)
|
||||||
|
saveRDS(collab_contrib_model, "0510_pop_contrib_collab.rda")
|
||||||
|
contrib_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_contrib)
|
||||||
|
summary(contrib_contrib_model)
|
||||||
|
saveRDS(contrib_contrib_model, "0510_pop_contrib_contrib.rda")
|
||||||
|
summary(collab_readme_model)
|
||||||
|
summary(contrib_readme_model)
|
||||||
|
qqnorm(crm_residuals)
|
||||||
|
conrm_residuals <- residuals(contrib_readme_model)
|
||||||
|
qqnorm(conrm_residuals)
|
||||||
|
summary(collab_contrib_model)
|
||||||
|
summary(contrib_contrib_model)
|
||||||
|
library(ggplot2)
|
||||||
|
expanded_readme_data |>
|
||||||
|
ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) +
|
||||||
|
geom_point() + geom_jitter()
|
||||||
|
expanded_readme_data |>
|
||||||
|
ggplot(aes(x = after_doc, y = count, col = as.factor(is_collab))) +
|
||||||
|
geom_point() + geom_jitter()
|
||||||
|
expanded_readme_data |>
|
||||||
|
ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) +
|
||||||
|
geom_point() + geom_jitter()
|
||||||
|
#primary analysis for cross-sectional community metrics
|
||||||
|
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
|
||||||
|
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
|
||||||
|
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
|
||||||
|
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
|
||||||
|
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
|
||||||
|
mean(overall_data$mmt)
|
||||||
|
hist(overall_data$mmt, probability = TRUE)
|
||||||
|
#age_vector <- overall_data$age_of_project/365
|
||||||
|
#quantile(age_vector)
|
||||||
|
overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||||
|
table(overall_data$new.age)
|
||||||
|
overall_data$new.age.factor <- as.factor(overall_data$new.age)
|
||||||
|
overall_data$scaled_age <- scale(overall_data$age_of_project)
|
||||||
|
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
|
||||||
|
summary(mmtmodel1)
|
||||||
|
qqnorm(residuals(mmtmodel1))
|
||||||
|
summary(mmtmodel1)
|
||||||
|
octo_data$scaled_age <- scale(octo_data$age_of_project)
|
||||||
|
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||||
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||||
|
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
||||||
|
g4 <- ggplot(octo_data)
|
||||||
|
g4
|
||||||
|
#below are the models for the octo data, there should be analysis for each one
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
|
||||||
|
summary(octo_mmtmodel1)
|
||||||
|
#below are the models for the octo data, there should be analysis for each one
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=octo_data)
|
||||||
|
summary(octo_mmtmodel1)
|
||||||
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
|
||||||
|
summary(issue_mmtmodel1)
|
||||||
|
sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + scaled_age, data=octo_data)
|
||||||
|
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age, data=octo_data)
|
||||||
|
summary(wiki_mmtmodel1)
|
||||||
|
qqnorm(residuals(issue_mmtmodel1))
|
||||||
|
qqnorm(residuals(wiki_mmtmodel1))
|
||||||
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||||
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
|
||||||
|
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
|
||||||
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||||
|
library(texreg) #my little "lib"
|
||||||
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||||
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
|
||||||
|
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
|
||||||
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||||
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||||
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
|
||||||
|
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'scaled_age', 'Wiki'),
|
||||||
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||||
|
summary(octo_mmtmodel1)
|
||||||
|
summary(wiki_mmtmodel1)
|
||||||
|
#left skewed data, need to transform
|
||||||
|
sum(is.na(octo_data$wiki_mmt))
|
||||||
|
#left skewed data, need to transform
|
||||||
|
sum(is.na(octo_data$issue_mmt))
|
||||||
|
#left skewed data, need to transform
|
||||||
|
sum(is.na(octo_data$mmt))
|
||||||
|
test_frame <- na.omit(octo_data)
|
||||||
|
#left skewed data, need to transform
|
||||||
|
sum(is.na(octo_data$issue_contrib_count))
|
||||||
|
#left skewed data, need to transform
|
||||||
|
sum(is.na(octo_data$wiki_contrib_count))
|
||||||
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||||
|
#left skewed data, need to transform
|
||||||
|
typeof(octo_data$wiki_contrib_count)
|
||||||
|
View(octo_data)
|
||||||
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$api_contrib_count + octo_data$file_contrib_count + octo_data$wiki_contrib_count)) / (octo_data$api_contrib_count + octo_data$file_contrib_count + octo_data$wiki_contrib_count + octo_data$issue_contrib_count)
|
||||||
|
sum(is.na(octo_data$issue_mmt))
|
||||||
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||||
|
sum(is.na(octo_data$issue_mmt))
|
||||||
|
sum(octo_data$total_contrib == 0)
|
||||||
|
#clean octo data
|
||||||
|
octo_data <- filter(octo_data, total_contrib == 0)
|
||||||
|
sum(octo_data$total_contrib == 0)
|
||||||
|
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
|
||||||
|
#clean octo data
|
||||||
|
octo_data <- filter(octo_data, total_contrib != 0)
|
||||||
|
octo_data$scaled_age <- scale(octo_data$age_of_project)
|
||||||
|
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||||
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||||
|
#right skewed data, need to transform
|
||||||
|
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
||||||
|
#below are the models for the octo data, there should be analysis for each one
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=octo_data)
|
||||||
|
summary(octo_mmtmodel1)
|
||||||
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
|
||||||
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
|
||||||
|
qqnorm(residuals(issue_mmtmodel1))
|
||||||
|
sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + scaled_age, data=octo_data)
|
||||||
|
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age, data=octo_data)
|
||||||
|
summary(wiki_mmtmodel1)
|
||||||
|
qqnorm(residuals(wiki_mmtmodel1))
|
||||||
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||||
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
|
||||||
|
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'scaled_age', 'Wiki'),
|
||||||
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||||
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||||
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
|
||||||
|
custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'Issue MMT', 'Wiki MMT'),
|
||||||
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||||
|
qqnorm(residuals(wiki_mmtmodel1))
|
||||||
|
View(octo_data)
|
||||||
|
#TODO: find the overlap between projects with octo data and projects with readmes or contributings
|
||||||
|
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
|
||||||
|
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
|
||||||
|
octo_data |>
|
||||||
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link))
|
||||||
|
View(octo_data)
|
||||||
|
octo_data <- octo_data |>
|
||||||
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
||||||
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
||||||
|
View(octo_data)
|
||||||
|
#below here is the analysis for the readme.md data
|
||||||
|
cor.test(octo_data$mmt, octo_data$has_readme)
|
||||||
|
cor.test(octo_data$mmt, octo_data$has_contributing)
|
||||||
|
cor.test(octo_data$mmt, octo_data$has_contrib)
|
||||||
|
issues_expansion <- lm(issue_mmt ~ has_readme + scaled_age, data=octo_data)
|
||||||
|
summary(issues_expansion)
|
||||||
|
issues_expansion <- lm(issue_mmt ~ has_contrib + scaled_age, data=octo_data)
|
||||||
|
summary(issues_expansion)
|
||||||
|
#below here is the analysis for the readme.md data
|
||||||
|
cor.test(octo_data$mmt, octo_data$scaled_age)
|
||||||
|
#below here is the analysis for the readme.md data
|
||||||
|
cor.test(octo_data$mmt, octo_data$scaled_age)
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contributing, data=octo_data)
|
||||||
|
octo_data <- octo_data |>
|
||||||
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
||||||
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contributing, data=octo_data)
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
||||||
|
summary(octo_mmtmodel1)
|
||||||
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
||||||
|
summary(issue_mmtmodel1)
|
||||||
|
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
||||||
|
summary(wiki_mmtmodel1)
|
||||||
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||||
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
|
||||||
|
custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
|
||||||
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||||
|
qqnorm(residuals(issue_mmtmodel1))
|
||||||
|
qqnorm(residuals(wiki_mmtmodel1))
|
||||||
|
#below here is the analysis for the readme.md data
|
||||||
|
cor.test(octo_data$mmt, octo_data$issue_mmt)
|
||||||
|
#below here is the analysis for the readme.md data
|
||||||
|
cor.test(octo_data$mmt, octo_data$wiki_mmt)
|
||||||
|
#below here is the analysis for the readme.md data
|
||||||
|
cor.test(octo_data$mmt, octo_data$has_readme)
|
||||||
|
cor.test(octo_data$has_readme, octo_data$has_contrib)
|
||||||
|
library(readr)
|
||||||
|
library(ggplot2)
|
||||||
|
library(tidyverse)
|
||||||
|
#primary analysis for cross-sectional community metrics
|
||||||
|
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
|
||||||
|
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
|
||||||
|
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
|
||||||
|
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
|
||||||
|
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
|
||||||
|
mean(overall_data$mmt)
|
||||||
|
hist(overall_data$mmt, probability = TRUE)
|
||||||
|
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
|
||||||
|
overall_data$scaled_age <- scale(overall_data$age_of_project)
|
||||||
|
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
|
||||||
|
summary(mmtmodel1)
|
||||||
|
#clean octo data
|
||||||
|
octo_data <- filter(octo_data, total_contrib != 0)
|
||||||
|
octo_data$scaled_age <- scale(octo_data$age_of_project)
|
||||||
|
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||||
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||||
|
#right skewed data, need to transform
|
||||||
|
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
||||||
|
#find the overlap between projects with octo data and projects with readmes or contributings
|
||||||
|
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
|
||||||
|
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
|
||||||
|
octo_data <- octo_data |>
|
||||||
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
||||||
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
||||||
|
summary(octo_mmtmodel1)
|
||||||
|
mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
|
||||||
|
summary(mmt_outcome_model)
|
||||||
|
mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
|
||||||
|
summary(mmt_outcome_model)
|
||||||
|
mmt_outcome_model <- lm(wiki_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
|
||||||
|
summary(mmt_outcome_model)
|
||||||
|
mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
|
||||||
|
summary(mmt_outcome_model)
|
||||||
|
mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
|
||||||
|
summary(mmt_outcome_model)
|
||||||
|
overall_data <- overall_data |>
|
||||||
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
||||||
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
||||||
|
all_mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = overall_data)
|
||||||
|
summary(all_mmt_outcome_model)
|
||||||
|
#pulling in the group data for the ranef coefficients
|
||||||
|
rm_grouping <- read_csv('../051224_readme_grouped.csv',show_col_types = FALSE)
|
||||||
|
#pulling in the group data for the ranef coefficients
|
||||||
|
rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
|
||||||
|
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
|
||||||
|
View(contrib_grouping)
|
||||||
|
View(rm_grouping)
|
||||||
|
View(readme_did_roster)
|
||||||
|
grouped_rm <- left_join(rm_grouping, overall_data, by = c("level","upstream_vcs_link"))
|
||||||
|
rm_grouping <- rm_grouping |>
|
||||||
|
rename(upstream_vcs_link = level)
|
||||||
|
View(rm_grouping)
|
||||||
|
grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link")
|
||||||
|
View(grouped_rm)
|
||||||
|
contrib_grouping <- contrib_grouping |>
|
||||||
|
rename(upstream_vcs_link = level)
|
||||||
|
grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link")
|
||||||
|
View(grouped_rm)
|
||||||
|
#analyses
|
||||||
|
cor.test(grouped_rm$mmt, grouped_rm$ranef_grouping)
|
||||||
|
cor.test(grouped_contrib$mmt, grouped_contrib$ranef_grouping)
|
||||||
|
#analyses
|
||||||
|
cor.test(grouped_rm$underproduction_mean, grouped_rm$ranef_grouping)
|
||||||
|
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$ranef_grouping)
|
||||||
|
#analyses
|
||||||
|
cor.test(grouped_rm$underproduction_mean, grouped_rm$estimate)
|
||||||
|
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$estimate)
|
||||||
|
View(grouped_rm)
|
||||||
|
#test with linear model
|
||||||
|
grouping_model <- lm(underproduction_mean ~ estimate + scaled_age, data=grouped_rm)
|
||||||
|
summary(grouping_model)
|
||||||
|
#test with linear model
|
||||||
|
grouping_model <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_rm)
|
||||||
|
summary(grouping_model)
|
||||||
|
#test with linear model
|
||||||
|
grouping_model <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_rm)
|
||||||
|
summary(grouping_model)
|
||||||
|
grouping_model_contrib <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_contrib)
|
||||||
|
summary(grouping_model_contrib)
|
||||||
|
#test with linear model
|
||||||
|
grouping_model_rm <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_rm)
|
||||||
|
summary(grouping_model_rm)
|
||||||
|
grouping_model_contrib <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_contrib)
|
||||||
|
summary(grouping_model_contrib)
|
||||||
|
#test with linear model
|
||||||
|
grouping_model_rm <- glm.nb(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_rm)
|
||||||
|
#pulling in the group data for the ranef coefficients
|
||||||
|
rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
|
||||||
|
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
|
||||||
|
rm_grouping <- rm_grouping |>
|
||||||
|
rename(upstream_vcs_link = level)|>
|
||||||
|
mutate(factored_group = as.factor(ranef_grouping))
|
||||||
|
contrib_grouping <- contrib_grouping |>
|
||||||
|
rename(upstream_vcs_link = level) |>
|
||||||
|
mutate(factored_group = as.factor(ranef_grouping))
|
||||||
|
grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link")
|
||||||
|
grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link")
|
||||||
|
#analyses
|
||||||
|
cor.test(grouped_rm$underproduction_mean, grouped_rm$factored_group)
|
||||||
|
#test with linear model
|
||||||
|
grouping_model_rm <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_rm)
|
||||||
|
summary(grouping_model_rm)
|
||||||
|
grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_contrib)
|
||||||
|
summary(grouping_model_contrib)
|
||||||
|
summary(grouping_model_rm)
|
||||||
|
grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_contrib)
|
||||||
|
summary(grouping_model_contrib)
|
||||||
|
qqnorm(residuals(grouping_model_rm))
|
||||||
|
qqnorm(residuals(grouping_model_contrib))
|
||||||
|
rm_did <- read_csv('../final_data/deb_readme_did.csv',show_col_types = FALSE)
|
||||||
|
contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE)
|
||||||
|
grouped_rm <- left_join(grouped_rm, rm_did, by="upstream_vcs_link")
|
||||||
|
grouped_contrib <- left_join(grouped_contrib, contrib_did, by="upstream_vcs_link")
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
typeof(event_date)
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
typeof(grouped_rm$event_date)
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
typeof(as.Date(grouped_rm$event_date))
|
||||||
|
how_long_has_file <- as.Date("2020-07-06") - as.Date(grouped_rm$event_date)
|
||||||
|
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date))
|
||||||
|
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days")
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
grouped_rm$event_date
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
dates <- as.POSIXct(grouped_rm$event_date,tz="UTC")
|
||||||
|
dates
|
||||||
|
typeof(dates)
|
||||||
|
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days")
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
dtparts = t(as.data.frame(strsplit(grouped_rm$event_date,' ')))
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
dtparts = t(as.data.frame(strsplit(grouped_rm$event_date,' ')))
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' ')))
|
||||||
|
View(dtparts)
|
||||||
|
thetimes = chron(dates=dtparts[,1],times=dtparts[,2],
|
||||||
|
+ format=c('y-m-d','h:m:s'))
|
||||||
|
thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
library(chron)
|
||||||
|
dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' ')))
|
||||||
|
thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))
|
||||||
|
typeof(thetimes)
|
||||||
|
grouped_rm <- grouped_rm |>
|
||||||
|
mutate(formatted_event_time = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))) |>
|
||||||
|
mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
|
||||||
|
View(grouped_rm)
|
||||||
|
#test with linear model
|
||||||
|
grouping_model_rm <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_rm)
|
||||||
|
summary(grouping_model_rm)
|
||||||
|
#now doing it for the contrib_data
|
||||||
|
contrib_dtparts = t(as.data.frame(strsplit(as.character(grouped_contrib$event_date),' ')))
|
||||||
|
grouped_contrib <- grouped_contrib |>
|
||||||
|
mutate(formatted_event_time = chron(dates=contrib_dtparts[,1],times=contrib_dtparts[,2], format=c('y-m-d','h:m:s'))) |>
|
||||||
|
mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
|
||||||
|
grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib)
|
||||||
|
summary(grouping_model_contrib)
|
||||||
|
summary(grouping_model_rm)
|
||||||
|
qqnorm(residuals(grouping_model_rm))
|
||||||
|
grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib)
|
||||||
|
summary(grouping_model_contrib)
|
||||||
|
qqnorm(residuals(grouping_model_contrib))
|
||||||
|
qqnorm(residuals(grouping_model_rm))
|
||||||
|
qqnorm(residuals(grouping_model_contrib))
|
||||||
|
issues_expansion <- lm(issue_mmt ~ as.factor(has_contrib) + scaled_age, data=octo_data)
|
||||||
|
summary(issues_expansion)
|
||||||
|
govdoc_mmt <- lm(mmt ~ as.factor(has_contrib) + scaled_age, data=octo_data)
|
||||||
|
summary(govdoc_mmt)
|
||||||
|
govdoc_mmt <- lm(mmt ~ as.factor(has_readme) + scaled_age, data=octo_data)
|
||||||
|
summary(govdoc_mmt)
|
||||||
|
govdoc_issuesmmt <- lm(issue_mmt ~ as.factor(has_readme) + scaled_age, data=octo_data)
|
||||||
|
summary(govdoc_issuesmmt)
|
||||||
|
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
|
||||||
|
summary(mmt_outcome_model)
|
||||||
|
all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = overall_data)
|
||||||
|
summary(all_mmt_outcome_model)
|
||||||
|
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
|
||||||
|
summary(govdoc_issuesmmt)
|
||||||
|
@ -81,6 +81,15 @@ wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme +
|
|||||||
summary(wiki_mmtmodel1)
|
summary(wiki_mmtmodel1)
|
||||||
qqnorm(residuals(wiki_mmtmodel1))
|
qqnorm(residuals(wiki_mmtmodel1))
|
||||||
|
|
||||||
|
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
|
||||||
|
summary(mmt_outcome_model)
|
||||||
|
|
||||||
|
all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = overall_data)
|
||||||
|
summary(all_mmt_outcome_model)
|
||||||
|
|
||||||
|
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
|
||||||
|
summary(govdoc_issuesmmt)
|
||||||
|
|
||||||
library(texreg) #my little "lib"
|
library(texreg) #my little "lib"
|
||||||
|
|
||||||
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||||
@ -93,9 +102,49 @@ contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_typ
|
|||||||
octo_data <- octo_data |>
|
octo_data <- octo_data |>
|
||||||
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
||||||
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
||||||
|
overall_data <- overall_data |>
|
||||||
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
||||||
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
||||||
#below here is the analysis for the readme.md data
|
#below here is the analysis for the readme.md data
|
||||||
cor.test(octo_data$mmt, octo_data$has_readme)
|
cor.test(octo_data$mmt, octo_data$has_readme)
|
||||||
cor.test(octo_data$mmt, octo_data$has_contrib)
|
cor.test(octo_data$mmt, octo_data$has_contrib)
|
||||||
cor.test(octo_data$has_readme, octo_data$has_contrib)
|
cor.test(octo_data$has_readme, octo_data$has_contrib)
|
||||||
issues_expansion <- lm(issue_mmt ~ has_contrib + scaled_age, data=octo_data)
|
#pulling in the group data for the ranef coefficients
|
||||||
summary(issues_expansion)
|
rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
|
||||||
|
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
|
||||||
|
rm_grouping <- rm_grouping |>
|
||||||
|
rename(upstream_vcs_link = level)|>
|
||||||
|
mutate(factored_group = as.factor(ranef_grouping))
|
||||||
|
contrib_grouping <- contrib_grouping |>
|
||||||
|
rename(upstream_vcs_link = level) |>
|
||||||
|
mutate(factored_group = as.factor(ranef_grouping))
|
||||||
|
grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link")
|
||||||
|
grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link")
|
||||||
|
rm_did <- read_csv('../final_data/deb_readme_did.csv',show_col_types = FALSE)
|
||||||
|
contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE)
|
||||||
|
grouped_rm <- left_join(grouped_rm, rm_did, by="upstream_vcs_link")
|
||||||
|
grouped_contrib <- left_join(grouped_contrib, contrib_did, by="upstream_vcs_link")
|
||||||
|
#calculate in terms of July 6, 2020
|
||||||
|
library(chron)
|
||||||
|
# Split each README event timestamp on the space between its date and time
# parts, giving one row per event with the date in column 1 and the time in
# column 2.
dtparts <- t(as.data.frame(
  strsplit(as.character(grouped_rm$event_date), " ")
))
|
||||||
|
# Rebuild the README event timestamps as a chron object from the separate
# date and time columns of dtparts.
thetimes <- chron(
  dates = dtparts[, 1],
  times = dtparts[, 2],
  format = c("y-m-d", "h:m:s")
)
|
||||||
|
typeof(thetimes)
|
||||||
|
# Days between the 2020-07-06 reference date and each README event. Both
# sides go through as.Date(), so the difference is in whole days.
# NOTE(review): this value is not referenced again in the visible script; the
# event_delta column computed afterwards appears to supersede it -- confirm
# whether it is still needed.
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days")
|
||||||
|
grouped_rm <- grouped_rm |>
|
||||||
|
mutate(formatted_event_time = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))) |>
|
||||||
|
mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
|
||||||
|
#now doing it for the contrib_data
|
||||||
|
# Split each CONTRIBUTING event timestamp on the space between its date and
# time parts, giving one row per event with the date in column 1 and the time
# in column 2.
contrib_dtparts <- t(as.data.frame(
  strsplit(as.character(grouped_contrib$event_date), " ")
))
|
||||||
|
grouped_contrib <- grouped_contrib |>
|
||||||
|
mutate(formatted_event_time = chron(dates=contrib_dtparts[,1],times=contrib_dtparts[,2], format=c('y-m-d','h:m:s'))) |>
|
||||||
|
mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
|
||||||
|
#analyses
|
||||||
|
# Correlation between underproduction and the random-effect grouping for the
# README projects. cor.test() only accepts numeric vectors and stops with an
# error when handed a factor, so use the ranef_grouping column that
# factored_group was derived from rather than the factor itself.
# NOTE(review): assumes ranef_grouping is read in as a numeric column --
# confirm against 051224_readme_grouped.csv.
cor.test(grouped_rm$underproduction_mean, grouped_rm$ranef_grouping)
|
||||||
|
# Correlation between underproduction and the random-effect grouping for the
# CONTRIBUTING projects. cor.test() only accepts numeric vectors and stops
# with an error when handed a factor, so use the ranef_grouping column that
# factored_group was derived from rather than the factor itself.
# NOTE(review): assumes ranef_grouping is read in as a numeric column --
# confirm against 051224_contrib_grouped.csv.
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$ranef_grouping)
|
||||||
|
#test with linear model
|
||||||
|
grouping_model_rm <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_rm)
|
||||||
|
summary(grouping_model_rm)
|
||||||
|
qqnorm(residuals(grouping_model_rm))
|
||||||
|
grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib)
|
||||||
|
summary(grouping_model_contrib)
|
||||||
|
qqnorm(residuals(grouping_model_contrib))
|
||||||
|
Loading…
Reference in New Issue
Block a user