updates to some of the R scripts

mjgaughan 2024-06-19 19:40:22 -04:00
parent b48a684185
commit 379a8929a5
7 changed files with 6287 additions and 3518 deletions

BIN R/.RData (binary file not shown)

@@ -1,330 +1,450 @@
  mutate(crescendo_limit = ifelse(week_offset < (-4), 0, 1))|>
cor.test(crescendo_limit, count)
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- windowed_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-2), 0, 1))
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- windowed_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the presce
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
theme_bw()
test_glmer_ranef_D |>
  ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
  geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
  theme_bw()
summary(all_gmodel)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
test_glmer_ranef_D <- test_condvals[which(test_condvals$term == "D"),]
has_zero <- function(estimate, low, high){
  return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
}
test_glmer_ranef_D <- test_glmer_ranef_D |>
  mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
  mutate(rank = rank(estimate))
test_glmer_ranef_D |>
  ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
  geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
  theme_bw()
View(test_glmer_ranef_D)
View(test_condvals)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
View(test_condvals)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = Poisson)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = poisson)
summary(all_gmodel)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data, family = poisson)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
test_glmer_ranef_D <- test_condvals[which(test_condvals$term == "D"),]
has_zero <- function(estimate, low, high){
return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
}
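A note on the helper, since it drives the grouping in the plots below: my reading of has_zero, illustrated with hypothetical interval values.
# has_zero classifies a confidence interval by its position relative to zero:
# has_zero(0, -0.5, 0.5)   # 1: the interval spans zero
# has_zero(0, -0.9, -0.1)  # 0: the interval sits entirely below zero
# has_zero(0, 0.2, 0.9)    # 2: the interval sits entirely above zero
# (the estimate argument is accepted but unused)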
test_glmer_ranef_D <- test_glmer_ranef_D |>
mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
mutate(rank = rank(estimate))
test_glmer_ranef_D |>
ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
theme_bw()
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, family = poisson)
all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
test_glmer_ranef_D <- test_condvals[which(test_condvals$term == "D"),]
has_zero <- function(estimate, low, high){
return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
}
test_glmer_ranef_D <- test_glmer_ranef_D |>
mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
mutate(rank = rank(estimate))
test_glmer_ranef_D |>
ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
theme_bw()
variance(all_actions_data$log1p_count)
var(all_actions_data$log1p_count)
mean (all_actions_data$log1p_count)
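Since var() and mean() are being compared just above, a dispersion check along these lines (a sketch, not in the original) makes the motivation for the negative binomial models explicit.
# For a Poisson outcome the variance should be close to the mean, so a
# ratio well above 1 signals overdispersion and favors glmer.nb below.
var(all_actions_data$count) / mean(all_actions_data$count)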
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link),data=all_actions_data)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link),
control=glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun=2e5)), data=all_actions_data)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link),
control=glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun=2e5)), data=all_actions_data)
summary(all_gmodel)
test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE)
test_glmer_ranef_D <- test_condvals[which(test_condvals$term == "D"),]
has_zero <- function(estimate, low, high){
return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2))
}
test_glmer_ranef_D <- test_glmer_ranef_D |>
mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |>
mutate(rank = rank(estimate))
test_glmer_ranef_D |>
ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) +
geom_linerange(aes(ymin= conf.low, ymax= conf.high)) +
theme_bw()
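For orientation (my annotation, not the author's): this ranked linerange figure is a caterpillar plot of the per-project conditional random slopes for D.
# Reading the plot: each vertical segment is one project's conditional CI for
# the D slope, ranked by estimate; color encodes has_zero's grouping
# (0 = entirely negative, 1 = spans zero, 2 = entirely positive).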
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
#all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link),
# control=glmerControl(optimizer="bobyqa",
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link), data=all_actions_data)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
#all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link),
# control=glmerControl(optimizer="bobyqa",
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link), data=all_actions_data, verbose=TRUE)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
#all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link),
# control=glmerControl(optimizer="bobyqa",
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link), data=all_actions_data)
#all_gmodel <- glmer(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset)| upstream_vcs_link), data=all_actions_data, nAGQ=0, family = poisson)
#all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D * I(week_offset) | upstream_vcs_link),
# control=glmerControl(optimizer="bobyqa",
# optCtrl=list(maxfun=2e5)), data=all_actions_data)
all_gmodel <- glmer.nb(count ~ D * I(week_offset)+ scaled_project_age + (D | upstream_vcs_link), data=all_actions_data)
library(tidyverse)
library(plyr)
library(stringr)
#get the contrib data instead
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
#load in data
contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")
#some expansion needs to happen for each project
contrib_df <- read_csv("../final_data/deb_contrib_did.csv")
#some preprocessing and expansion
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
contrib_df <- contrib_df[,col_order]
contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ")
contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ")
contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ")
contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
contrib_df = contrib_df[,!(names(contrib_df) %in% drop)]
# 2 some expansion needs to happen for each project
expand_timeseries <- function(project_row) {
  longer <- project_row |>
    pivot_longer(cols = ends_with("new"),
                 names_to = "window",
                 values_to = "count") |>
    unnest(count) |>
    mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
    mutate(is_collab = as.numeric(str_detect(window, "collab")))
  return(longer)
}
expand_timeseries <- function(project_row) {
  longer <- project_row |>
    pivot_longer(cols = starts_with("ct"),
                 names_to = "window",
                 values_to = "count") |>
    unnest(count)
  longer$observation_type <- gsub("^.*_", "", longer$window)
  longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
  longer$count <- as.numeric(longer$count)
  #longer <- longer[which(longer$observation_type == "all"),]
  return(longer)
}
expanded_readme_data <- expand_timeseries(readme_df[1,])
for (i in 2:nrow(readme_df)){
  expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,]))
}
expanded_contrib_data <- expand_timeseries(contrib_df[1,])
for (i in 2:nrow(contrib_df)){
  expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
}
expanded_data <- expand_timeseries(contrib_df[1,])
for (i in 2:nrow(contrib_df)){
  expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,]))
}
expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
expanded_readme_data$logcount <- log(expanded_readme_data$count)
expanded_contrib_data$logcount <- log(expanded_contrib_data$count)
#breaking out the types of population counts
collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),]
contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),]
collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),]
contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),]
#import models
library(lme4)
library(optimx)
collab_readme_model <- lmer(log1pcount ~ after_doc + (1| upstream_vcs_link), data=collab_pop_readme, REML=FALSE)
collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (1| upstream_vcs_link), data=collab_pop_readme)
summary(collab_readme_model)
crm_residuals <- residuals(collab_readme_model)
qqnorm(crm_residuals)
collab_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_readme)
summary(collab_readme_model)
crm_residuals <- residuals(collab_readme_model)
qqnorm(crm_residuals)
saveRDS(collab_readme_model, "0510_pop_rm_collab.rda")
contrib_readme_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_readme)
summary(contrib_readme_model)
saveRDS(contrib_readme_model, "0510_pop_rm_contrib.rda")
collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_contrib)
summary(collab_contrib_model)
saveRDS(collab_contrib_model, "0510_pop_contrib_collab.rda")
contrib_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=contrib_pop_contrib)
summary(contrib_contrib_model)
saveRDS(contrib_contrib_model, "0510_pop_contrib_contrib.rda")
summary(collab_readme_model)
summary(contrib_readme_model)
qqnorm(crm_residuals)
conrm_residuals <- residuals(contrib_readme_model)
qqnorm(conrm_residuals)
summary(collab_contrib_model)
summary(contrib_contrib_model)
#filter out the windows of time that we're looking at
window_num <- 8
windowed_data <- expanded_data |>
  filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
  mutate(D = ifelse(week > 27, 1, 0))
#scale the age numbers and calculate the week offset here
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
windowed_data$week_offset <- windowed_data$week - 27
#break out the different types of commit actions
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#logging
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
# now for merge
mrg_actions_data$logged_count <- log(mrg_actions_data$count)
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
  filter(week_offset <= 0) |>
  mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
# this is the file with the lmer multi-level rddAnalysis
library(tidyverse)
library(plyr)
# 0 loading the readme data in
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../final_data/deb_readme_did.csv")
# 1 preprocessing
#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
readme_df <- readme_df[,col_order]
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
readme_df = readme_df[,!(names(readme_df) %in% drop)]
# 2 some expansion needs to happen for each project
expand_timeseries <- function(project_row) {
longer <- project_row |>
pivot_longer(cols = starts_with("ct"),
names_to = "window",
values_to = "count") |>
unnest(count)
longer$observation_type <- gsub("^.*_", "", longer$window)
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
longer$count <- as.numeric(longer$count)
#longer <- longer[which(longer$observation_type == "all"),]
return(longer)
}
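As a sketch of what expand_timeseries returns, here is a toy one-row input with hypothetical values; the ct_* list-columns unnest to one row per week, tagged by observation_type.
toy_row <- tibble(upstream_vcs_link = "example_repo", age_of_project = 1000,
                  ct_before_all = list(c("1", "2")), ct_after_all = list(c("3", "4")))
expand_timeseries(toy_row)
# -> four rows: count 1,2,3,4; observation_type "all"; week 0,1,2,3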
expanded_data <- expand_timeseries(readme_df[1,])
for (i in 2:nrow(readme_df)){
expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,]))
}
#filter out the windows of time that we're looking at
window_num <- 8
windowed_data <- expanded_data |>
filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
mutate(D = ifelse(week > 27, 1, 0))
#scale the age numbers
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
windowed_data$week_offset <- windowed_data$week - 27
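Spelling out the window arithmetic (an annotation, matching the code above):
# window_num = 8 keeps weeks 19 through 35 around the event at week 27;
# D is 0 through week 27 and 1 from week 28 on; week_offset recenters the
# series so the event week is 0, giving offsets -8 through +8.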
#break out the different types of commit actions that are studied
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#log the dependent
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
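One clarifying note on crescendo_limit (my reading of the cutoffs used here and below):
# With the cutoff at -1, weeks -8..-2 get crescendo_limit = 0 and weeks -1..0
# get 1, so the test asks whether counts rise in the last two weeks before
# publication; the -3 variant below widens that final bucket to four weeks.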
lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
crescendow_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
summary(crescendo_huh)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
summary(crescendo_huh)
library(tidyverse)
library(plyr)
#get the contrib data instead
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
contrib_df <- read_csv("../final_data/deb_contrib_did.csv")
#some preprocessing and expansion
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
contrib_df <- contrib_df[,col_order]
contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ")
contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ")
contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ")
contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
contrib_df = contrib_df[,!(names(contrib_df) %in% drop)]
# 2 some expansion needs to happen for each project
expand_timeseries <- function(project_row) {
longer <- project_row |>
pivot_longer(cols = starts_with("ct"),
names_to = "window",
values_to = "count") |>
unnest(count)
longer$observation_type <- gsub("^.*_", "", longer$window)
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
longer$count <- as.numeric(longer$count)
#longer <- longer[which(longer$observation_type == "all"),]
return(longer)
}
expanded_data <- expand_timeseries(contrib_df[1,])
for (i in 2:nrow(contrib_df)){
expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,]))
}
#filter out the windows of time that we're looking at
window_num <- 8
windowed_data <- expanded_data |>
filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
mutate(D = ifelse(week > 27, 1, 0))
#scale the age numbers and calculate the week offset here
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
windowed_data$week_offset <- windowed_data$week - 27
#break out the different types of commit actions
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#logging
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
# now for merge
mrg_actions_data$logged_count <- log(mrg_actions_data$count)
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
summary(crescendo_huh)
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
summary(crescendo_huh)
# this is the file with the lmer multi-level rddAnalysis
library(tidyverse)
library(plyr)
# 0 loading the readme data in
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../final_data/deb_readme_did.csv")
# 1 preprocessing
#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
readme_df <- readme_df[,col_order]
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
readme_df = readme_df[,!(names(readme_df) %in% drop)]
# 2 some expansion needs to happen for each project
expand_timeseries <- function(project_row) {
longer <- project_row |>
pivot_longer(cols = starts_with("ct"),
names_to = "window",
values_to = "count") |>
unnest(count)
longer$observation_type <- gsub("^.*_", "", longer$window)
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
longer$count <- as.numeric(longer$count)
#longer <- longer[which(longer$observation_type == "all"),]
return(longer)
}
expanded_data <- expand_timeseries(readme_df[1,])
for (i in 2:nrow(readme_df)){
expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,]))
}
#filter out the windows of time that we're looking at
window_num <- 8
windowed_data <- expanded_data |>
filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
mutate(D = ifelse(week > 27, 1, 0))
#scale the age numbers
windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
windowed_data$week_offset <- windowed_data$week - 27
#break out the different types of commit actions that are studied
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#log the dependent
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
summary(crescendo_huh)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
summary(crescendo_huh)
#checking crescendo of contributions before document publication
#second window
second_windowed_data <- all_actions_data |>
filter(week_offset <= 0) |>
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
#testing whether there's a correlation between count and the two weeks before the introduction
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
summary(crescendo_huh)
library(tidyverse)
library(plyr)
# script for the analysis of document readability metrics
# readability metrics will be studied controlled by their length
# gaughan@u.northwestern.edu
# loading in the data
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
head(readme_df)
readme_df <- readme_df |>
mutate(coef_grouping <- as.factor(subdir))
cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease)
readme_df <- readme_df |>
mutate(coef_grouping <- as.factor(subdir))
cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease)
cor(readme_df$coef_grouping, readme_df$flesch_reading_ease)
readme_df <- readme_df |>
mutate(coef_grouping <- as.factor(subdir))
test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df)
readme_df <- readme_df |>
mutate(coef_grouping <- as.factor(subdir))
test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df)
test_lm <- lm(flesch_reading_ease ~ subdir,data=readme_df)
summary(test_lm)
test_lm <- lm(flesch_reading_ease ~ as.factor(subdir),data=readme_df)
summary(test_lm)
head(readme_df)
test_lm <- lm(flesch_reading_ease ~ char_count + as.factor(subdir),data=readme_df)
summary(test_lm)
head(readme_df)
test_lm <- lm(linsear_write_formula ~ char_count + as.factor(subdir),data=readme_df)
summary(test_lm)
head(readme_df)
test_lm <- lm(mcalpine_eflaw ~ char_count + as.factor(subdir),data=readme_df)
summary(test_lm)
test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
summary(test_lm)
aggregate(readme_df[, 3:11], list(readme_df$subdir), mean)
aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), mean)
library(tidyverse)
library(plyr)
# script for the analysis of document readability metrics
# readability metrics will be studied controlled by their length
# gaughan@u.northwestern.edu
# loading in the data
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
head(readme_df)
aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
aggregate(readme_df[, 3:10], list(readme_df$subdir), median)
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)
rm(list=ls())
set.seed(424242)
library(readr)
library(ggplot2)
expanded_readme_data |>
  ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) +
  geom_point() + geom_jitter()
expanded_readme_data |>
  ggplot(aes(x = after_doc, y = count, col = as.factor(is_collab))) +
  geom_point() + geom_jitter()
expanded_readme_data |>
  ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(is_collab))) +
  geom_point() + geom_jitter()
#primary analysis for cross-sectional community metrics
library(tidyverse)
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)
#age_vector <- overall_data$age_of_project/365
#quantile(age_vector)
overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
#the basic stuff for the overall data
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)
#some new variables around age
#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
#table(overall_data$new.age)
#overall_data$new.age.factor <- as.factor(overall_data$new.age)
overall_data$scaled_age <- scale(overall_data$age_of_project)
#model
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
summary(mmtmodel1)
qqnorm(residuals(mmtmodel1))
summary(mmtmodel1)
octo_data$scaled_age <- scale(octo_data$age_of_project)
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
# below this is the analysis for the octo data
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
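For reference, a worked instance of the mmt formula (a sketch with hypothetical counts):
# mmt weights collaborators double: with 3 collaborators and 9 contributors,
# ((3 * 2) + 9) / (9 + 3) = 1.25; the value runs from 1 (contributors only)
# to 2 (collaborators only), and issue_mmt/wiki_mmt follow the same pattern.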
g4 <- ggplot(octo_data)
g4
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
summary(octo_mmtmodel1)
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
summary(issue_mmtmodel1)
sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + scaled_age, data=octo_data)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age, data=octo_data)
summary(wiki_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
qqnorm(residuals(wiki_mmtmodel1))
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
library(texreg) #my little "lib"
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'scaled_age', 'Wiki'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
summary(octo_mmtmodel1)
summary(wiki_mmtmodel1)
#left skewed data, need to transform
sum(is.na(octo_data$wiki_mmt))
#left skewed data, need to transform
sum(is.na(octo_data$issue_mmt))
#left skewed data, need to transform
sum(is.na(octo_data$mmt))
test_frame <- na.omit(octo_data)
#left skewed data, need to transform
sum(is.na(octo_data$issue_contrib_count))
#left skewed data, need to transform
sum(is.na(octo_data$wiki_contrib_count))
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
#left skewed data, need to transform
typeof(octo_data$wiki_contrib_count)
View(octo_data)
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$api_contrib_count + octo_data$file_contrib_count + octo_data$wiki_contrib_count)) / (octo_data$api_contrib_count + octo_data$file_contrib_count + octo_data$wiki_contrib_count + octo_data$issue_contrib_count)
sum(is.na(octo_data$issue_mmt))
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
sum(is.na(octo_data$issue_mmt))
sum(octo_data$total_contrib == 0)
#clean octo data
octo_data <- filter(octo_data, total_contrib == 0)
sum(octo_data$total_contrib == 0)
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
#clean octo data
octo_data <- filter(octo_data, total_contrib != 0)
octo_data$scaled_age <- scale(octo_data$age_of_project)
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)
#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
#right skewed data, need to transform
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
hist(octo_data$wiki_mmt)
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age, data=octo_data)
qqnorm(residuals(issue_mmtmodel1))
sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + scaled_age, data=octo_data)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age, data=octo_data)
summary(wiki_mmtmodel1)
qqnorm(residuals(wiki_mmtmodel1))
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'scaled_age', 'Wiki'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'Issue MMT', 'Wiki MMT'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
qqnorm(residuals(wiki_mmtmodel1))
View(octo_data)
#TODO: find the overlap between projects with octo data and projects with readmes or contributings
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
octo_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link))
View(octo_data)
octo_data <- octo_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
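A small illustration of the indicator construction above (hypothetical links): %in% returns a logical vector and as.numeric() recodes it to 0/1.
as.numeric(c("repoA", "repoB") %in% c("repoB", "repoC"))  # returns 0 1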
View(octo_data)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$has_readme)
cor.test(octo_data$mmt, octo_data$has_contributing)
cor.test(octo_data$mmt, octo_data$has_contrib)
issues_expansion <- lm(issue_mmt ~ has_readme + scaled_age, data=octo_data)
summary(issues_expansion)
issues_expansion <- lm(issue_mmt ~ has_contrib + scaled_age, data=octo_data)
summary(issues_expansion)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$scaled_age)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$scaled_age)
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contributing, data=octo_data)
octo_data <- octo_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contributing, data=octo_data)
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(wiki_mmtmodel1)
#getting some of the information in about whether projects have specific files
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
octo_data <- octo_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
overall_data <- overall_data |>
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(wiki_mmtmodel1)
qqnorm(residuals(wiki_mmtmodel1))
#these next three are looking at mmt as an outcome of other factors
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
summary(mmt_outcome_model)
library(texreg) #my little "lib"
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
       custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
       custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
summary(govdoc_issuesmmt)
View(octo_data)
octo_cleaned <- octo_data[octo_data$issue_mmt != NaN]
octo_cleaned <- octo_data[!is.nan(octo_data$issue_mmt),]
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
qqnorm(residuals(wiki_mmtmodel1))
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$issue_mmt)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$wiki_mmt)
#below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$has_readme)
cor.test(octo_data$has_readme, octo_data$has_contrib)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
summary(wiki_mmtmodel1)
write.csv(octo_cleaned,"cleaned_octo.csv", row.names = FALSE)
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
       custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
       custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
rm(list=ls())
set.seed(424242)
library(readr)
library(ggplot2)
library(tidyverse)
@@ -335,178 +455,58 @@ readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types =
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
#the basic stuff for the overall data
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
#model
overall_data$scaled_age <- scale(overall_data$age_of_project)
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
summary(mmtmodel1)
qqnorm(residuals(mmtmodel1))
#clean octo data
octo_data <- filter(octo_data, total_contrib != 0)
#some new variables around age
#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
#table(overall_data$new.age)
#overall_data$new.age.factor <- as.factor(overall_data$new.age)
overall_data$scaled_age <- scale(overall_data$age_of_project)
#model
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
octo_data$scaled_age <- scale(octo_data$age_of_project)
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)
#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
#right skewed data, need to transform
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
#getting some of the information in about whether projects have specific files
#find the overlap between projects with octo data and projects with readmes or contributings
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
octo_data <- octo_data |>
  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)
mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(wiki_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(issue_mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = octo_data)
summary(mmt_outcome_model)
overall_data <- overall_data |>
  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(wiki_mmtmodel1)
library(texreg) #my little "lib"
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
       custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
       custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
       use.packages=FALSE, table=FALSE, ci.force = TRUE)
#now large MMT model taking into account having contributing or README
mmtmodel2 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=overall_data)
summary(mmtmodel2)
qqnorm(residuals(mmtmodel2))
summary(mmtmodel2)
all_mmt_outcome_model <- lm(mmt ~ scaled_age + has_readme + has_contrib, data = overall_data)
summary(all_mmt_outcome_model)
#pulling in the group data for the ranef coefficients
rm_grouping <- read_csv('../051224_readme_grouped.csv',show_col_types = FALSE)
#pulling in the group data for the ranef coefficients
rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
View(contrib_grouping)
View(rm_grouping)
View(readme_did_roster)
grouped_rm <- left_join(rm_grouping, overall_data, by = c("level","upstream_vcs_link"))
rm_grouping <- rm_grouping |>
  rename(upstream_vcs_link = level)
View(rm_grouping)
grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link")
View(grouped_rm)
contrib_grouping <- contrib_grouping |>
  rename(upstream_vcs_link = level)
grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link")
View(grouped_rm)
#analyses
cor.test(grouped_rm$mmt, grouped_rm$ranef_grouping)
cor.test(grouped_contrib$mmt, grouped_contrib$ranef_grouping)
#analyses
cor.test(grouped_rm$underproduction_mean, grouped_rm$ranef_grouping)
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$ranef_grouping)
#analyses
cor.test(grouped_rm$underproduction_mean, grouped_rm$estimate)
cor.test(grouped_contrib$underproduction_mean, grouped_contrib$estimate)
View(grouped_rm)
#test with linear model
grouping_model <- lm(underproduction_mean ~ estimate + scaled_age, data=grouped_rm)
summary(grouping_model)
#test with linear model
grouping_model <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_rm)
summary(grouping_model)
#test with linear model
grouping_model <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_rm)
summary(grouping_model)
grouping_model_contrib <- lm(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
#test with linear model
grouping_model_rm <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_rm)
summary(grouping_model_rm)
grouping_model_contrib <- lm(underproduction_mean ~ estimate + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
#test with linear model
grouping_model_rm <- glm.nb(underproduction_mean ~ ranef_grouping + mmt + scaled_age, data=grouped_rm)
#pulling in the group data for the ranef coefficients
rm_grouping <- read_csv('051224_readme_grouped.csv',show_col_types = FALSE)
contrib_grouping <- read_csv('051224_contrib_grouped.csv', show_col_types = FALSE)
rm_grouping <- rm_grouping |>
rename(upstream_vcs_link = level)|>
mutate(factored_group = as.factor(ranef_grouping))
contrib_grouping <- contrib_grouping |>
rename(upstream_vcs_link = level) |>
mutate(factored_group = as.factor(ranef_grouping))
grouped_rm <- left_join(rm_grouping, overall_data, by="upstream_vcs_link")
grouped_contrib <- left_join(contrib_grouping, overall_data, by="upstream_vcs_link")
#analyses
cor.test(grouped_rm$underproduction_mean, grouped_rm$factored_group)
#test with linear model
grouping_model_rm <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_rm)
summary(grouping_model_rm)
grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
summary(grouping_model_rm)
grouping_model_contrib <- lm(underproduction_mean ~ factored_group + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
qqnorm(residuals(grouping_model_rm))
qqnorm(residuals(grouping_model_contrib))
rm_did <- read_csv('../final_data/deb_readme_did.csv',show_col_types = FALSE)
contrib_did <- read_csv('../final_data/deb_contrib_did.csv', show_col_types = FALSE)
grouped_rm <- left_join(grouped_rm, rm_did, by="upstream_vcs_link")
grouped_contrib <- left_join(grouped_contrib, contrib_did, by="upstream_vcs_link")
#calculate in terms of July 6, 2020
typeof(event_date)
#calculate in terms of July 6, 2020
typeof(grouped_rm$event_date)
#calculate in terms of July 6, 2020
typeof(as.Date(grouped_rm$event_date))
how_long_has_file <- as.Date("2020-07-06") - as.Date(grouped_rm$event_date)
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date))
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days")
#calculate in terms of July 6, 2020
grouped_rm$event_date
#calculate in terms of July 6, 2020
dates <- as.POSIXct(grouped_rm$event_date,tz="UTC")
dates
typeof(dates)
how_long_has_file <- difftime(as.Date("2020-07-06"), as.Date(grouped_rm$event_date), units = "days")
#calculate in terms of July 6, 2020
dtparts = t(as.data.frame(strsplit(grouped_rm$event_date,' ')))
#calculate in terms of July 6, 2020
dtparts = t(as.data.frame(strsplit(grouped_rm$event_date,' ')))
#calculate in terms of July 6, 2020
dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' ')))
View(dtparts)
thetimes = chron(dates=dtparts[,1],times=dtparts[,2],
format=c('y-m-d','h:m:s'))
thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))
#calculate in terms of July 6, 2020
library(chron)
dtparts = t(as.data.frame(strsplit(as.character(grouped_rm$event_date),' ')))
thetimes = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))
typeof(thetimes)
grouped_rm <- grouped_rm |>
mutate(formatted_event_time = chron(dates=dtparts[,1],times=dtparts[,2], format=c('y-m-d','h:m:s'))) |>
mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
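A base-R equivalent of the chron conversion above, as a sketch (assuming event_date strings like "2019-03-14 12:34:56"):
# event_time <- as.POSIXct(grouped_rm$event_date, tz = "UTC")
# grouped_rm$event_delta <- as.numeric(difftime(as.POSIXct("2020-07-06", tz = "UTC"),
#                                               event_time, units = "days"))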
View(grouped_rm)
#test with linear model
grouping_model_rm <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_rm)
summary(grouping_model_rm)
#now doing it for the contrib_data
contrib_dtparts = t(as.data.frame(strsplit(as.character(grouped_contrib$event_date),' ')))
grouped_contrib <- grouped_contrib |>
mutate(formatted_event_time = chron(dates=contrib_dtparts[,1],times=contrib_dtparts[,2], format=c('y-m-d','h:m:s'))) |>
mutate(event_delta = difftime(as.chron("2020-07-06"), formatted_event_time, units = "days"))
grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
summary(grouping_model_rm)
qqnorm(residuals(grouping_model_rm))
grouping_model_contrib <- lm(underproduction_mean ~ event_delta*factored_group + mmt + scaled_age, data=grouped_contrib)
summary(grouping_model_contrib)
qqnorm(residuals(grouping_model_contrib))
qqnorm(residuals(grouping_model_rm))
qqnorm(residuals(grouping_model_contrib))
issues_expansion <- lm(issue_mmt ~ as.factor(has_contrib) + scaled_age, data=octo_data)
summary(issues_expansion)
govdoc_mmt <- lm(mmt ~ as.factor(has_contrib) + scaled_age, data=octo_data)
summary(govdoc_mmt)
govdoc_mmt <- lm(mmt ~ as.factor(has_readme) + scaled_age, data=octo_data)
summary(govdoc_mmt)
govdoc_issuesmmt <- lm(issue_mmt ~ as.factor(has_readme) + scaled_age, data=octo_data)
summary(govdoc_issuesmmt)
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
summary(mmt_outcome_model)
all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = overall_data)
summary(all_mmt_outcome_model)
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
summary(govdoc_issuesmmt)


@ -10,7 +10,6 @@ overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALS
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)

@@ -61,7 +60,6 @@ overall_data <- overall_data |>
  mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
  mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
#below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
summary(octo_mmtmodel1)

@@ -74,15 +72,19 @@ wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme +
summary(wiki_mmtmodel1)
qqnorm(residuals(wiki_mmtmodel1))
#these next three are looking at mmt as an outcome of other factors
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
summary(mmt_outcome_model)
#now large MMT model taking into account having contributing or README
mmtmodel2 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=overall_data)
model_summarized <- summary(mmtmodel2)
coef <- coef(mmtmodel2)
qqnorm(residuals(mmtmodel2))
overall_data$predict <- overall_data$mmt * model_summarized$coefficients[2,1] + model_summarized$coefficients[1,1]
overall_data$SE <- predict.lm(mmtmodel2, se.fit = TRUE, level = 0.95)$se.fit
p <- ggplot(data=overall_data, aes(x=mmt, y=underproduction_mean)) +
  geom_ribbon(aes(x=mmt, ymin= predict-SE, ymax = predict+SE), fill = "green") +
  geom_abline(intercept=coef[1], slope = coef[2], color = "red") +
  theme_bw()
p
all_mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = overall_data)
summary(all_mmt_outcome_model)
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
summary(govdoc_issuesmmt)
library(texreg) #my little "lib"
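One caution on the prediction block above (my observation, not a change to the diff): mmtmodel2 also contains scaled_age, has_readme, and has_contrib, so the hand-built predict column tracks the intercept and mmt term only; predict.lm() returns fitted values for the full model.
# Full-model alternative (sketch):
# pred <- predict.lm(mmtmodel2, se.fit = TRUE)
# overall_data$fitted_full <- pred$fit
# overall_data$SE_full <- pred$se.fit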


@@ -8,9 +8,9 @@ try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
head(readme_df)
aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
aggregate(readme_df[, 3:10], list(readme_df$subdir), median)
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), mean)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)

BIN R/draft_large_lm_plot.png (new file, 51 KiB; binary file not shown)

File diff suppressed because it is too large

File diff suppressed because it is too large