513 lines
28 KiB
R
513 lines
28 KiB
R
mutate(crescendo_limit = ifelse(week_offset < (-4), 0, 1))|>
|
|
cor.test(crescendo_limit, count)
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- windowed_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-2), 0, 1))
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- windowed_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
|
|
#testing whether there's a correlation between count and the crescendo_limit indicator
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
library(tidyverse)
|
|
library(plyr)
|
|
#get the contrib data instead
|
|
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
|
contrib_df <- read_csv("../final_data/deb_contrib_did.csv")
|
|
#some preprocessing and expansion
|
|
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
|
contrib_df <- contrib_df[,col_order]
|
|
contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ")
|
|
contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ")
|
|
contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ")
|
|
contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ")
|
|
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
|
contrib_df = contrib_df[,!(names(contrib_df) %in% drop)]
|
|
# 2 some expansion needs to happen for each project
|
|
# Expand one project's row into a long, per-week time series.
#
# project_row: a single-row data frame whose columns starting with "ct"
#   are list-columns of counts.
# Returns a data frame with one row per unnested count, plus
#   `observation_type` (the text after the last "_" in the window name),
#   a zero-based `week` index within each observation type, and a numeric
#   `count`.
expand_timeseries <- function(project_row) {
  stacked <- pivot_longer(
    project_row,
    cols = starts_with("ct"),
    names_to = "window",
    values_to = "count"
  )
  stacked <- unnest(stacked, count)
  # keep only the text after the final underscore, e.g. "ct_before_all" -> "all"
  stacked$observation_type <- gsub("^.*_", "", stacked$window)
  # number the rows of each observation type 0, 1, 2, ... in their current order
  stacked <- ddply(
    stacked,
    "observation_type",
    transform,
    week = seq_len(length(observation_type)) - 1
  )
  stacked$count <- as.numeric(stacked$count)
  stacked
}
|
|
# Expand every project row and stack the results into one data frame.
# seq_len() keeps this correct for a one-row input (2:nrow() would count down
# from 2 to 1), and binding once avoids re-copying the result on every pass.
expanded_data <- do.call(
  rbind,
  lapply(
    seq_len(nrow(contrib_df)),
    function(i) expand_timeseries(contrib_df[i, ])
  )
)
|
|
#filter out the windows of time that we're looking at
window_num <- 8
# week index that the window and every offset below are measured from
# NOTE(review): presumably the week the document was introduced - confirm
event_week <- 27
windowed_data <- expanded_data |>
  filter(week >= (event_week - window_num) & week <= (event_week + window_num)) |>
  mutate(D = ifelse(week > event_week, 1, 0))
#scale the age numbers and calculate the week offset here
# scale() returns a one-column matrix; as.numeric() stores a plain vector
windowed_data$scaled_project_age <- as.numeric(scale(windowed_data$age_of_project))
windowed_data$week_offset <- windowed_data$week - event_week
#break out the different type of commit actions
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#logging
# log() of a zero count is -Inf; log1p_count is the zero-safe alternative
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
# now for merge
mrg_actions_data$logged_count <- log(mrg_actions_data$count)
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
|
|
#testing whether there's a correlation between count and the two weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
|
|
#testing whether there's a correlation between count and the four weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
|
|
#testing whether there's a correlation between count and the two weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
# this is the file with the lmer multi-level rddAnalysis
|
|
library(tidyverse)
|
|
library(plyr)
|
|
# 0 loading the readme data in
|
|
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
|
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
|
# 1 preprocessing
|
|
#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
|
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
|
readme_df <- readme_df[,col_order]
|
|
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
|
|
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
|
|
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
|
|
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
|
|
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
|
readme_df = readme_df[,!(names(readme_df) %in% drop)]
|
|
# 2 some expansion needs to happen for each project
|
|
# Expand one project's row into a long, per-week time series.
#
# project_row: a single-row data frame whose columns starting with "ct"
#   are list-columns of counts.
# Returns a data frame with one row per unnested count, plus
#   `observation_type` (the text after the last "_" in the window name),
#   a zero-based `week` index within each observation type, and a numeric
#   `count`.
expand_timeseries <- function(project_row) {
  stacked <- pivot_longer(
    project_row,
    cols = starts_with("ct"),
    names_to = "window",
    values_to = "count"
  )
  stacked <- unnest(stacked, count)
  # keep only the text after the final underscore, e.g. "ct_before_all" -> "all"
  stacked$observation_type <- gsub("^.*_", "", stacked$window)
  # number the rows of each observation type 0, 1, 2, ... in their current order
  stacked <- ddply(
    stacked,
    "observation_type",
    transform,
    week = seq_len(length(observation_type)) - 1
  )
  stacked$count <- as.numeric(stacked$count)
  stacked
}
|
|
# Expand every project row and stack the results into one data frame.
# seq_len() keeps this correct for a one-row input (2:nrow() would count down
# from 2 to 1), and binding once avoids re-copying the result on every pass.
expanded_data <- do.call(
  rbind,
  lapply(
    seq_len(nrow(readme_df)),
    function(i) expand_timeseries(readme_df[i, ])
  )
)
|
|
#filter out the windows of time that we're looking at
window_num <- 8
# week index that the window and every offset below are measured from
# NOTE(review): presumably the week the document was introduced - confirm
event_week <- 27
windowed_data <- expanded_data |>
  filter(week >= (event_week - window_num) & week <= (event_week + window_num)) |>
  mutate(D = ifelse(week > event_week, 1, 0))
#scale the age numbers
# scale() returns a one-column matrix; as.numeric() stores a plain vector
windowed_data$scaled_project_age <- as.numeric(scale(windowed_data$age_of_project))
windowed_data$week_offset <- windowed_data$week - event_week
#break out the different types of commit actions that are studied
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#log the dependent
# log() of a zero count is -Inf; log1p_count is the zero-safe alternative
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
|
|
#testing whether there's a correlation between count and the two weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
|
|
crescendow_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
|
|
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
|
|
summary(crescendo_huh)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
|
|
#testing whether there's a correlation between count and the four weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
|
|
summary(crescendo_huh)
|
|
library(tidyverse)
|
|
library(plyr)
|
|
#get the contrib data instead
|
|
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
|
contrib_df <- read_csv("../final_data/deb_contrib_did.csv")
|
|
#some preprocessing and expansion
|
|
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
|
contrib_df <- contrib_df[,col_order]
|
|
contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ")
|
|
contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ")
|
|
contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ")
|
|
contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ")
|
|
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
|
contrib_df = contrib_df[,!(names(contrib_df) %in% drop)]
|
|
# 2 some expansion needs to happen for each project
|
|
# Expand one project's row into a long, per-week time series.
#
# project_row: a single-row data frame whose columns starting with "ct"
#   are list-columns of counts.
# Returns a data frame with one row per unnested count, plus
#   `observation_type` (the text after the last "_" in the window name),
#   a zero-based `week` index within each observation type, and a numeric
#   `count`.
expand_timeseries <- function(project_row) {
  stacked <- pivot_longer(
    project_row,
    cols = starts_with("ct"),
    names_to = "window",
    values_to = "count"
  )
  stacked <- unnest(stacked, count)
  # keep only the text after the final underscore, e.g. "ct_before_all" -> "all"
  stacked$observation_type <- gsub("^.*_", "", stacked$window)
  # number the rows of each observation type 0, 1, 2, ... in their current order
  stacked <- ddply(
    stacked,
    "observation_type",
    transform,
    week = seq_len(length(observation_type)) - 1
  )
  stacked$count <- as.numeric(stacked$count)
  stacked
}
|
|
# Expand every project row and stack the results into one data frame.
# seq_len() keeps this correct for a one-row input (2:nrow() would count down
# from 2 to 1), and binding once avoids re-copying the result on every pass.
expanded_data <- do.call(
  rbind,
  lapply(
    seq_len(nrow(contrib_df)),
    function(i) expand_timeseries(contrib_df[i, ])
  )
)
|
|
#filter out the windows of time that we're looking at
window_num <- 8
# week index that the window and every offset below are measured from
# NOTE(review): presumably the week the document was introduced - confirm
event_week <- 27
windowed_data <- expanded_data |>
  filter(week >= (event_week - window_num) & week <= (event_week + window_num)) |>
  mutate(D = ifelse(week > event_week, 1, 0))
#scale the age numbers and calculate the week offset here
# scale() returns a one-column matrix; as.numeric() stores a plain vector
windowed_data$scaled_project_age <- as.numeric(scale(windowed_data$age_of_project))
windowed_data$week_offset <- windowed_data$week - event_week
#break out the different type of commit actions
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#logging
# log() of a zero count is -Inf; log1p_count is the zero-safe alternative
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
# now for merge
mrg_actions_data$logged_count <- log(mrg_actions_data$count)
mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
|
|
#testing whether there's a correlation between count and the two weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
|
|
summary(crescendo_huh)
|
|
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
|
|
summary(crescendo_huh)
|
|
# this is the file with the lmer multi-level rddAnalysis
|
|
library(tidyverse)
|
|
library(plyr)
|
|
# 0 loading the readme data in
|
|
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
|
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
|
# 1 preprocessing
|
|
#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
|
col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
|
readme_df <- readme_df[,col_order]
|
|
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
|
|
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
|
|
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
|
|
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
|
|
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
|
readme_df = readme_df[,!(names(readme_df) %in% drop)]
|
|
# 2 some expansion needs to happen for each project
|
|
# Expand one project's row into a long, per-week time series.
#
# project_row: a single-row data frame whose columns starting with "ct"
#   are list-columns of counts.
# Returns a data frame with one row per unnested count, plus
#   `observation_type` (the text after the last "_" in the window name),
#   a zero-based `week` index within each observation type, and a numeric
#   `count`.
expand_timeseries <- function(project_row) {
  stacked <- pivot_longer(
    project_row,
    cols = starts_with("ct"),
    names_to = "window",
    values_to = "count"
  )
  stacked <- unnest(stacked, count)
  # keep only the text after the final underscore, e.g. "ct_before_all" -> "all"
  stacked$observation_type <- gsub("^.*_", "", stacked$window)
  # number the rows of each observation type 0, 1, 2, ... in their current order
  stacked <- ddply(
    stacked,
    "observation_type",
    transform,
    week = seq_len(length(observation_type)) - 1
  )
  stacked$count <- as.numeric(stacked$count)
  stacked
}
|
|
# Expand every project row and stack the results into one data frame.
# seq_len() keeps this correct for a one-row input (2:nrow() would count down
# from 2 to 1), and binding once avoids re-copying the result on every pass.
expanded_data <- do.call(
  rbind,
  lapply(
    seq_len(nrow(readme_df)),
    function(i) expand_timeseries(readme_df[i, ])
  )
)
|
|
#filter out the windows of time that we're looking at
window_num <- 8
# week index that the window and every offset below are measured from
# NOTE(review): presumably the week the document was introduced - confirm
event_week <- 27
windowed_data <- expanded_data |>
  filter(week >= (event_week - window_num) & week <= (event_week + window_num)) |>
  mutate(D = ifelse(week > event_week, 1, 0))
#scale the age numbers
# scale() returns a one-column matrix; as.numeric() stores a plain vector
windowed_data$scaled_project_age <- as.numeric(scale(windowed_data$age_of_project))
windowed_data$week_offset <- windowed_data$week - event_week
#break out the different types of commit actions that are studied
all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
#log the dependent
# log() of a zero count is -Inf; log1p_count is the zero-safe alternative
all_actions_data$logged_count <- log(all_actions_data$count)
all_actions_data$log1p_count <- log1p(all_actions_data$count)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
|
|
#testing whether there's a correlation between count and the two weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
|
|
summary(crescendo_huh)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
|
|
#testing whether there's a correlation between count and the four weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
|
|
summary(crescendo_huh)
|
|
#checking crescendo of contributions before document publication
|
|
#second window
|
|
second_windowed_data <- all_actions_data |>
|
|
filter(week_offset <= 0) |>
|
|
mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
|
|
#testing whether there's a correlation between count and the two weeks before the introduction
|
|
cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
|
|
crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
|
|
summary(crescendo_huh)
|
|
library(tidyverse)
|
|
library(plyr)
|
|
# script for the analysis of document readability metrics
|
|
# readability metrics will be studied controlled by their length
|
|
# gaughan@u.northwestern.edu
|
|
# loading in the data
|
|
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
|
readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
|
|
contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
|
|
head(readme_df)
|
|
# `=` names the new column; with `<-` inside mutate() no column called
# coef_grouping is created
readme_df <- readme_df |>
  mutate(coef_grouping = as.factor(subdir))
|
|
cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease)
|
|
# `=` names the new column; with `<-` inside mutate() no column called
# coef_grouping is created
readme_df <- readme_df |>
  mutate(coef_grouping = as.factor(subdir))
|
|
cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease)
|
|
cor(readme_df$coef_grouping, readme_df$flesch_reading_ease)
|
|
# `=` names the new column; with `<-` inside mutate() no column called
# coef_grouping is created
readme_df <- readme_df |>
  mutate(coef_grouping = as.factor(subdir))
|
|
test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df)
|
|
# `=` names the new column; with `<-` inside mutate() no column called
# coef_grouping is created
readme_df <- readme_df |>
  mutate(coef_grouping = as.factor(subdir))
|
|
test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df)
|
|
test_lm <- lm(flesch_reading_ease ~ subdir,data=readme_df)
|
|
summary(test_lm)
|
|
test_lm <- lm(flesch_reading_ease ~ as.factor(subdir),data=readme_df)
|
|
summary(test_lm)
|
|
head(readme_df)
|
|
test_lm <- lm(flesch_reading_ease ~ char_count + as.factor(subdir),data=readme_df)
|
|
summary(test_lm)
|
|
head(readme_df)
|
|
test_lm <- lm(linsear_write_formula ~ char_count + as.factor(subdir),data=readme_df)
|
|
summary(test_lm)
|
|
head(readme_df)
|
|
test_lm <- lm(mcalpine_eflaw ~ char_count + as.factor(subdir),data=readme_df)
|
|
summary(test_lm)
|
|
test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
|
|
summary(test_lm)
|
|
aggregate(readme_df[, 3:11], list(readme_df$subdir), mean)
|
|
aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
|
|
#readme_df <- readme_df |>
|
|
# mutate(coef_grouping <- as.factor(subdir))
|
|
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
|
|
#summary(test_lm)
|
|
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), mean)
|
|
library(tidyverse)
|
|
library(plyr)
|
|
# script for the analysis of document readability metrics
|
|
# readability metrics will be studied controlled by their length
|
|
# gaughan@u.northwestern.edu
|
|
# loading in the data
|
|
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
|
readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
|
|
contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
|
|
head(readme_df)
|
|
aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
|
|
aggregate(readme_df[, 3:10], list(readme_df$subdir), median)
|
|
#readme_df <- readme_df |>
|
|
# mutate(coef_grouping <- as.factor(subdir))
|
|
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
|
|
#summary(test_lm)
|
|
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)
|
|
rm(list=ls())
|
|
set.seed(424242)
|
|
library(readr)
|
|
library(ggplot2)
|
|
library(tidyverse)
|
|
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
|
|
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
|
|
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
|
|
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
|
|
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
|
|
mean(overall_data$mmt)
|
|
hist(overall_data$mmt, probability = TRUE)
|
|
#the basic stuff for the overall data
|
|
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
|
|
mean(overall_data$mmt)
|
|
hist(overall_data$mmt, probability = TRUE)
|
|
#some new variables around age
|
|
#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
|
#table(overall_data$new.age)
|
|
#overall_data$new.age.factor <- as.factor(overall_data$new.age)
|
|
overall_data$scaled_age <- scale(overall_data$age_of_project)
|
|
#model
|
|
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
|
|
summary(mmtmodel1)
|
|
qqnorm(residuals(mmtmodel1))
|
|
# below this is the analysis for the octo data
|
|
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
|
table(octo_data$new.age)
|
|
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
|
octo_data$scaled_age <- scale(octo_data$age_of_project)
|
|
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
|
mean(octo_data$mmt)
|
|
hist(octo_data$mmt)
|
|
head(octo_data)
|
|
#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
|
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
|
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
|
|
#right skewed data, need to transform
|
|
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
|
hist(octo_data$wiki_mmt)
|
|
#below are the models for the octo data, there should be analysis for each one
|
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(octo_mmtmodel1)
|
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(issue_mmtmodel1)
|
|
qqnorm(residuals(issue_mmtmodel1))
|
|
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(wiki_mmtmodel1)
|
|
#getting some of the information in about whether projects have specific files
|
|
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
|
|
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
|
|
octo_data <- octo_data |>
|
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
|
overall_data <- overall_data |>
|
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
|
#below are the models for the octo data, there should be analysis for each one
|
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(octo_mmtmodel1)
|
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(issue_mmtmodel1)
|
|
qqnorm(residuals(issue_mmtmodel1))
|
|
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(wiki_mmtmodel1)
|
|
qqnorm(residuals(wiki_mmtmodel1))
|
|
#these next three are looking at mmt as an outcome of other factors
|
|
mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
|
|
summary(mmt_outcome_model)
|
|
library(texreg) #my little "lib"
|
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
|
|
custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
|
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
|
govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
|
|
summary(govdoc_issuesmmt)
|
|
View(octo_data)
|
|
# keep only rows whose issue_mmt is not NaN; `!= NaN` always yields NA, so
# NaN has to be detected with is.nan(), and the trailing comma selects rows
octo_cleaned <- octo_data[!is.nan(octo_data$issue_mmt), ]
|
|
#below are the models for the octo data, there should be analysis for each one
|
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
|
|
summary(octo_mmtmodel1)
|
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
|
|
summary(issue_mmtmodel1)
|
|
qqnorm(residuals(issue_mmtmodel1))
|
|
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
|
|
summary(wiki_mmtmodel1)
|
|
write.csv(octo_cleaned,"cleaned_octo.csv", row.names = FALSE)
|
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
|
|
custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
|
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
|
rm(list=ls())
|
|
set.seed(424242)
|
|
library(readr)
|
|
library(ggplot2)
|
|
library(tidyverse)
|
|
#primary analysis for cross-sectional community metrics
|
|
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
|
|
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
|
|
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
|
|
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
|
|
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
|
|
mean(overall_data$mmt)
|
|
#the basic stuff for the overall data
|
|
overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
|
|
mean(overall_data$mmt)
|
|
hist(overall_data$mmt, probability = TRUE)
|
|
#model
# scaled_age has to exist before the model formula refers to it
overall_data$scaled_age <- scale(overall_data$age_of_project)
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
summary(mmtmodel1)
qqnorm(residuals(mmtmodel1))
|
|
#clean octo data
|
|
octo_data <- filter(octo_data, total_contrib != 0)
|
|
#some new variables around age
|
|
#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
|
#table(overall_data$new.age)
|
|
#overall_data$new.age.factor <- as.factor(overall_data$new.age)
|
|
overall_data$scaled_age <- scale(overall_data$age_of_project)
|
|
#model
|
|
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
|
|
# new.age bins age_of_project / 365 into four intervals (coded 1-4); it must be
# created before it is tabulated or turned into a factor
# NOTE(review): the division by 365 presumably converts days to years - confirm
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
|
octo_data$scaled_age <- scale(octo_data$age_of_project)
|
|
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
|
mean(octo_data$mmt)
|
|
hist(octo_data$mmt)
|
|
head(octo_data)
|
|
#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
|
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
|
#right skewed data, need to transform
|
|
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
|
#getting some of the information in about whether projects have specific files
|
|
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
|
|
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
|
|
octo_data <- octo_data |>
|
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
|
overall_data <- overall_data |>
|
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
|
#below are the models for the octo data, there should be analysis for each one
|
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(octo_mmtmodel1)
|
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(issue_mmtmodel1)
|
|
qqnorm(residuals(issue_mmtmodel1))
|
|
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
|
summary(wiki_mmtmodel1)
|
|
library(texreg) #my little "lib"
|
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
|
|
custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
|
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
|
#now large MMT model taking into account having contributing or README
|
|
mmtmodel2 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=overall_data)
|
|
summary(mmtmodel2)
|
|
qqnorm(residuals(mmtmodel2))
|
|
summary(mmtmodel2)
|