diff --git a/.DS_Store b/.DS_Store index 5008ddf..eba068e 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/R/.Rhistory b/R/.Rhistory index 18bfad0..c890d26 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,512 +1,512 @@ -mutate(crescendo_limit = ifelse(week_offset < (-4), 0, 1))|> -cor.test(crescendo_limit, count) -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- windowed_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-2), 0, 1)) -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- windowed_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1)) -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1)) -#testing whether there's a correlation between count and the presce -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -library(tidyverse) -library(plyr) -#get the contrib data instead -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -contrib_df <- read_csv("../final_data/deb_contrib_did.csv") -#some preprocessing and expansion -col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -contrib_df <- contrib_df[,col_order] -contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ") -contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ") -contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ") -contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -contrib_df = contrib_df[,!(names(contrib_df) %in% drop)] -# 2 some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -longer$count <- as.numeric(longer$count) -#longer <- longer[which(longer$observation_type == "all"),] -return(longer) -} -expanded_data <- expand_timeseries(contrib_df[1,]) -for (i in 2:nrow(contrib_df)){ -expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,])) -} -#filter out the windows of time that we're looking at -window_num <- 8 -windowed_data <- expanded_data |> -filter(week >= (27 - window_num) & week <= (27 + window_num)) |> -mutate(D = ifelse(week > 27, 1, 0)) -#scale the age numbers and calculate the week offset here -windowed_data$scaled_project_age <- scale(windowed_data$age_of_project) -windowed_data$week_offset <- windowed_data$week - 27 -#break out the different type of commit actions -all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] -mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),] -#logging -all_actions_data$logged_count <- log(all_actions_data$count) -all_actions_data$log1p_count <- log1p(all_actions_data$count) -# now for merge -mrg_actions_data$logged_count <- log(mrg_actions_data$count) -mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -# this is the file with the lmer multi-level rddAnalysis -library(tidyverse) -library(plyr) -# 0 loading the readme data in -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -# 1 preprocessing -#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") -readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") -readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") -readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -readme_df = readme_df[,!(names(readme_df) %in% drop)] -# 2 some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -longer$count <- as.numeric(longer$count) -#longer <- longer[which(longer$observation_type == "all"),] -return(longer) -} -expanded_data <- expand_timeseries(readme_df[1,]) -for (i in 2:nrow(readme_df)){ -expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,])) -} -#filter out the windows of time that we're looking at -window_num <- 8 -windowed_data <- expanded_data |> -filter(week >= (27 - window_num) & week <= (27 + window_num)) |> -mutate(D = ifelse(week > 27, 1, 0)) -#scale the age numbers -windowed_data$scaled_project_age <- scale(windowed_data$age_of_project) -windowed_data$week_offset <- windowed_data$week - 27 -#break out the different types of commit actions that are studied -all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] -mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),] -#log the dependent -all_actions_data$logged_count <- log(all_actions_data$count) -all_actions_data$log1p_count <- log1p(all_actions_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -lm(count ~ crescendo_limit + week_offset, data = second_windowed_data) -crescendow_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data) -crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data) -summary(crescendo_huh) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data) -summary(crescendo_huh) -library(tidyverse) -library(plyr) -#get the contrib data instead -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -contrib_df <- read_csv("../final_data/deb_contrib_did.csv") -#some preprocessing and expansion -col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -contrib_df <- contrib_df[,col_order] -contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ") -contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ") -contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ") -contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -contrib_df = contrib_df[,!(names(contrib_df) %in% drop)] -# 2 some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -longer$count <- as.numeric(longer$count) -#longer <- longer[which(longer$observation_type == "all"),] -return(longer) -} -expanded_data <- expand_timeseries(contrib_df[1,]) -for (i in 2:nrow(contrib_df)){ -expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,])) -} -#filter out the windows of time that we're looking at -window_num <- 8 -windowed_data <- expanded_data |> -filter(week >= (27 - window_num) & week <= (27 + window_num)) |> -mutate(D = ifelse(week > 27, 1, 0)) -#scale the age numbers and calculate the week offset here -windowed_data$scaled_project_age <- scale(windowed_data$age_of_project) -windowed_data$week_offset <- windowed_data$week - 27 -#break out the different type of commit actions -all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] -mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),] -#logging -all_actions_data$logged_count <- log(all_actions_data$count) -all_actions_data$log1p_count <- log1p(all_actions_data$count) -# now for merge -mrg_actions_data$logged_count <- log(mrg_actions_data$count) -mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data) -summary(crescendo_huh) -crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data) -summary(crescendo_huh) -# this is the file with the lmer multi-level rddAnalysis -library(tidyverse) -library(plyr) -# 0 loading the readme data in -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -# 1 preprocessing -#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") -readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") -readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") -readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -readme_df = readme_df[,!(names(readme_df) %in% drop)] -# 2 some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -longer$count <- as.numeric(longer$count) -#longer <- longer[which(longer$observation_type == "all"),] -return(longer) -} -expanded_data <- expand_timeseries(readme_df[1,]) -for (i in 2:nrow(readme_df)){ -expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,])) -} -#filter out the windows of time that we're looking at -window_num <- 8 -windowed_data <- expanded_data |> -filter(week >= (27 - window_num) & week <= (27 + window_num)) |> -mutate(D = ifelse(week > 27, 1, 0)) -#scale the age numbers -windowed_data$scaled_project_age <- scale(windowed_data$age_of_project) -windowed_data$week_offset <- windowed_data$week - 27 -#break out the different types of commit actions that are studied -all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] -mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),] -#log the dependent -all_actions_data$logged_count <- log(all_actions_data$count) -all_actions_data$log1p_count <- log1p(all_actions_data$count) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data) -summary(crescendo_huh) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data) -summary(crescendo_huh) -#checking crescendo of contributions before document publication -#second window -second_windowed_data <- all_actions_data |> -filter(week_offset <= 0) |> -mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1)) -#testing whether there's a correlation between count and the two weeks before the introduction -cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count) -crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data) -summary(crescendo_huh) -library(tidyverse) -library(plyr) -# script for the analysis of document readability metrics -# readability metrics will be studied controlled by their length -# gaughan@u.northwestern.edu -# loading in the data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../text_analysis/draft_readability_readme.csv") -contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv") +ggtitle("Posterior Predictive Density", subtitle="Non-Democracies") + +theme_bw() +p +#plot of reading_ease +p <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme_bw() +p +p <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme_bw() +p +p <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme_bw() +p +p <- ggplot(contributing_df, aes(x=linsear_write, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme_bw() +p head(readme_df) -readme_df <- readme_df |> -mutate(coef_grouping <- as.factor(subdir)) -cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease) -readme_df <- readme_df |> -mutate(coef_grouping <- as.factor(subdir)) -cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease) -cor(readme_df$coef_grouping, readme_df$flesch_reading_ease) -readme_df <- readme_df |> -mutate(coef_grouping <- as.factor(subdir)) -test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df) -readme_df <- readme_df |> -mutate(coef_grouping <- as.factor(subdir)) -test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df) -test_lm <- lm(flesch_reading_ease ~ subdir,data=readme_df) -summary(test_lm) -test_lm <- lm(flesch_reading_ease ~ as.factor(subdir),data=readme_df) -summary(test_lm) -head(readme_df) -test_lm <- lm(flesch_reading_ease ~ char_count + as.factor(subdir),data=readme_df) -summary(test_lm) -head(readme_df) -test_lm <- lm(linsear_write_formula ~ char_count + as.factor(subdir),data=readme_df) -summary(test_lm) -head(readme_df) -test_lm <- lm(mcalpine_eflaw ~ char_count + as.factor(subdir),data=readme_df) -summary(test_lm) -test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df) -summary(test_lm) -aggregate(readme_df[, 3:11], list(readme_df$subdir), mean) -aggregate(readme_df[, 3:10], list(readme_df$subdir), mean) -#readme_df <- readme_df |> -# mutate(coef_grouping <- as.factor(subdir)) -#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df) -#summary(test_lm) -aggregate(contributing_df[, 3:10], list(contributing_df$subdir), mean) -library(tidyverse) -library(plyr) -# script for the analysis of document readability metrics -# readability metrics will be studied controlled by their length -# gaughan@u.northwestern.edu -# loading in the data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../text_analysis/draft_readability_readme.csv") -contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv") -head(readme_df) -aggregate(readme_df[, 3:10], list(readme_df$subdir), mean) -aggregate(readme_df[, 3:10], list(readme_df$subdir), median) +p0 +p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme_bw() +p0 +p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-5, 30) + +theme_bw() +p0 +p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 30) + +theme_bw() +p0 +p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +p0 +p0 <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +p0 +p0 <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-300, 300) + +theme_bw() +p0 +p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +p0 +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +p0 +readme_linsear_plot +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_y_continuous(breaks = seq(0,10,1),labels = paste(seq(0, 10, by = 1), "%", sep = ""))+ +xlim(-30, 30) + +theme_bw() +readme_linsear_plot +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +readme_linsear_plot +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes(y = (..count..)/sum(..count..)), color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes(y = (..count..)/sum(..count..), color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +readme_linsear_plot +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +readme_linsear_plot +y = (..count..)/sum(..count..), +y = (..count..)/sum(..count..), +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes(y = (..count..)/sum(..count..), color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +readme_linsear_plot +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="dodge") + +xlim(-30, 30) + +theme_bw() +readme_linsear_plot +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +readme_linsear_plot +contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_histogram(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +contributing_linsear_plot +contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +contributing_linsear_plot +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +readme_linsear_plot +contributing_linsear_plot +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +contributing_reading_time_plot +contributing_reading_time_plot +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw() +contributing_reading_time_plot +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 50) + +theme_bw() +contributing_reading_time_plot +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 70) + +theme_bw() +contributing_reading_time_plot +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 80) + +theme_bw() +contributing_reading_time_plot +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw() +contributing_reading_time_plot +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw() +contributing_mcalpine_eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 70) + +theme_bw() +contributing_mcalpine_eflaw +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 70) + +theme_bw() +contributing_reading_ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw() +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) +library(gridExtra) +grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) +# plotting contributing linsear writing formula +contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +theme_bw(legend.position="none") +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw(legend.position="none") +# plotting contributing mcalpine eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 70) + +theme_bw(legend.position="none") +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme(legend.position = "top") + +xlim(-10, 90) + +theme_bw() +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) +# plotting contributing mcalpine eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 70) + +theme(legend.position="none")+ +theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme(legend.position = "top") + +xlim(-10, 90) + +theme_bw() +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) +# plotting contributing mcalpine eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 70) + +guides(fill="none")+ +theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme(legend.position = "top") + +xlim(-10, 90) + +theme_bw() +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) +# plotting contributing mcalpine eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 70) + +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +theme(legend.position = "top") + +xlim(-10, 90) + +theme_bw() +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) +# plotting contributing linsear writing formula +contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-30, 30) + +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing mcalpine eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 70) + +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw(legend.position = "top") +contributing_reading_ease +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw(legend.position = "left") +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme(legend.position = "left") +contributing_reading_ease +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme(legend.position = "left") + +theme_bw() +contributing_reading_ease +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw(legend.position = "left") +contributing_reading_ease +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +opts(legend.position = "left") + +theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme(legend.position = "left") + +theme_bw() +contributing_reading_ease +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw() + +theme(legend.position = "left") +contributing_reading_ease +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw() + +theme(legend.position = "top") +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw() + +theme(legend.position = "top") +readme_reading_ease +#plotting readme reading ease +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +theme_bw() + +theme(legend.position = "top") +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +guides(fill="none", color="none")+ +theme_bw() +grid.arrange(readme_reading_ease, readme_reading_time, contributing_reading_ease, contributing_reading_time_plot, nrow = 2) +grid.arrange(readme_reading_ease, readme_reading_time_plot, contributing_reading_ease, contributing_reading_time_plot, nrow = 2) +grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) +# establishing the color scheme +irisColors <- +setNames( c('red', 'forestgreen', 'blue') +, levels(contributing_df$subdir) ) +# establishing the color scheme +subdirColors <- +setNames( c('red', 'forestgreen', 'blue') +, levels(contributing_df$subdir) ) +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +theme_bw() + +theme(legend.position = "top") +contributing_reading_ease +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +guides(fill="none", color="none")+ +theme_bw() +#plot of reading_ease #readme_df <- readme_df |> # mutate(coef_grouping <- as.factor(subdir)) #test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df) #summary(test_lm) aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median) -rm(list=ls()) -set.seed(424242) -library(readr) -library(ggplot2) -library(tidyverse) -overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE) -overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE) -octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) -readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE) -overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) -mean(overall_data$mmt) -hist(overall_data$mmt, probability = TRUE) -#the basic stuff for the overall data -overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) -mean(overall_data$mmt) -hist(overall_data$mmt, probability = TRUE) -#some new variables around age -#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -#table(overall_data$new.age) -#overall_data$new.age.factor <- as.factor(overall_data$new.age) -overall_data$scaled_age <- scale(overall_data$age_of_project) -#model -mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data) -summary(mmtmodel1) -qqnorm(residuals(mmtmodel1)) -# below this is the analysis for the octo data -octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(octo_data$new.age) -octo_data$new.age.factor <- as.factor(octo_data$new.age) -octo_data$scaled_age <- scale(octo_data$age_of_project) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(octo_data$mmt) -hist(octo_data$mmt) -head(octo_data) -#getting the mmt-equivalent for both issue activity as well as wiki contrib activity -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt) -#right skewed data, need to transform -octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$wiki_mmt) -#below are the models for the octo data, there should be analysis for each one -octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(octo_mmtmodel1) -issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(issue_mmtmodel1) -qqnorm(residuals(issue_mmtmodel1)) -wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(wiki_mmtmodel1) -#getting some of the information in about whether projects have specific files -readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE) -contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE) -octo_data <- octo_data |> -mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> -mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) -overall_data <- overall_data |> -mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> -mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) -#below are the models for the octo data, there should be analysis for each one -octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(octo_mmtmodel1) -issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(issue_mmtmodel1) -qqnorm(residuals(issue_mmtmodel1)) -wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(wiki_mmtmodel1) -qqnorm(residuals(wiki_mmtmodel1)) -#these next three are looking at mmt as an outcome of other factors -mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data) -summary(mmt_outcome_model) -library(texreg) #my little "lib" -texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, -custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ), -custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'), -use.packages=FALSE, table=FALSE, ci.force = TRUE) -govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data) -summary(govdoc_issuesmmt) -View(octo_data) -octo_cleaned <- octo_data[octo_data$issue_mmt != NaN] -octo_cleaned <- octo_data[!is.nan(octo_data$issue_mmt),] -#below are the models for the octo data, there should be analysis for each one -octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned) -summary(octo_mmtmodel1) -issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned) -summary(issue_mmtmodel1) -qqnorm(residuals(issue_mmtmodel1)) -wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned) -summary(wiki_mmtmodel1) -write.csv(octo_cleaned,"cleaned_octo.csv", row.names = FALSE) -texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, -custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ), -custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'), -use.packages=FALSE, table=FALSE, ci.force = TRUE) -rm(list=ls()) -set.seed(424242) -library(readr) -library(ggplot2) -library(tidyverse) -#primary analysis for cross-sectional community metrics -overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE) -octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) -readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE) -contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE) -overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) -mean(overall_data$mmt) -#the basic stuff for the overall data -overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) -mean(overall_data$mmt) -hist(overall_data$mmt, probability = TRUE) -#model -mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data) -summary(mmtmodel1) -qqnorm(residuals(mmtmodel1)) -#clean octo data -octo_data <- filter(octo_data, total_contrib != 0) -#some new variables around age -#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -#table(overall_data$new.age) -#overall_data$new.age.factor <- as.factor(overall_data$new.age) -overall_data$scaled_age <- scale(overall_data$age_of_project) -#model -mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data) -table(octo_data$new.age) -octo_data$new.age.factor <- as.factor(octo_data$new.age) -octo_data$scaled_age <- scale(octo_data$age_of_project) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(octo_data$mmt) -hist(octo_data$mmt) -head(octo_data) -#getting the mmt-equivalent for both issue activity as well as wiki contrib activity -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -#right skewed data, need to transform -octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) -#getting some of the information in about whether projects have specific files -readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE) -contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE) -octo_data <- octo_data |> -mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> -mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) -overall_data <- overall_data |> -mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |> -mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link)) -#below are the models for the octo data, there should be analysis for each one -octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(octo_mmtmodel1) -issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(issue_mmtmodel1) -qqnorm(residuals(issue_mmtmodel1)) -wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data) -summary(wiki_mmtmodel1) -library(texreg) #my little "lib" -texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, -custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ), -custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'), -use.packages=FALSE, table=FALSE, ci.force = TRUE) -#now large MMT model taking into account having contributing or README -mmtmodel2 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=overall_data) -summary(mmtmodel2) -qqnorm(residuals(mmtmodel2)) -summary(mmtmodel2) +readme_reading_time_plot +#plotting readme reading ease +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +theme_bw() + +theme(legend.position = "top") +readme_reading_ease +#plotting readme reading ease +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +guides(fill="none", color="none")+ +theme_bw() +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +guides(fill="none", color="none")+ +theme_bw() + +theme(axis.text.y=element_blank()) +readme_reading_time_plot +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +guides(fill="none", color="none")+ +theme_bw(axis.text.y=element_blank()) + +theme(axis.text.y=element_blank()) +readme_reading_time_plot +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 100) + +guides(fill="none", color="none")+ +theme_bw() + +theme(axis.text.y=element_blank()) +readme_reading_time_plot +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 110) + +guides(fill="none", color="none")+ +theme_bw() + +theme(axis.text.y=element_blank()) +readme_reading_time_plot +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 140) + +guides(fill="none", color="none")+ +theme_bw() + +theme(axis.text.y=element_blank()) +readme_reading_time_plot +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 140) + +guides(fill="none", color="none")+ +theme_bw() + +theme(axis.title.y=element_blank()) +readme_reading_time_plot +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 140) + +guides(fill="none", color="none")+ +theme_bw() +#theme(axis.title.y=element_blank()) +readme_reading_time_plot +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 140) + +guides(fill="none", color="none")+ +theme_bw() +grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +guides(fill="none", color="none")+ +theme_bw() +grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) +grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2, labels = c("a)","b)")) +grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) +library(ggpubr) +ggarrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2,labels = c("a)","b)") ) +ggarrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, ncol = 2, nrow = 2,labels = c("a)","b)") ) +grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +ylab("readme density") + +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +ylab("readme density") + +xlim(-10, 90) + +theme_bw() + +theme(legend.position = "top") +#plotting readme reading ease +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +ylab("readme density") + +guides(fill="none", color="none")+ +theme_bw() +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +xlim(-10, 90) + +ylab("readme density") + +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +xlim(-10, 90) + +ylab("contributing density") + +guides(fill="none", color="none")+ +theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + +geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + +scale_color_manual(values = subdirColors) + +ylab("contributing density") + +xlim(-10, 90) + +theme_bw() + +theme(legend.position = "top") +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) diff --git a/R/documentReadabilityAnalysis.R b/R/documentReadabilityAnalysis.R index 31ae3d3..50f7220 100644 --- a/R/documentReadabilityAnalysis.R +++ b/R/documentReadabilityAnalysis.R @@ -1,5 +1,7 @@ library(tidyverse) library(plyr) +library(gridExtra) +library(ggpubr) # script for the analysis of document readability metrics # readability metrics will be studied controlled by their length # gaughan@u.northwestern.edu @@ -9,8 +11,74 @@ readme_df <- read_csv("../text_analysis/draft_readability_readme.csv") contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv") head(readme_df) aggregate(readme_df[, 3:10], list(readme_df$subdir), median) +#getting basic stats for the readme readability +median(readme_df$flesch_reading_ease) +median(readme_df$mcalpine_eflaw) +median(readme_df$reading_time) +# establishing the color scheme +subdirColors <- + setNames( c('red', 'forestgreen', 'blue') + , levels(contributing_df$subdir) ) + +#plotting linsear scoring +readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + + geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + + xlim(-30, 30) + + theme_bw() +#plotting readme reading ease +readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + + geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + + scale_color_manual(values = subdirColors) + + xlim(-10, 90) + + ylab("readme density") + + guides(fill="none", color="none")+ + theme_bw() +readme_reading_ease +#plotting readme reading time +readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + + geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + + scale_color_manual(values = subdirColors) + + xlim(-10, 90) + + ylab("readme density") + + guides(fill="none", color="none")+ + theme_bw() +#theme(axis.title.y=element_blank()) +#plot of reading_ease #readme_df <- readme_df |> # mutate(coef_grouping <- as.factor(subdir)) #test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df) #summary(test_lm) aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median) +#basic stats for the contributing readability +median(contributing_df$flesch_reading_ease) +median(contributing_df$mcalpine_eflaw) +median(contributing_df$reading_time) +# plotting contributing linsear writing formula +contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + + geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + + xlim(-30, 30) + + guides(fill="none", color="none")+ + theme_bw() +# plotting contributing reading time +contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + + geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + + xlim(-10, 90) + + ylab("contributing density") + + guides(fill="none", color="none")+ + theme_bw() +# plotting contributing mcalpine eflaw +contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + + geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + + xlim(-10, 70) + + guides(fill="none", color="none")+ + theme_bw() +# plotting contributing reading ease +contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + + geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + + scale_color_manual(values = subdirColors) + + ylab("contributing density") + + xlim(-10, 90) + + theme_bw() + + theme(legend.position = "top") +contributing_reading_ease +grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) diff --git a/R/draft_readability_analysis_plot.png b/R/draft_readability_analysis_plot.png new file mode 100644 index 0000000..3cb8192 Binary files /dev/null and b/R/draft_readability_analysis_plot.png differ