diff --git a/.DS_Store b/.DS_Store
index 5008ddf..eba068e 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/R/.Rhistory b/R/.Rhistory
index 18bfad0..c890d26 100644
--- a/R/.Rhistory
+++ b/R/.Rhistory
@@ -1,512 +1,512 @@
-mutate(crescendo_limit = ifelse(week_offset < (-4), 0, 1))|>
-cor.test(crescendo_limit, count)
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- windowed_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-2), 0, 1))
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- windowed_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
-#testing whether there's a correlation between count and the presce
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-library(tidyverse)
-library(plyr)
-#get the contrib data instead
-try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
-contrib_df <- read_csv("../final_data/deb_contrib_did.csv")
-#some preprocessing and expansion
-col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new",  "after_commit_new")
-contrib_df <- contrib_df[,col_order]
-contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ")
-contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ")
-contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ")
-contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ")
-drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
-contrib_df = contrib_df[,!(names(contrib_df) %in% drop)]
-# 2 some expansion needs to happens for each project
-expand_timeseries <- function(project_row) {
-longer <- project_row |>
-pivot_longer(cols = starts_with("ct"),
-names_to = "window",
-values_to = "count") |>
-unnest(count)
-longer$observation_type <- gsub("^.*_", "", longer$window)
-longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
-longer$count <- as.numeric(longer$count)
-#longer <- longer[which(longer$observation_type == "all"),]
-return(longer)
-}
-expanded_data <- expand_timeseries(contrib_df[1,])
-for (i in 2:nrow(contrib_df)){
-expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,]))
-}
-#filter out the windows of time that we're looking at
-window_num <- 8
-windowed_data <- expanded_data |>
-filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
-mutate(D = ifelse(week > 27, 1, 0))
-#scale the age numbers and calculate the week offset here
-windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
-windowed_data$week_offset <- windowed_data$week - 27
-#break out the different type of commit actions
-all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
-mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
-#logging
-all_actions_data$logged_count <- log(all_actions_data$count)
-all_actions_data$log1p_count <- log1p(all_actions_data$count)
-# now for merge
-mrg_actions_data$logged_count <- log(mrg_actions_data$count)
-mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-# this is the file with the lmer multi-level rddAnalysis
-library(tidyverse)
-library(plyr)
-# 0 loading the readme data in
-try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
-readme_df <- read_csv("../final_data/deb_readme_did.csv")
-# 1 preprocessing
-#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
-col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new",  "after_commit_new")
-readme_df <- readme_df[,col_order]
-readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
-readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
-readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
-readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
-drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
-readme_df = readme_df[,!(names(readme_df) %in% drop)]
-# 2 some expansion needs to happens for each project
-expand_timeseries <- function(project_row) {
-longer <- project_row |>
-pivot_longer(cols = starts_with("ct"),
-names_to = "window",
-values_to = "count") |>
-unnest(count)
-longer$observation_type <- gsub("^.*_", "", longer$window)
-longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
-longer$count <- as.numeric(longer$count)
-#longer <- longer[which(longer$observation_type == "all"),]
-return(longer)
-}
-expanded_data <- expand_timeseries(readme_df[1,])
-for (i in 2:nrow(readme_df)){
-expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,]))
-}
-#filter out the windows of time that we're looking at
-window_num <- 8
-windowed_data <- expanded_data |>
-filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
-mutate(D = ifelse(week > 27, 1, 0))
-#scale the age numbers
-windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
-windowed_data$week_offset <- windowed_data$week - 27
-#break out the different types of commit actions that are studied
-all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
-mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
-#log the dependent
-all_actions_data$logged_count <- log(all_actions_data$count)
-all_actions_data$log1p_count <- log1p(all_actions_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
-crescendow_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
-crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
-summary(crescendo_huh)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
-summary(crescendo_huh)
-library(tidyverse)
-library(plyr)
-#get the contrib data instead
-try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
-contrib_df <- read_csv("../final_data/deb_contrib_did.csv")
-#some preprocessing and expansion
-col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new",  "after_commit_new")
-contrib_df <- contrib_df[,col_order]
-contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ")
-contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ")
-contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ")
-contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ")
-drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
-contrib_df = contrib_df[,!(names(contrib_df) %in% drop)]
-# 2 some expansion needs to happens for each project
-expand_timeseries <- function(project_row) {
-longer <- project_row |>
-pivot_longer(cols = starts_with("ct"),
-names_to = "window",
-values_to = "count") |>
-unnest(count)
-longer$observation_type <- gsub("^.*_", "", longer$window)
-longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
-longer$count <- as.numeric(longer$count)
-#longer <- longer[which(longer$observation_type == "all"),]
-return(longer)
-}
-expanded_data <- expand_timeseries(contrib_df[1,])
-for (i in 2:nrow(contrib_df)){
-expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,]))
-}
-#filter out the windows of time that we're looking at
-window_num <- 8
-windowed_data <- expanded_data |>
-filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
-mutate(D = ifelse(week > 27, 1, 0))
-#scale the age numbers and calculate the week offset here
-windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
-windowed_data$week_offset <- windowed_data$week - 27
-#break out the different type of commit actions
-all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
-mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
-#logging
-all_actions_data$logged_count <- log(all_actions_data$count)
-all_actions_data$log1p_count <- log1p(all_actions_data$count)
-# now for merge
-mrg_actions_data$logged_count <- log(mrg_actions_data$count)
-mrg_actions_data$log1p_count <- log1p(mrg_actions_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-crescendo_huh <- lm(count ~ crescendo_limit + week_offset, data = second_windowed_data)
-summary(crescendo_huh)
-crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
-summary(crescendo_huh)
-# this is the file with the lmer multi-level rddAnalysis
-library(tidyverse)
-library(plyr)
-# 0 loading the readme data in
-try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
-readme_df <- read_csv("../final_data/deb_readme_did.csv")
-# 1 preprocessing
-#colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
-col_order <- c("upstream_vcs_link", "age_of_project", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new",  "after_commit_new")
-readme_df <- readme_df[,col_order]
-readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
-readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
-readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
-readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
-drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
-readme_df = readme_df[,!(names(readme_df) %in% drop)]
-# 2 some expansion needs to happens for each project
-expand_timeseries <- function(project_row) {
-longer <- project_row |>
-pivot_longer(cols = starts_with("ct"),
-names_to = "window",
-values_to = "count") |>
-unnest(count)
-longer$observation_type <- gsub("^.*_", "", longer$window)
-longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
-longer$count <- as.numeric(longer$count)
-#longer <- longer[which(longer$observation_type == "all"),]
-return(longer)
-}
-expanded_data <- expand_timeseries(readme_df[1,])
-for (i in 2:nrow(readme_df)){
-expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,]))
-}
-#filter out the windows of time that we're looking at
-window_num <- 8
-windowed_data <- expanded_data |>
-filter(week >= (27 - window_num) & week <= (27 + window_num)) |>
-mutate(D = ifelse(week > 27, 1, 0))
-#scale the age numbers
-windowed_data$scaled_project_age <- scale(windowed_data$age_of_project)
-windowed_data$week_offset <- windowed_data$week - 27
-#break out the different types of commit actions that are studied
-all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),]
-mrg_actions_data <- windowed_data[which(windowed_data$observation_type == "mrg"),]
-#log the dependent
-all_actions_data$logged_count <- log(all_actions_data$count)
-all_actions_data$log1p_count <- log1p(all_actions_data$count)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
-summary(crescendo_huh)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-3), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
-summary(crescendo_huh)
-#checking crescendo of contributions before document publication
-#second window
-second_windowed_data <- all_actions_data |>
-filter(week_offset <= 0) |>
-mutate(crescendo_limit = ifelse(week_offset < (-1), 0, 1))
-#testing whether there's a correlation between count and the two weeks before the introduction
-cor.test(second_windowed_data$crescendo_limit, second_windowed_data$count)
-crescendo_huh <- lm(count ~ crescendo_limit * week_offset, data = second_windowed_data)
-summary(crescendo_huh)
-library(tidyverse)
-library(plyr)
-# script for the analysis of document readability metrics
-# readability metrics will be studied controlled by their length
-# gaughan@u.northwestern.edu
-# loading in the data
-try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
-readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
-contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
+ggtitle("Posterior Predictive Density", subtitle="Non-Democracies") +
+theme_bw()
+p
+#plot of reading_ease
+p <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme_bw()
+p
+p <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme_bw()
+p
+p <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme_bw()
+p
+p <- ggplot(contributing_df, aes(x=linsear_write, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme_bw()
+p
 head(readme_df)
-readme_df <- readme_df |>
-mutate(coef_grouping <- as.factor(subdir))
-cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease)
-readme_df <- readme_df |>
-mutate(coef_grouping <- as.factor(subdir))
-cor.test(readme_df$coef_grouping, readme_df$flesch_reading_ease)
-cor(readme_df$coef_grouping, readme_df$flesch_reading_ease)
-readme_df <- readme_df |>
-mutate(coef_grouping <- as.factor(subdir))
-test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df)
-readme_df <- readme_df |>
-mutate(coef_grouping <- as.factor(subdir))
-test_lm <- lm(flesch_reading_ease ~ coef_grouping,data=readme_df)
-test_lm <- lm(flesch_reading_ease ~ subdir,data=readme_df)
-summary(test_lm)
-test_lm <- lm(flesch_reading_ease ~ as.factor(subdir),data=readme_df)
-summary(test_lm)
-head(readme_df)
-test_lm <- lm(flesch_reading_ease ~ char_count + as.factor(subdir),data=readme_df)
-summary(test_lm)
-head(readme_df)
-test_lm <- lm(linsear_write_formula ~ char_count + as.factor(subdir),data=readme_df)
-summary(test_lm)
-head(readme_df)
-test_lm <- lm(mcalpine_eflaw ~ char_count + as.factor(subdir),data=readme_df)
-summary(test_lm)
-test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
-summary(test_lm)
-aggregate(readme_df[, 3:11], list(readme_df$subdir), mean)
-aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
-#readme_df <- readme_df |>
-#  mutate(coef_grouping <- as.factor(subdir))
-#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
-#summary(test_lm)
-aggregate(contributing_df[, 3:10], list(contributing_df$subdir), mean)
-library(tidyverse)
-library(plyr)
-# script for the analysis of document readability metrics
-# readability metrics will be studied controlled by their length
-# gaughan@u.northwestern.edu
-# loading in the data
-try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
-readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
-contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
-head(readme_df)
-aggregate(readme_df[, 3:10], list(readme_df$subdir), mean)
-aggregate(readme_df[, 3:10], list(readme_df$subdir), median)
+p0
+p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme_bw()
+p0
+p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-5, 30) +
+theme_bw()
+p0
+p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 30) +
+theme_bw()
+p0
+p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+p0
+p0 <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+p0
+p0 <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-300, 300) +
+theme_bw()
+p0
+p0 <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+p0
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+p0
+readme_linsear_plot
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_y_continuous(breaks = seq(0,10,1),labels = paste(seq(0, 10, by = 1), "%", sep = ""))+
+xlim(-30, 30) +
+theme_bw()
+readme_linsear_plot
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes( color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+readme_linsear_plot
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes(y = (..count..)/sum(..count..)), color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes(y = (..count..)/sum(..count..), color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+readme_linsear_plot
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+readme_linsear_plot
+y = (..count..)/sum(..count..),
+y = (..count..)/sum(..count..),
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes(y = (..count..)/sum(..count..), color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+readme_linsear_plot
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="dodge") +
+xlim(-30, 30) +
+theme_bw()
+readme_linsear_plot
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+readme_linsear_plot
+contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_histogram(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+contributing_linsear_plot
+contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+contributing_linsear_plot
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+readme_linsear_plot
+contributing_linsear_plot
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+contributing_reading_time_plot
+contributing_reading_time_plot
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw()
+contributing_reading_time_plot
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 50) +
+theme_bw()
+contributing_reading_time_plot
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 70) +
+theme_bw()
+contributing_reading_time_plot
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 80) +
+theme_bw()
+contributing_reading_time_plot
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw()
+contributing_reading_time_plot
+contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw()
+contributing_mcalpine_eflaw
+contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 70) +
+theme_bw()
+contributing_mcalpine_eflaw
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 70) +
+theme_bw()
+contributing_reading_ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw()
+contributing_reading_ease
+grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
+library(gridExtra)
+grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
+# plotting contributing linsear writing formula
+contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+theme_bw(legend.position="none")
+# plotting contributing reading time
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw(legend.position="none")
+# plotting contributing mcalpine eflaw
+contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 70) +
+theme_bw(legend.position="none")
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme(legend.position = "top") +
+xlim(-10, 90) +
+theme_bw()
+contributing_reading_ease
+grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
+# plotting contributing mcalpine eflaw
+contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 70) +
+theme(legend.position="none")+
+theme_bw()
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme(legend.position = "top") +
+xlim(-10, 90) +
+theme_bw()
+contributing_reading_ease
+grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
+# plotting contributing mcalpine eflaw
+contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 70) +
+guides(fill="none")+
+theme_bw()
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme(legend.position = "top") +
+xlim(-10, 90) +
+theme_bw()
+contributing_reading_ease
+grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
+# plotting contributing mcalpine eflaw
+contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 70) +
+guides(fill="none", color="none")+
+theme_bw()
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+theme(legend.position = "top") +
+xlim(-10, 90) +
+theme_bw()
+contributing_reading_ease
+grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
+# plotting contributing linsear writing formula
+contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-30, 30) +
+guides(fill="none", color="none")+
+theme_bw()
+# plotting contributing reading time
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+guides(fill="none", color="none")+
+theme_bw()
+# plotting contributing mcalpine eflaw
+contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 70) +
+guides(fill="none", color="none")+
+theme_bw()
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw(legend.position = "top")
+contributing_reading_ease
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw(legend.position = "left")
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme(legend.position = "left")
+contributing_reading_ease
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme(legend.position = "left") +
+theme_bw()
+contributing_reading_ease
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw(legend.position = "left")
+contributing_reading_ease
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+opts(legend.position = "left") +
+theme_bw()
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme(legend.position = "left") +
+theme_bw()
+contributing_reading_ease
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw() +
+theme(legend.position = "left")
+contributing_reading_ease
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw() +
+theme(legend.position = "top")
+contributing_reading_ease
+grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
+readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw() +
+theme(legend.position = "top")
+readme_reading_ease
+#plotting readme reading ease
+readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+theme_bw() +
+theme(legend.position = "top")
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+guides(fill="none", color="none")+
+theme_bw()
+grid.arrange(readme_reading_ease, readme_reading_time, contributing_reading_ease, contributing_reading_time_plot, nrow = 2)
+grid.arrange(readme_reading_ease, readme_reading_time_plot, contributing_reading_ease, contributing_reading_time_plot, nrow = 2)
+grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
+# establishing the color scheme
+irisColors <-
+setNames( c('red', 'forestgreen', 'blue')
+, levels(contributing_df$subdir)  )
+# establishing the color scheme
+subdirColors <-
+setNames( c('red', 'forestgreen', 'blue')
+, levels(contributing_df$subdir)  )
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+theme_bw() +
+theme(legend.position = "top")
+contributing_reading_ease
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+guides(fill="none", color="none")+
+theme_bw()
+#plot of reading_ease
 #readme_df <- readme_df |>
 #  mutate(coef_grouping <- as.factor(subdir))
 #test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
 #summary(test_lm)
 aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)
-rm(list=ls())
-set.seed(424242)
-library(readr)
-library(ggplot2)
-library(tidyverse)
-overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
-overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
-octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
-readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
-overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
-mean(overall_data$mmt)
-hist(overall_data$mmt, probability = TRUE)
-#the basic stuff for the overall data
-overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
-mean(overall_data$mmt)
-hist(overall_data$mmt, probability = TRUE)
-#some new variables around age
-#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
-#table(overall_data$new.age)
-#overall_data$new.age.factor <- as.factor(overall_data$new.age)
-overall_data$scaled_age <- scale(overall_data$age_of_project)
-#model
-mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
-summary(mmtmodel1)
-qqnorm(residuals(mmtmodel1))
-# below this is the analysis for the octo data
-octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
-table(octo_data$new.age)
-octo_data$new.age.factor <- as.factor(octo_data$new.age)
-octo_data$scaled_age <- scale(octo_data$age_of_project)
-octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
-mean(octo_data$mmt)
-hist(octo_data$mmt)
-head(octo_data)
-#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
-octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
-octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
-#right skewed data, need to transform
-octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
-hist(octo_data$wiki_mmt)
-#below are the models for the octo data, there should be analysis for each one
-octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(octo_mmtmodel1)
-issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(issue_mmtmodel1)
-qqnorm(residuals(issue_mmtmodel1))
-wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(wiki_mmtmodel1)
-#getting some of the information in about whether projects have specific files
-readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
-contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
-octo_data <- octo_data |>
-mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
-mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
-overall_data <- overall_data |>
-mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
-mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
-#below are the models for the octo data, there should be analysis for each one
-octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(octo_mmtmodel1)
-issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(issue_mmtmodel1)
-qqnorm(residuals(issue_mmtmodel1))
-wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(wiki_mmtmodel1)
-qqnorm(residuals(wiki_mmtmodel1))
-#these next three are looking at mmt as an outcome of other factors
-mmt_outcome_model <- lm(mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data = octo_data)
-summary(mmt_outcome_model)
-library(texreg) #my little "lib"
-texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
-custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.'  ),
-custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
-use.packages=FALSE, table=FALSE, ci.force = TRUE)
-govdoc_issuesmmt <- lm(issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib), data=octo_data)
-summary(govdoc_issuesmmt)
-View(octo_data)
-octo_cleaned <- octo_data[octo_data$issue_mmt != NaN]
-octo_cleaned <- octo_data[!is.nan(octo_data$issue_mmt),]
-#below are the models for the octo data, there should be analysis for each one
-octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
-summary(octo_mmtmodel1)
-issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
-summary(issue_mmtmodel1)
-qqnorm(residuals(issue_mmtmodel1))
-wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_cleaned)
-summary(wiki_mmtmodel1)
-write.csv(octo_cleaned,"cleaned_octo.csv", row.names = FALSE)
-texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
-custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.'  ),
-custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
-use.packages=FALSE, table=FALSE, ci.force = TRUE)
-rm(list=ls())
-set.seed(424242)
-library(readr)
-library(ggplot2)
-library(tidyverse)
-#primary analysis for cross-sectional community metrics
-overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
-octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
-readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
-contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
-overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
-mean(overall_data$mmt)
-#the basic stuff for the overall data
-overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators))
-mean(overall_data$mmt)
-hist(overall_data$mmt, probability = TRUE)
-#model
-mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
-summary(mmtmodel1)
-qqnorm(residuals(mmtmodel1))
-#clean octo data
-octo_data <- filter(octo_data, total_contrib != 0)
-#some new variables around age
-#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
-#table(overall_data$new.age)
-#overall_data$new.age.factor <- as.factor(overall_data$new.age)
-overall_data$scaled_age <- scale(overall_data$age_of_project)
-#model
-mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
-table(octo_data$new.age)
-octo_data$new.age.factor <- as.factor(octo_data$new.age)
-octo_data$scaled_age <- scale(octo_data$age_of_project)
-octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
-mean(octo_data$mmt)
-hist(octo_data$mmt)
-head(octo_data)
-#getting the mmt-equivalent for both issue activity as well as wiki contrib activity
-octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
-#right skewed data, need to transform
-octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
-#getting some of the information in about whether projects have specific files
-readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
-contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
-octo_data <- octo_data |>
-mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
-mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
-overall_data <- overall_data |>
-mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
-mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
-#below are the models for the octo data, there should be analysis for each one
-octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(octo_mmtmodel1)
-issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(issue_mmtmodel1)
-qqnorm(residuals(issue_mmtmodel1))
-wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
-summary(wiki_mmtmodel1)
-library(texreg) #my little "lib"
-texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
-custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.'  ),
-custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
-use.packages=FALSE, table=FALSE, ci.force = TRUE)
-#now large MMT model taking into account having contributing or README
-mmtmodel2 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=overall_data)
-summary(mmtmodel2)
-qqnorm(residuals(mmtmodel2))
-summary(mmtmodel2)
+readme_reading_time_plot
+#plotting readme reading ease
+readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+theme_bw() +
+theme(legend.position = "top")
+readme_reading_ease
+#plotting readme reading ease
+readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+guides(fill="none", color="none")+
+theme_bw()
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+guides(fill="none", color="none")+
+theme_bw() +
+theme(axis.text.y=element_blank())
+readme_reading_time_plot
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+guides(fill="none", color="none")+
+theme_bw(axis.text.y=element_blank()) +
+theme(axis.text.y=element_blank())
+readme_reading_time_plot
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 100) +
+guides(fill="none", color="none")+
+theme_bw() +
+theme(axis.text.y=element_blank())
+readme_reading_time_plot
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 110) +
+guides(fill="none", color="none")+
+theme_bw() +
+theme(axis.text.y=element_blank())
+readme_reading_time_plot
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 140) +
+guides(fill="none", color="none")+
+theme_bw() +
+theme(axis.text.y=element_blank())
+readme_reading_time_plot
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 140) +
+guides(fill="none", color="none")+
+theme_bw() +
+theme(axis.title.y=element_blank())
+readme_reading_time_plot
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 140) +
+guides(fill="none", color="none")+
+theme_bw()
+#theme(axis.title.y=element_blank())
+readme_reading_time_plot
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 140) +
+guides(fill="none", color="none")+
+theme_bw()
+grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+guides(fill="none", color="none")+
+theme_bw()
+grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
+grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2, labels = c("a)","b)"))
+grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
+library(ggpubr)
+ggarrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2,labels = c("a)","b)") )
+ggarrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, ncol = 2, nrow = 2,labels = c("a)","b)") )
+grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
+# plotting contributing reading time
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+ylab("readme density") +
+guides(fill="none", color="none")+
+theme_bw()
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+ylab("readme density") +
+xlim(-10, 90) +
+theme_bw() +
+theme(legend.position = "top")
+#plotting readme reading ease
+readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+ylab("readme density") +
+guides(fill="none", color="none")+
+theme_bw()
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+xlim(-10, 90) +
+ylab("readme density") +
+guides(fill="none", color="none")+
+theme_bw()
+# plotting contributing reading time
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+xlim(-10, 90) +
+ylab("contributing density") +
+guides(fill="none", color="none")+
+theme_bw()
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+scale_color_manual(values = subdirColors) +
+ylab("contributing density") +
+xlim(-10, 90) +
+theme_bw() +
+theme(legend.position = "top")
+contributing_reading_ease
+grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
diff --git a/R/documentReadabilityAnalysis.R b/R/documentReadabilityAnalysis.R
index 31ae3d3..50f7220 100644
--- a/R/documentReadabilityAnalysis.R
+++ b/R/documentReadabilityAnalysis.R
@@ -1,5 +1,7 @@
 library(tidyverse)
 library(plyr) 
+library(gridExtra)
+library(ggpubr)
 # script for the analysis of document readability metrics 
 # readability metrics will be studied controlled by their length
 # gaughan@u.northwestern.edu
@@ -9,8 +11,81 @@ readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
 contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
 head(readme_df)
 aggregate(readme_df[, 3:10], list(readme_df$subdir), median)
+#getting basic stats for the readme readability
+median(readme_df$flesch_reading_ease)
+median(readme_df$mcalpine_eflaw)
+median(readme_df$reading_time)
+# establishing the color scheme: one fixed outline color per subdir level
+# read_csv() never creates factor columns, so levels() on the raw column is
+# NULL and the colors stay unnamed; take the names from as.factor() instead
+subdirColors <-
+  setNames( c('red', 'forestgreen', 'blue')
+            , levels(as.factor(contributing_df$subdir))  )
+
+# note for every plot below: xlim() drops observations outside the range
+# before the density is estimated
+#plotting linsear scoring
+readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+  geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+  xlim(-30, 30) +
+  theme_bw()
+#plotting readme reading ease
+readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+  geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+  scale_color_manual(values = subdirColors) +
+  xlim(-10, 90) +
+  ylab("readme density") +
+  guides(fill="none", color="none")+
+  theme_bw()
+readme_reading_ease
+#plotting readme reading time
+readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
+  geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+  scale_color_manual(values = subdirColors) +
+  xlim(-10, 90) +
+  ylab("readme density") +
+  guides(fill="none", color="none")+
+  theme_bw()
+#theme(axis.title.y=element_blank())
+#plot of reading_ease
 #readme_df <- readme_df |>
 #  mutate(coef_grouping <- as.factor(subdir))
 #test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
 #summary(test_lm)
 aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)
+#basic stats for the contributing readability
+median(contributing_df$flesch_reading_ease)
+median(contributing_df$mcalpine_eflaw)
+median(contributing_df$reading_time)
+# plotting contributing linsear writing formula
+contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
+  geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+  xlim(-30, 30) +
+  guides(fill="none", color="none")+
+  theme_bw()
+# plotting contributing reading time
+# uses subdirColors like the other three panels of the combined figure
+contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
+  geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+  scale_color_manual(values = subdirColors) +
+  xlim(-10, 90) +
+  ylab("contributing density") +
+  guides(fill="none", color="none")+
+  theme_bw()
+# plotting contributing mcalpine eflaw
+contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
+  geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+  xlim(-10, 70) +
+  guides(fill="none", color="none")+
+  theme_bw()
+# plotting contributing reading ease
+contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
+  geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
+  scale_color_manual(values = subdirColors) +
+  ylab("contributing density") +
+  xlim(-10, 90) +
+  theme_bw() +
+  theme(legend.position = "top")
+contributing_reading_ease
+# combined figure: the two contributing panels, then the two readme panels, in two rows
+grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
diff --git a/R/draft_readability_analysis_plot.png b/R/draft_readability_analysis_plot.png
new file mode 100644
index 0000000..3cb8192
Binary files /dev/null and b/R/draft_readability_analysis_plot.png differ