diff --git a/R/.Rhistory b/R/.Rhistory
index adc20ad..54e4a59 100644
--- a/R/.Rhistory
+++ b/R/.Rhistory
@@ -1,502 +1,33 @@
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-theme(legend.position = "top") +
-xlim(-10, 90) +
-theme_bw()
-contributing_reading_ease
-grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
-# plotting contributing mcalpine eflaw
-contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 70) +
-guides(fill="none")+
-theme_bw()
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-theme(legend.position = "top") +
-xlim(-10, 90) +
-theme_bw()
-contributing_reading_ease
-grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
-# plotting contributing mcalpine eflaw
-contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 70) +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-theme(legend.position = "top") +
-xlim(-10, 90) +
-theme_bw()
-contributing_reading_ease
-grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
-# plotting contributing linsear writing formula
-contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-30, 30) +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading time
-contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing mcalpine eflaw
-contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 70) +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme_bw(legend.position = "top")
-contributing_reading_ease
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme_bw(legend.position = "left")
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme(legend.position = "left")
-contributing_reading_ease
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme(legend.position = "left") +
-theme_bw()
-contributing_reading_ease
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme_bw(legend.position = "left")
-contributing_reading_ease
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-opts(legend.position = "left") +
-theme_bw()
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme(legend.position = "left") +
-theme_bw()
-contributing_reading_ease
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "left")
-contributing_reading_ease
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "top")
-contributing_reading_ease
-grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2)
-readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "top")
-readme_reading_ease
-#plotting readme reading ease
-readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "top")
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-guides(fill="none", color="none")+
-theme_bw()
-grid.arrange(readme_reading_ease, readme_reading_time, contributing_reading_ease, contributing_reading_time_plot, nrow = 2)
-grid.arrange(readme_reading_ease, readme_reading_time_plot, contributing_reading_ease, contributing_reading_time_plot, nrow = 2)
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
-# establishing the color scheme
-irisColors <-
-setNames( c('red', 'forestgreen', 'blue')
-, levels(contributing_df$subdir)  )
-# establishing the color scheme
-subdirColors <-
-setNames( c('red', 'forestgreen', 'blue')
-, levels(contributing_df$subdir)  )
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "top")
-contributing_reading_ease
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-guides(fill="none", color="none")+
-theme_bw()
-#plot of reading_ease
-#readme_df <- readme_df |>
-#  mutate(coef_grouping <- as.factor(subdir))
-#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
-#summary(test_lm)
-aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)
-readme_reading_time_plot
-#plotting readme reading ease
-readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "top")
-readme_reading_ease
-#plotting readme reading ease
-readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-guides(fill="none", color="none")+
-theme_bw()
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-guides(fill="none", color="none")+
-theme_bw() +
-theme(axis.text.y=element_blank())
-readme_reading_time_plot
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-guides(fill="none", color="none")+
-theme_bw(axis.text.y=element_blank()) +
-theme(axis.text.y=element_blank())
-readme_reading_time_plot
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 100) +
-guides(fill="none", color="none")+
-theme_bw() +
-theme(axis.text.y=element_blank())
-readme_reading_time_plot
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 110) +
-guides(fill="none", color="none")+
-theme_bw() +
-theme(axis.text.y=element_blank())
-readme_reading_time_plot
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 140) +
-guides(fill="none", color="none")+
-theme_bw() +
-theme(axis.text.y=element_blank())
-readme_reading_time_plot
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 140) +
-guides(fill="none", color="none")+
-theme_bw() +
-theme(axis.title.y=element_blank())
-readme_reading_time_plot
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 140) +
-guides(fill="none", color="none")+
-theme_bw()
-#theme(axis.title.y=element_blank())
-readme_reading_time_plot
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 140) +
-guides(fill="none", color="none")+
-theme_bw()
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-guides(fill="none", color="none")+
-theme_bw()
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2, labels = c("a)","b)"))
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
-library(ggpubr)
-ggarrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2,labels = c("a)","b)") )
-ggarrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, ncol = 2, nrow = 2,labels = c("a)","b)") )
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
-# plotting contributing reading time
-contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-ylab("readme density") +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "top")
-#plotting readme reading ease
-readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading time
-contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-ylab("contributing density") +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-ylab("contributing density") +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "top")
-contributing_reading_ease
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
-library(tidyverse)
-library(plyr)
-library(gridExtra)
-library(ggpubr)
-# script for the analysis of document readability metrics
-# readability metrics will be studied controlled by their length
-# gaughan@u.northwestern.edu
-# loading in the data
-try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
-readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
-contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv")
-#getting basic stats for the readme readability
-median(readme_df$flesch_reading_ease)
-median(readme_df$linsear_write_formula)
-median(contributing_df$reading_time)
-median(contributing_df$linsear_write_formula)
-#plotting readme reading ease
-readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-readme_reading_ease
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-# establishing the color scheme
-subdirColors <-
-setNames( c('firebrick1', 'forestgreen', 'cornflowerblue')
-, levels(contributing_df$subdir)  )
-#plotting readme reading ease
-readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading time
-contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-ylab("contributing density") +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-ylab("contributing density") +
-xlim(-10, 90) +
-theme_bw() +
-theme(legend.position = "top")
-contributing_reading_ease
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2)
-# plotting contributing linsear writing formula
-contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-30, 30) +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading time
-contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 90) +
-ylab("contributing density") +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing mcalpine eflaw
-contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 70) +
-guides(fill="none", color="none")+
-theme_bw()
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw readme_reading_ease, readme_reading_time_plot, nrow = 2)
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw, readme_reading_ease, readme_reading_time_plot, nrow = 2)
-readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-10, 30) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing linsear writing formula
-contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 30) +
-guides(fill="none", color="none")+
-theme_bw()
-readme_mcalpine_eflaw <- ggplot(readme_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 60) +
-guides(fill="none", color="none")+
-theme_bw()
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw, readme_reading_ease, readme_reading_time_plot, readme_linsear_plot, readme_mcalpine_eflaw, nrow = 2)
-# plotting contributing mcalpine eflaw
-contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
-scale_color_manual(values = subdirColors) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-10, 60) +
-guides(fill="none", color="none")+
-theme_bw()
-#plotting readme reading ease
-readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-5, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-#plotting readme reading time
-readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-5, 90) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-xlim(-5, 30) +
-ylab("readme density") +
-guides(fill="none", color="none")+
-theme_bw()
-readme_mcalpine_eflaw <- ggplot(readme_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
-scale_color_manual(values = subdirColors) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-5, 60) +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing linsear writing formula
-contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) +
-scale_color_manual(values = subdirColors) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-5, 30) +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading time
-contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) +
-scale_color_manual(values = subdirColors) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-5, 90) +
-ylab("contributing density") +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing mcalpine eflaw
-contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) +
-scale_color_manual(values = subdirColors) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-xlim(-5, 60) +
-guides(fill="none", color="none")+
-theme_bw()
-# plotting contributing reading ease
-contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) +
-geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") +
-scale_color_manual(values = subdirColors) +
-ylab("contributing density") +
-xlim(-5, 90) +
-theme_bw() +
-theme(legend.position = "top")
-grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw, readme_reading_ease, readme_reading_time_plot, readme_linsear_plot, readme_mcalpine_eflaw, nrow = 2)
+contrib_readme_model <- load(file = "final_models/0623_pop_rm_contrib.rda")
+contrib_readme_model <- load(file = "final_models/0623_pop_rm_contrib.rda")
+contrib_readme_model <- load("final_models/0623_pop_rm_contrib.rda")
+contrib_readme_model <- source("final_models/0623_pop_rm_contrib.rda")
+contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda")
+contrib_readme_model <- readRDS("final_models/0623_pop_contrib_collab.rda")
+collab_readme_model <- readRDS("final_models/0623_pop_rm_collab.rda")
+texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2,
+custom.model.names=c( 'collab','contrib.'  ),
+custom.coef.names=c('(Intercept)', 'after_introduction'),
+use.packages=FALSE, table=FALSE, ci.force = TRUE)
+library(texreg)
+texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2,
+custom.model.names=c( 'collab','contrib.'  ),
+custom.coef.names=c('(Intercept)', 'after_introduction'),
+use.packages=FALSE, table=FALSE, ci.force = TRUE)
+texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2,
+custom.model.names=c( 'collab','contrib.'  ),
+custom.coef.names=c('(Intercept)', 'after_introduction' 'etc'),
+texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2,
+custom.model.names=c( 'collab','contrib.'  ),
+custom.coef.names=c('(Intercept)', 'after_introduction', 'etc'),
+use.packages=FALSE, table=FALSE, ci.force = TRUE)
 library(tidyverse)
 library(plyr)
 library(stringr)
 try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
 #load in data
 contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
-View(contrib_df)
-expanded_contrib_data <- expand_timeseries(contrib_df[1,])
-for (i in 2:nrow(contrib_df)){
-expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
-}
+readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")
 #some expansion needs to happens for each project
 expand_timeseries <- function(project_row) {
 longer <- project_row |>
@@ -508,5 +39,474 @@ mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
 mutate(is_collab = as.numeric(str_detect(window, "collab")))
 return(longer)
 }
+expanded_readme_data <- expand_timeseries(readme_df[1,])
+for (i in 2:nrow(readme_df)){
+expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,]))
+}
 expanded_contrib_data <- expand_timeseries(contrib_df[1,])
-View(expanded_contrib_data)
+for (i in 2:nrow(contrib_df)){
+expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
+}
+expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
+expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
+expanded_readme_data$logcount <- log(expanded_readme_data$count)
+expanded_contrib_data$logcount <- log(expanded_contrib_data$count)
+#breaking out the types of population counts
+collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),]
+contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),]
+collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),]
+contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),]
+simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc), data=collab_pop_readme)
+summary(simple_collab_readme_model)
+anova(simple_collab_readme_model, collab_readme_model)
+summary(collab_readme_model)
+#load in data
+full_df <- read_csv("../final_data/deb_full_data.csv")
+contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
+contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link")
+View(contrib_df)
+View(contrib_df)
+readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")
+readme_df <- merge(full_df, readme_df, by="upstream_vcs_link")
+# age is calculated against December 11, 2023
+contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project)
+View(contrib_df)
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project)
+View(contrib_df)
+View(contrib_df)
+View(readme_df)
+readme_df <- readme_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project)
+View(readme_df)
+collab_readme_model_plus <- glmer.nb(log1pcount ~ as.factor(after_doc) + event_date +  (after_doc| upstream_vcs_link), data=collab_pop_readme)
+#some expansion needs to happens for each project
+expand_timeseries <- function(project_row) {
+longer <- project_row |>
+pivot_longer(cols = ends_with("new"),
+names_to = "window",
+values_to = "count") |>
+unnest(count) |>
+mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
+mutate(is_collab = as.numeric(str_detect(window, "collab")))
+return(longer)
+}
+expanded_readme_data <- expand_timeseries(readme_df[1,])
+for (i in 2:nrow(readme_df)){
+expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,]))
+}
+expanded_contrib_data <- expand_timeseries(contrib_df[1,])
+for (i in 2:nrow(contrib_df)){
+expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
+}
+expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
+expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
+expanded_readme_data$logcount <- log(expanded_readme_data$count)
+expanded_contrib_data$logcount <- log(expanded_contrib_data$count)
+#breaking out the types of population counts
+collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),]
+contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),]
+collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),]
+contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),]
+library(tidyverse)
+library(plyr)
+library(stringr)
+try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
+#load in data
+full_df <- read_csv("../final_data/deb_full_data.csv")
+contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
+View(contrib_df)
+readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")
+contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link")
+readme_df <- merge(full_df, readme_df, by="upstream_vcs_link")
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) +
+mutate(event_date_days = as.Date("2024-06-24") - event_date) +
+readme_df <- readme_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project)
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) +
+mutate(event_date_days = as.Date("2024-06-24") - event_date)
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) +
+mutate(event_date_days = diff.Date(as.Date("2023-12-11"),event_date, units = "days"))
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) |>
+mutate(event_date_days = diff.Date(as.Date("2023-12-11"),event_date, units = "days"))
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) |>
+mutate(event_date_days = diff.Date(as.Date("2024-06-24"),event_date, units = "days"))
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) |>
+mutate(event_date_days = diff.Date(as.Date("2024-06-24"),as.Date(event_date), units = "days"))
+View(contrib_df)
+View(contrib_df)
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) |>
+mutate(event_date_days = diff.Date(as.Date("2024-06-24"),as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), units = "days"))
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) |>
+mutate(event_date_days =
+as.numeric(
+difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S")
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+# age is calculated against December 11, 2023
+contrib_df <- contrib_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) |>
+mutate(event_date_days =
+as.numeric(
+difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+units = "days")))
+View(contrib_df)
+readme_df <- readme_df |>
+mutate(start_date = as.Date("2023-12-11") - age_of_project) |>
+mutate(event_date_days =
+as.numeric(
+difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+units = "days")))
+#some expansion needs to happens for each project
+expand_timeseries <- function(project_row) {
+longer <- project_row |>
+pivot_longer(cols = ends_with("new"),
+names_to = "window",
+values_to = "count") |>
+unnest(count) |>
+mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
+mutate(is_collab = as.numeric(str_detect(window, "collab")))
+return(longer)
+}
+expanded_readme_data <- expand_timeseries(readme_df[1,])
+for (i in 2:nrow(readme_df)){
+expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,]))
+}
+expanded_contrib_data <- expand_timeseries(contrib_df[1,])
+for (i in 2:nrow(contrib_df)){
+expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
+}
+expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
+expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
+expanded_readme_data$logcount <- log(expanded_readme_data$count)
+expanded_contrib_data$logcount <- log(expanded_contrib_data$count)
+#breaking out the types of population counts
+collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),]
+contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),]
+collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),]
+contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),]
+#import models
+library(lme4)
+library(optimx)
+collab_readme_model_plus <- glmer.nb(log1pcount ~ as.factor(after_doc) + event_date_days +  (after_doc| upstream_vcs_link), data=collab_pop_readme)
+anova(collab_readme_model_plus, collab_readme_model)
+collab_readme_model <- readRDS("final_models/0623_pop_rm_collab.rda")
+anova(collab_readme_model_plus, collab_readme_model)
+saveRDS(collab_readme_model, "final_models/0623_pop_rm_collab_better.rda")
+summary(collab_readme_model_plus)
+summary(collab_readme_model)
+library(tidyverse)
+#things to get:
+# - delete old age column
+# - normal age, in date
+# - age from today in days
+# - delta between first commit and document in days
+#README Document updates
+#loading in new ages
+####RDD CSV
+first_commit_df <- read_csv("../062424_did_first_commit_readme.csv")
+first_commit_df_2 <- read_csv("../062424_did_first_commit_readme_2.csv")
+first_commit_df <- rbind(first_commit_df, first_commit_df_2)
+# need to first do an rbind with this data and the second file
+# check with the head of the file/size of the file
+old_rdd_readme <- read_csv("../final_data/deb_readme_did.csv")
+old_rdd_readme <- merge(old_rdd_readme, first_commit_df, by="upstream_vcs_link")
+new_rm_data <- old_rdd_readme |>
+select(-age_of_project) |>
+mutate(first_commit_dt = as.POSIXct(first_commit,
+format = "%a %b %d %H:%M:%S %Y %z")) |>
+mutate(age_in_days =
+as.numeric(
+difftime(
+as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days"))) |>
+mutate (event_gap =
+as.numeric(
+difftime(
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days")))
+View(old_rdd_readme)
+new_rm_data <- old_rdd_readme |>
+select(-c(age_of_project)) |>
+mutate(first_commit_dt = as.POSIXct(first_commit,
+format = "%a %b %d %H:%M:%S %Y %z")) |>
+mutate(age_in_days =
+as.numeric(
+difftime(
+as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days"))) |>
+mutate (event_gap =
+as.numeric(
+difftime(
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days")))
+new_rm_data <- old_rdd_readme |>
+mutate(first_commit_dt = as.POSIXct(first_commit,
+format = "%a %b %d %H:%M:%S %Y %z")) |>
+mutate(age_in_days =
+as.numeric(
+difftime(
+as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days"))) |>
+mutate (event_gap =
+as.numeric(
+difftime(
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days")))
+View(new_rm_data)
+new_rm_data <- new_rm_data |>
+select(-age_of_project)
+new_rm_data$age_of_project = NULL
+head(new_rm_data)
+write.csv(new_rm_data, file = "../final_data/deb_readme_did_updated.csv", row.names = FALSE)
+old_pop_readme <- merge(old_pop_readme, first_commit_df, by="upstream_vcs_link")
+####PopChange CSV
+old_pop_readme <- read_csv("../final_data/deb_readme_pop_change.csv")
+old_pop_readme <- merge(old_pop_readme, first_commit_df, by="upstream_vcs_link")
+new_pop_data <- old_pop_readme |>
+mutate(first_commit_dt = as.POSIXct(first_commit,
+format = "%a %b %d %H:%M:%S %Y %z")) |>
+mutate(age_in_days =
+as.numeric(
+difftime(
+as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days"))) |>
+mutate (event_gap =
+as.numeric(
+difftime(
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days")))
+new_pop_data$age_of_project = NULL
+head(new_pop_data)
+write.csv(new_pop_data, file = "../final_data/deb_readme_pop_change_updated.csv", row.names = FALSE)
+#CONTRIBUTING Document updates
+first_commit_contrib <- read_csv("../062424_did_first_commit_contrib.csv")
+####RDD CSV
+old_rdd_contrib <- read_csv("../final_data/deb_contrib_did.csv")
+old_rdd_contrib <- merge(old_rdd_contrib, first_commit_contrib, by="upstream_vcs_link")
+new_rdd_contrib_data <- old_rdd_contrib |>
+mutate(first_commit_dt = as.POSIXct(first_commit,
+format = "%a %b %d %H:%M:%S %Y %z")) |>
+mutate(age_in_days =
+as.numeric(
+difftime(
+as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days"))) |>
+mutate (event_gap =
+as.numeric(
+difftime(
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days")))
+new_rdd_contrib_data$age_of_project = NULL
+View(new_rdd_contrib_data)
+write.csv(new_rdd_contrib_data, file = "../final_data/deb_contrib_did_change_updated.csv", row.names = FALSE)
+####PopChange CSV
+old_pop_contrib <- read_csv("../final_data/deb_contrib_pop_change.csv")
+old_pop_contrib <- merge(old_pop_contrib, first_commit_contrib, by="upstream_vcs_link")
+new_pop_contrib_data <- old_pop_contrib |>
+mutate(first_commit_dt = as.POSIXct(first_commit,
+format = "%a %b %d %H:%M:%S %Y %z")) |>
+mutate(age_in_days =
+as.numeric(
+difftime(
+as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days"))) |>
+mutate (event_gap =
+as.numeric(
+difftime(
+as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"),
+first_commit_dt,
+units = "days")))
+new_pop_contrib_data$age_of_project = NULL
+write.csv(new_pop_contrib_data, file = "../final_data/deb_contrib_pop_change_updated.csv", row.names = FALSE)
+library(tidyverse)
+library(plyr)
+library(stringr)
+try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
+#load in data
+full_df <- read_csv("../final_data/deb_full_data.csv")
+contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
+readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")
+contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link")
+readme_df <- merge(full_df, readme_df, by="upstream_vcs_link")
+# age is calculated against December 11, 2023
+#some expansion needs to happens for each project
+expand_timeseries <- function(project_row) {
+longer <- project_row |>
+pivot_longer(cols = ends_with("new"),
+names_to = "window",
+values_to = "count") |>
+unnest(count) |>
+mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
+mutate(is_collab = as.numeric(str_detect(window, "collab")))
+return(longer)
+}
+expanded_readme_data <- expand_timeseries(readme_df[1,])
+for (i in 2:nrow(readme_df)){
+expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,]))
+}
+expanded_contrib_data <- expand_timeseries(contrib_df[1,])
+for (i in 2:nrow(contrib_df)){
+expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
+}
+expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
+expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
+#breaking out the types of population counts
+collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),]
+contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),]
+collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),]
+contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),]
+#import models
+library(lme4)
+library(optimx)
+library(MASS)
+simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc), data=collab_pop_readme)
+summary(simple_collab_readme_model)
+simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc) + age_in_days, data=collab_pop_readme)
+summary(simple_collab_readme_model)
+simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc) + scaled(age_in_days), data=collab_pop_readme)
+simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme)
+summary(simple_collab_readme_model)
+qqnorm(residuals(simple_collab_readme_model))
+simple_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=contrib_pop_readme)
+summary(simple_collab_readme_model)
+qqnorm(residuals(simple_collab_readme_model))
+simple_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=expanded_readme_data)
+summary(simple_collab_readme_model)
+qqnorm(residuals(simple_collab_readme_model))
+View(expanded_readme_data)
+library(tidyverse)
+library(plyr)
+library(stringr)
+try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
+#load in data
+full_df <- read_csv("../final_data/deb_full_data.csv")
+contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv")
+readme_df <- read_csv("../final_data/deb_readme_pop_change.csv")
+contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link")
+readme_df <- merge(full_df, readme_df, by="upstream_vcs_link")
+# age is calculated against December 11, 2023
+#some expansion needs to happens for each project
+expand_timeseries <- function(project_row) {
+longer <- project_row |>
+pivot_longer(cols = ends_with("new"),
+names_to = "window",
+values_to = "count") |>
+unnest(count) |>
+mutate(after_doc = as.numeric(str_detect(window, "after"))) |>
+mutate(is_collab = as.numeric(str_detect(window, "collab")))
+return(longer)
+}
+expanded_readme_data <- expand_timeseries(readme_df[1,])
+for (i in 2:nrow(readme_df)){
+expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,]))
+}
+expanded_contrib_data <- expand_timeseries(contrib_df[1,])
+for (i in 2:nrow(contrib_df)){
+expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,]))
+}
+expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count)
+expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count)
+expanded_readme_data$logcount <- log(expanded_readme_data$count)
+expanded_contrib_data$logcount <- log(expanded_contrib_data$count)
+#scale age
+expanded_readme_data$scaled_age <- scale(expanded_readme_data$age_in_days)
+expanded_contrib_data$scaled_age <- scale(expanded_contrib_data$age_in_days)
+#breaking out the types of population counts
+collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),]
+contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),]
+collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),]
+contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),]
+#import models
+library(lme4)
+library(optimx)
+library(MASS)
+simple_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=expanded_readme_data)
+summary(simple_collab_readme_model)
+simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme)
+summary(simple_collab_readme_model)
+qqnorm(residuals(simple_collab_readme_model))
+View(contrib_pop_readme)
+simple_contrib_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme)
+summary(simple_contrib_readme_model)
+qqnorm(residuals(simple_contrib_readme_model))
+View(collab_pop_readme)
+View(collab_pop_readme)
+View(contrib_pop_readme)
+#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda")
+collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_contrib)
+#contrib docs
+simple_collab_contrib_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_contrib)
+summary(simple_collab_contrib_model)
+#readme docs
+simple_collab_readme_model <- glm.nb(log1pcount ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme)
+summary(simple_collab_readme_model)
+simple_contrib_readme_model <- glm.nb(log1pcount ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme)
+summary(simple_contrib_readme_model)
+# I don't think MLM is the right one
+collab_readme_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + scaled_age + (after_doc| upstream_vcs_link), data=collab_pop_readme)
+summary(collab_readme_model)
+saveRDS(collab_readme_model, "final_models/0624_pop_rm_collab_better.rda")
+contrib_readme_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + scaled_age + (after_doc| upstream_vcs_link), data=contrib_pop_readme)
+summary(collab_contrib_model)
+summary(contrib_readme_model)
+summary(collab_readme_model)
+summary(contrib_readme_model)
+saveRDS(contrib_readme_model, "final_models/0624_pop_rm_contrib.rda")
+texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2,
+custom.model.names=c( 'collab','contrib.'  ),
+custom.coef.names=c('(Intercept)', 'after_introduction', 'etc'),
+use.packages=FALSE, table=FALSE, ci.force = TRUE)
+#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda")
+collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + scaled_age + (after_doc| upstream_vcs_link), data=collab_pop_contrib)
+summary(collab_contrib_model)
+contrib_pop_readme |>
+ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(after_doc))) +
+geom_violin()
+View(contrib_pop_contrib)
+#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda")
+#contributing models are not statistically significant
+contrib_contrib_model <- glm.nb(log1pcount ~ as.factor(after_doc) + event_gap , data=contrib_pop_contrib)
+summary(contrib_contrib_model)
+#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda")
+#contributing models are not statistically significant
+contrib_contrib_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + event_gap + (after_doc | upstream_vcs_link), data=contrib_pop_contrib)
+#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda")
+#contributing models are not statistically significant
+contrib_contrib_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + scale(event_gap) + (after_doc | upstream_vcs_link), data=contrib_pop_contrib)
+summary(contrib_contrib_model)
+#all_gmodel <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + scaled_event_gap + (D * week_offset | upstream_vcs_link),
+#              control=glmerControl(optimizer="bobyqa",
+#                                           optCtrl=list(maxfun=2e5)), nAGQ=0, data=all_actions_data)
+all_gmodel <- readRDS("0512_contrib_all.rda")
+summary(all_gmodel)
diff --git a/R/contrib_ranef_grouping.png b/R/contrib_ranef_grouping.png
new file mode 100644
index 0000000..2da6fe0
Binary files /dev/null and b/R/contrib_ranef_grouping.png differ
diff --git a/R/final_models/0624_contrib_all_rdd.rda b/R/final_models/0624_contrib_all_rdd.rda
new file mode 100644
index 0000000..c67a66b
Binary files /dev/null and b/R/final_models/0624_contrib_all_rdd.rda differ
diff --git a/R/051224_contrib_grouped.csv b/final_data/contrib_rdd_groupings.csv
similarity index 100%
rename from R/051224_contrib_grouped.csv
rename to final_data/contrib_rdd_groupings.csv