diff --git a/R/.Rhistory b/R/.Rhistory index adc20ad..54e4a59 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,502 +1,33 @@ -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -theme(legend.position = "top") + -xlim(-10, 90) + -theme_bw() -contributing_reading_ease -grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) -# plotting contributing mcalpine eflaw -contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 70) + -guides(fill="none")+ -theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -theme(legend.position = "top") + -xlim(-10, 90) + -theme_bw() -contributing_reading_ease -grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) -# plotting contributing mcalpine eflaw -contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 70) + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -theme(legend.position = "top") + -xlim(-10, 90) + -theme_bw() -contributing_reading_ease -grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) -# plotting contributing linsear writing formula -contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-30, 30) + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading time -contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing mcalpine eflaw -contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 70) + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme_bw(legend.position = "top") -contributing_reading_ease -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme_bw(legend.position = "left") -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme(legend.position = "left") -contributing_reading_ease -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme(legend.position = "left") + -theme_bw() -contributing_reading_ease -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme_bw(legend.position = "left") -contributing_reading_ease -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -opts(legend.position = "left") + -theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme(legend.position = "left") + -theme_bw() -contributing_reading_ease -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "left") -contributing_reading_ease -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "top") -contributing_reading_ease -grid.arrange(contributing_reading_ease, contributing_linsear_plot, contributing_mcalpine_eflaw, contributing_reading_time_plot, nrow = 2) -readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "top") -readme_reading_ease -#plotting readme reading ease -readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "top") -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -guides(fill="none", color="none")+ -theme_bw() -grid.arrange(readme_reading_ease, readme_reading_time, contributing_reading_ease, contributing_reading_time_plot, nrow = 2) -grid.arrange(readme_reading_ease, readme_reading_time_plot, contributing_reading_ease, contributing_reading_time_plot, nrow = 2) -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) -# establishing the color scheme -irisColors <- -setNames( c('red', 'forestgreen', 'blue') -, levels(contributing_df$subdir) ) -# establishing the color scheme -subdirColors <- -setNames( c('red', 'forestgreen', 'blue') -, levels(contributing_df$subdir) ) -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "top") -contributing_reading_ease -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -guides(fill="none", color="none")+ -theme_bw() -#plot of reading_ease -#readme_df <- readme_df |> -# mutate(coef_grouping <- as.factor(subdir)) -#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df) -#summary(test_lm) -aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median) -readme_reading_time_plot -#plotting readme reading ease -readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "top") -readme_reading_ease -#plotting readme reading ease -readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -guides(fill="none", color="none")+ -theme_bw() -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -guides(fill="none", color="none")+ -theme_bw() + -theme(axis.text.y=element_blank()) -readme_reading_time_plot -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -guides(fill="none", color="none")+ -theme_bw(axis.text.y=element_blank()) + -theme(axis.text.y=element_blank()) -readme_reading_time_plot -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 100) + -guides(fill="none", color="none")+ -theme_bw() + -theme(axis.text.y=element_blank()) -readme_reading_time_plot -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 110) + -guides(fill="none", color="none")+ -theme_bw() + -theme(axis.text.y=element_blank()) -readme_reading_time_plot -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 140) + -guides(fill="none", color="none")+ -theme_bw() + -theme(axis.text.y=element_blank()) -readme_reading_time_plot -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 140) + -guides(fill="none", color="none")+ -theme_bw() + -theme(axis.title.y=element_blank()) -readme_reading_time_plot -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 140) + -guides(fill="none", color="none")+ -theme_bw() -#theme(axis.title.y=element_blank()) -readme_reading_time_plot -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 140) + -guides(fill="none", color="none")+ -theme_bw() -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -guides(fill="none", color="none")+ -theme_bw() -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2, labels = c("a)","b)")) -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) -library(ggpubr) -ggarrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2,labels = c("a)","b)") ) -ggarrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, ncol = 2, nrow = 2,labels = c("a)","b)") ) -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) -# plotting contributing reading time -contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -ylab("readme density") + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "top") -#plotting readme reading ease -readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading time -contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -ylab("contributing density") + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -ylab("contributing density") + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "top") -contributing_reading_ease -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) -library(tidyverse) -library(plyr) -library(gridExtra) -library(ggpubr) -# script for the analysis of document readability metrics -# readability metrics will be studied controlled by their length -# gaughan@u.northwestern.edu -# loading in the data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../text_analysis/draft_readability_readme.csv") -contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv") -#getting basic stats for the readme readability -median(readme_df$flesch_reading_ease) -median(readme_df$linsear_write_formula) -median(contributing_df$reading_time) -median(contributing_df$linsear_write_formula) -#plotting readme reading ease -readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -readme_reading_ease -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -# establishing the color scheme -subdirColors <- -setNames( c('firebrick1', 'forestgreen', 'cornflowerblue') -, levels(contributing_df$subdir) ) -#plotting readme reading ease -readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading time -contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -ylab("contributing density") + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -ylab("contributing density") + -xlim(-10, 90) + -theme_bw() + -theme(legend.position = "top") -contributing_reading_ease -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,readme_reading_ease, readme_reading_time_plot, nrow = 2) -# plotting contributing linsear writing formula -contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-30, 30) + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading time -contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 90) + -ylab("contributing density") + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing mcalpine eflaw -contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 70) + -guides(fill="none", color="none")+ -theme_bw() -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw readme_reading_ease, readme_reading_time_plot, nrow = 2) -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw, readme_reading_ease, readme_reading_time_plot, nrow = 2) -readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-10, 30) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing linsear writing formula -contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 30) + -guides(fill="none", color="none")+ -theme_bw() -readme_mcalpine_eflaw <- ggplot(readme_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 60) + -guides(fill="none", color="none")+ -theme_bw() -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw, readme_reading_ease, readme_reading_time_plot, readme_linsear_plot, readme_mcalpine_eflaw, nrow = 2) -# plotting contributing mcalpine eflaw -contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + -scale_color_manual(values = subdirColors) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-10, 60) + -guides(fill="none", color="none")+ -theme_bw() -#plotting readme reading ease -readme_reading_ease <- ggplot(readme_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-5, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -#plotting readme reading time -readme_reading_time_plot <- ggplot(readme_df, aes(x=reading_time, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-5, 90) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -readme_linsear_plot <- ggplot(readme_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -xlim(-5, 30) + -ylab("readme density") + -guides(fill="none", color="none")+ -theme_bw() -readme_mcalpine_eflaw <- ggplot(readme_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + -scale_color_manual(values = subdirColors) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-5, 60) + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing linsear writing formula -contributing_linsear_plot <- ggplot(contributing_df, aes(x=linsear_write_formula, group=as.factor(subdir))) + -scale_color_manual(values = subdirColors) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-5, 30) + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading time -contributing_reading_time_plot <- ggplot(contributing_df, aes(x=reading_time, group=as.factor(subdir))) + -scale_color_manual(values = subdirColors) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-5, 90) + -ylab("contributing density") + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing mcalpine eflaw -contributing_mcalpine_eflaw <- ggplot(contributing_df, aes(x=mcalpine_eflaw, group=as.factor(subdir))) + -scale_color_manual(values = subdirColors) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -xlim(-5, 60) + -guides(fill="none", color="none")+ -theme_bw() -# plotting contributing reading ease -contributing_reading_ease <- ggplot(contributing_df, aes(x=flesch_reading_ease, group=as.factor(subdir))) + -geom_density(aes(color = as.factor(subdir), fill=as.factor(subdir)), alpha=0.2, position="identity") + -scale_color_manual(values = subdirColors) + -ylab("contributing density") + -xlim(-5, 90) + -theme_bw() + -theme(legend.position = "top") -grid.arrange(contributing_reading_ease, contributing_reading_time_plot,contributing_linsear_plot, contributing_mcalpine_eflaw, readme_reading_ease, readme_reading_time_plot, readme_linsear_plot, readme_mcalpine_eflaw, nrow = 2) +contrib_readme_model <- load(file = "final_models/0623_pop_rm_contrib.rda") +contrib_readme_model <- load(file = "final_models/0623_pop_rm_contrib.rda") +contrib_readme_model <- load("final_models/0623_pop_rm_contrib.rda") +contrib_readme_model <- source("final_models/0623_pop_rm_contrib.rda") +contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda") +contrib_readme_model <- readRDS("final_models/0623_pop_contrib_collab.rda") +collab_readme_model <- readRDS("final_models/0623_pop_rm_collab.rda") +texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, +custom.model.names=c( 'collab','contrib.' ), +custom.coef.names=c('(Intercept)', 'after_introduction'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +library(texreg) +texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, +custom.model.names=c( 'collab','contrib.' ), +custom.coef.names=c('(Intercept)', 'after_introduction'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, +custom.model.names=c( 'collab','contrib.' ), +custom.coef.names=c('(Intercept)', 'after_introduction' 'etc'), +texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, +custom.model.names=c( 'collab','contrib.' ), +custom.coef.names=c('(Intercept)', 'after_introduction', 'etc'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) library(tidyverse) library(plyr) library(stringr) try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) #load in data contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") -View(contrib_df) -expanded_contrib_data <- expand_timeseries(contrib_df[1,]) -for (i in 2:nrow(contrib_df)){ -expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) -} +readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") #some expansion needs to happens for each project expand_timeseries <- function(project_row) { longer <- project_row |> @@ -508,5 +39,474 @@ mutate(after_doc = as.numeric(str_detect(window, "after"))) |> mutate(is_collab = as.numeric(str_detect(window, "collab"))) return(longer) } +expanded_readme_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) +} expanded_contrib_data <- expand_timeseries(contrib_df[1,]) -View(expanded_contrib_data) +for (i in 2:nrow(contrib_df)){ +expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) +} +expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) +expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) +expanded_readme_data$logcount <- log(expanded_readme_data$count) +expanded_contrib_data$logcount <- log(expanded_contrib_data$count) +#breaking out the types of population counts +collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] +contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] +collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] +contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] +simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc), data=collab_pop_readme) +summary(simple_collab_readme_model) +anova(simple_collab_readme_model, collab_readme_model) +summary(collab_readme_model) +#load in data +full_df <- read_csv("../final_data/deb_full_data.csv") +contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") +contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link") +View(contrib_df) +View(contrib_df) +readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") +readme_df <- merge(full_df, readme_df, by="upstream_vcs_link") +# age is calculated against December 11, 2023 +contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) +View(contrib_df) +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) +View(contrib_df) +View(contrib_df) +View(readme_df) +readme_df <- readme_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) +View(readme_df) +collab_readme_model_plus <- glmer.nb(log1pcount ~ as.factor(after_doc) + event_date + (after_doc| upstream_vcs_link), data=collab_pop_readme) +#some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = ends_with("new"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +mutate(after_doc = as.numeric(str_detect(window, "after"))) |> +mutate(is_collab = as.numeric(str_detect(window, "collab"))) +return(longer) +} +expanded_readme_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) +} +expanded_contrib_data <- expand_timeseries(contrib_df[1,]) +for (i in 2:nrow(contrib_df)){ +expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) +} +expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) +expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) +expanded_readme_data$logcount <- log(expanded_readme_data$count) +expanded_contrib_data$logcount <- log(expanded_contrib_data$count) +#breaking out the types of population counts +collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] +contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] +collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] +contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] +library(tidyverse) +library(plyr) +library(stringr) +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +#load in data +full_df <- read_csv("../final_data/deb_full_data.csv") +contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") +View(contrib_df) +readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") +contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link") +readme_df <- merge(full_df, readme_df, by="upstream_vcs_link") +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) + +mutate(event_date_days = as.Date("2024-06-24") - event_date) + +readme_df <- readme_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) + +mutate(event_date_days = as.Date("2024-06-24") - event_date) +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) + +mutate(event_date_days = diff.Date(as.Date("2023-12-11"),event_date, units = "days")) +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) |> +mutate(event_date_days = diff.Date(as.Date("2023-12-11"),event_date, units = "days")) +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) |> +mutate(event_date_days = diff.Date(as.Date("2024-06-24"),event_date, units = "days")) +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) |> +mutate(event_date_days = diff.Date(as.Date("2024-06-24"),as.Date(event_date), units = "days")) +View(contrib_df) +View(contrib_df) +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) |> +mutate(event_date_days = diff.Date(as.Date("2024-06-24"),as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), units = "days")) +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) |> +mutate(event_date_days = +as.numeric( +difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S") +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +# age is calculated against December 11, 2023 +contrib_df <- contrib_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) |> +mutate(event_date_days = +as.numeric( +difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +units = "days"))) +View(contrib_df) +readme_df <- readme_df |> +mutate(start_date = as.Date("2023-12-11") - age_of_project) |> +mutate(event_date_days = +as.numeric( +difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +units = "days"))) +#some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = ends_with("new"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +mutate(after_doc = as.numeric(str_detect(window, "after"))) |> +mutate(is_collab = as.numeric(str_detect(window, "collab"))) +return(longer) +} +expanded_readme_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) +} +expanded_contrib_data <- expand_timeseries(contrib_df[1,]) +for (i in 2:nrow(contrib_df)){ +expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) +} +expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) +expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) +expanded_readme_data$logcount <- log(expanded_readme_data$count) +expanded_contrib_data$logcount <- log(expanded_contrib_data$count) +#breaking out the types of population counts +collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] +contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] +collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] +contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] +#import models +library(lme4) +library(optimx) +collab_readme_model_plus <- glmer.nb(log1pcount ~ as.factor(after_doc) + event_date_days + (after_doc| upstream_vcs_link), data=collab_pop_readme) +anova(collab_readme_model_plus, collab_readme_model) +collab_readme_model <- readRDS("final_models/0623_pop_rm_collab.rda") +anova(collab_readme_model_plus, collab_readme_model) +saveRDS(collab_readme_model, "final_models/0623_pop_rm_collab_better.rda") +summary(collab_readme_model_plus) +summary(collab_readme_model) +library(tidyverse) +#things to get: +# - delete old age column +# - normal age, in date +# - age from today in days +# - delta between first commit and document in days +#README Document updates +#loading in new ages +####RDD CSV +first_commit_df <- read_csv("../062424_did_first_commit_readme.csv") +first_commit_df_2 <- read_csv("../062424_did_first_commit_readme_2.csv") +first_commit_df <- rbind(first_commit_df, first_commit_df_2) +# need to first do an rbind with this data and the second file +# check with the head of the file/size of the file +old_rdd_readme <- read_csv("../final_data/deb_readme_did.csv") +old_rdd_readme <- merge(old_rdd_readme, first_commit_df, by="upstream_vcs_link") +new_rm_data <- old_rdd_readme |> +select(-age_of_project) |> +mutate(first_commit_dt = as.POSIXct(first_commit, +format = "%a %b %d %H:%M:%S %Y %z")) |> +mutate(age_in_days = +as.numeric( +difftime( +as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) |> +mutate (event_gap = +as.numeric( +difftime( +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) +View(old_rdd_readme) +new_rm_data <- old_rdd_readme |> +select(-c(age_of_project)) |> +mutate(first_commit_dt = as.POSIXct(first_commit, +format = "%a %b %d %H:%M:%S %Y %z")) |> +mutate(age_in_days = +as.numeric( +difftime( +as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) |> +mutate (event_gap = +as.numeric( +difftime( +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) +new_rm_data <- old_rdd_readme |> +mutate(first_commit_dt = as.POSIXct(first_commit, +format = "%a %b %d %H:%M:%S %Y %z")) |> +mutate(age_in_days = +as.numeric( +difftime( +as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) |> +mutate (event_gap = +as.numeric( +difftime( +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) +View(new_rm_data) +new_rm_data <- new_rm_data |> +select(-age_of_project) +new_rm_data$age_of_project = NULL +head(new_rm_data) +write.csv(new_rm_data, file = "../final_data/deb_readme_did_updated.csv", row.names = FALSE) +old_pop_readme <- merge(old_pop_readme, first_commit_df, by="upstream_vcs_link") +####PopChange CSV +old_pop_readme <- read_csv("../final_data/deb_readme_pop_change.csv") +old_pop_readme <- merge(old_pop_readme, first_commit_df, by="upstream_vcs_link") +new_pop_data <- old_pop_readme |> +mutate(first_commit_dt = as.POSIXct(first_commit, +format = "%a %b %d %H:%M:%S %Y %z")) |> +mutate(age_in_days = +as.numeric( +difftime( +as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) |> +mutate (event_gap = +as.numeric( +difftime( +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) +new_pop_data$age_of_project = NULL +head(new_pop_data) +write.csv(new_pop_data, file = "../final_data/deb_readme_pop_change_updated.csv", row.names = FALSE) +#CONTRIBUTING Document updates +first_commit_contrib <- read_csv("../062424_did_first_commit_contrib.csv") +####RDD CSV +old_rdd_contrib <- read_csv("../final_data/deb_contrib_did.csv") +old_rdd_contrib <- merge(old_rdd_contrib, first_commit_contrib, by="upstream_vcs_link") +new_rdd_contrib_data <- old_rdd_contrib |> +mutate(first_commit_dt = as.POSIXct(first_commit, +format = "%a %b %d %H:%M:%S %Y %z")) |> +mutate(age_in_days = +as.numeric( +difftime( +as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) |> +mutate (event_gap = +as.numeric( +difftime( +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) +new_rdd_contrib_data$age_of_project = NULL +View(new_rdd_contrib_data) +write.csv(new_rdd_contrib_data, file = "../final_data/deb_contrib_did_change_updated.csv", row.names = FALSE) +####PopChange CSV +old_pop_contrib <- read_csv("../final_data/deb_contrib_pop_change.csv") +old_pop_contrib <- merge(old_pop_contrib, first_commit_contrib, by="upstream_vcs_link") +new_pop_contrib_data <- old_pop_contrib |> +mutate(first_commit_dt = as.POSIXct(first_commit, +format = "%a %b %d %H:%M:%S %Y %z")) |> +mutate(age_in_days = +as.numeric( +difftime( +as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) |> +mutate (event_gap = +as.numeric( +difftime( +as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), +first_commit_dt, +units = "days"))) +new_pop_contrib_data$age_of_project = NULL +write.csv(new_pop_contrib_data, file = "../final_data/deb_contrib_pop_change_updated.csv", row.names = FALSE) +library(tidyverse) +library(plyr) +library(stringr) +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +#load in data +full_df <- read_csv("../final_data/deb_full_data.csv") +contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") +readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") +contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link") +readme_df <- merge(full_df, readme_df, by="upstream_vcs_link") +# age is calculated against December 11, 2023 +#some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = ends_with("new"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +mutate(after_doc = as.numeric(str_detect(window, "after"))) |> +mutate(is_collab = as.numeric(str_detect(window, "collab"))) +return(longer) +} +expanded_readme_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) +} +expanded_contrib_data <- expand_timeseries(contrib_df[1,]) +for (i in 2:nrow(contrib_df)){ +expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) +} +expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) +expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) +#breaking out the types of population counts +collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] +contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] +collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] +contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] +#import models +library(lme4) +library(optimx) +library(MASS) +simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc), data=collab_pop_readme) +summary(simple_collab_readme_model) +simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc) + age_in_days, data=collab_pop_readme) +summary(simple_collab_readme_model) +simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc) + scaled(age_in_days), data=collab_pop_readme) +simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme) +summary(simple_collab_readme_model) +qqnorm(residuals(simple_collab_readme_model)) +simple_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=contrib_pop_readme) +summary(simple_collab_readme_model) +qqnorm(residuals(simple_collab_readme_model)) +simple_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=expanded_readme_data) +summary(simple_collab_readme_model) +qqnorm(residuals(simple_collab_readme_model)) +View(expanded_readme_data) +library(tidyverse) +library(plyr) +library(stringr) +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +#load in data +full_df <- read_csv("../final_data/deb_full_data.csv") +contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") +readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") +contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link") +readme_df <- merge(full_df, readme_df, by="upstream_vcs_link") +# age is calculated against December 11, 2023 +#some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = ends_with("new"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +mutate(after_doc = as.numeric(str_detect(window, "after"))) |> +mutate(is_collab = as.numeric(str_detect(window, "collab"))) +return(longer) +} +expanded_readme_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) +} +expanded_contrib_data <- expand_timeseries(contrib_df[1,]) +for (i in 2:nrow(contrib_df)){ +expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) +} +expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) +expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) +expanded_readme_data$logcount <- log(expanded_readme_data$count) +expanded_contrib_data$logcount <- log(expanded_contrib_data$count) +#scale age +expanded_readme_data$scaled_age <- scale(expanded_readme_data$age_in_days) +expanded_contrib_data$scaled_age <- scale(expanded_contrib_data$age_in_days) +#breaking out the types of population counts +collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] +contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] +collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] +contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] +#import models +library(lme4) +library(optimx) +library(MASS) +simple_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=expanded_readme_data) +summary(simple_collab_readme_model) +simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme) +summary(simple_collab_readme_model) +qqnorm(residuals(simple_collab_readme_model)) +View(contrib_pop_readme) +simple_contrib_readme_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme) +summary(simple_contrib_readme_model) +qqnorm(residuals(simple_contrib_readme_model)) +View(collab_pop_readme) +View(collab_pop_readme) +View(contrib_pop_readme) +#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda") +collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + (after_doc| upstream_vcs_link), data=collab_pop_contrib) +#contrib docs +simple_collab_contrib_model <- glm.nb(count ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_contrib) +summary(simple_collab_contrib_model) +#readme docs +simple_collab_readme_model <- glm.nb(log1pcount ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme) +summary(simple_collab_readme_model) +simple_contrib_readme_model <- glm.nb(log1pcount ~ as.factor(after_doc) + scale(age_in_days), data=collab_pop_readme) +summary(simple_contrib_readme_model) +# I don't think MLM is the right one +collab_readme_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + scaled_age + (after_doc| upstream_vcs_link), data=collab_pop_readme) +summary(collab_readme_model) +saveRDS(collab_readme_model, "final_models/0624_pop_rm_collab_better.rda") +contrib_readme_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + scaled_age + (after_doc| upstream_vcs_link), data=contrib_pop_readme) +summary(collab_contrib_model) +summary(contrib_readme_model) +summary(collab_readme_model) +summary(contrib_readme_model) +saveRDS(contrib_readme_model, "final_models/0624_pop_rm_contrib.rda") +texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, +custom.model.names=c( 'collab','contrib.' ), +custom.coef.names=c('(Intercept)', 'after_introduction', 'etc'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda") +collab_contrib_model <- glmer.nb(log1pcount ~ after_doc + scaled_age + (after_doc| upstream_vcs_link), data=collab_pop_contrib) +summary(collab_contrib_model) +contrib_pop_readme |> +ggplot(aes(x = after_doc, y = log1pcount, col = as.factor(after_doc))) + +geom_violin() +View(contrib_pop_contrib) +#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda") +#contributing models are not statistically significant +contrib_contrib_model <- glm.nb(log1pcount ~ as.factor(after_doc) + event_gap , data=contrib_pop_contrib) +summary(contrib_contrib_model) +#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda") +#contributing models are not statistically significant +contrib_contrib_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + event_gap + (after_doc | upstream_vcs_link), data=contrib_pop_contrib) +#contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda") +#contributing models are not statistically significant +contrib_contrib_model <- glmer.nb(log1pcount ~ as.factor(after_doc) + scale(event_gap) + (after_doc | upstream_vcs_link), data=contrib_pop_contrib) +summary(contrib_contrib_model) +#all_gmodel <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + scaled_event_gap + (D * week_offset | upstream_vcs_link), +# control=glmerControl(optimizer="bobyqa", +# optCtrl=list(maxfun=2e5)), nAGQ=0, data=all_actions_data) +all_gmodel <- readRDS("0512_contrib_all.rda") +summary(all_gmodel) diff --git a/R/contrib_ranef_grouping.png b/R/contrib_ranef_grouping.png new file mode 100644 index 0000000..2da6fe0 Binary files /dev/null and b/R/contrib_ranef_grouping.png differ diff --git a/R/final_models/0624_contrib_all_rdd.rda b/R/final_models/0624_contrib_all_rdd.rda new file mode 100644 index 0000000..c67a66b Binary files /dev/null and b/R/final_models/0624_contrib_all_rdd.rda differ diff --git a/R/051224_contrib_grouped.csv b/final_data/contrib_rdd_groupings.csv similarity index 100% rename from R/051224_contrib_grouped.csv rename to final_data/contrib_rdd_groupings.csv