, c(0,1,2) )  # tail of a call that starts above this section; kept verbatim

# ---- Plot helpers ------------------------------------------------------------
# The exploration below redraws the same few figures many times while only the
# palette, a label or one geom argument changes. The helpers hold the shared
# recipe so each step states only what differs.

# Legend labels for the three random-effect (RE) groupings.
re_group_labels <- c("CI < 0", "0 in CI", "0 < CI")

# Break positions on the log1p-count axis; tick text is back-transformed with
# expm1() so the axis reads in raw commit counts.
log_count_breaks <- c(0, 0.5, 1.0, 1.5)

# Ranked RE estimates drawn as conf.low..conf.high ranges, coloured by
# `ranef_grouping`, with the legend on top.
#   groupings: data frame with rank, estimate, conf.low, conf.high,
#              ranef_grouping columns
#   palette:   colour vector passed to scale_color_manual()
plot_ranef_groupings <- function(groupings, palette) {
  groupings |>
    ggplot(aes(x = rank, y = estimate, col = as.factor(ranef_grouping))) +
    geom_linerange(aes(ymin = conf.low, ymax = conf.high)) +
    scale_color_manual(values = palette, labels = re_group_labels) +
    theme_bw() +
    theme(legend.position = "top") +
    labs(
      x = "RE Coefficient Rank",
      y = "RE Coefficient Estimate",
      color = "Estimate Grouping"
    )
}

# Density of one readability metric, filled by `subdir`, on the fixed
# x-range (-5, 90) with theme_bw(). Callers add labs() and legend handling.
#   df:          data frame with a `subdir` column and the metric column
#   x_col:       name of the metric column, as a string
#   palette:     fill colours passed to scale_fill_manual()
#   position:    position adjustment for geom_density()
#   alpha:       fill alpha; NULL leaves the geom default untouched
#   fill_labels: legend labels; waiver() keeps ggplot2's defaults
plot_subdir_density <- function(df, x_col, palette,
                                position = "identity", alpha = NULL,
                                fill_labels = waiver()) {
  density_layer <- if (is.null(alpha)) {
    geom_density(aes(fill = as.factor(subdir)), position = position)
  } else {
    geom_density(
      aes(fill = as.factor(subdir)),
      alpha = alpha, position = position
    )
  }
  ggplot(df, aes(x = .data[[x_col]], group = as.factor(subdir))) +
    density_layer +
    scale_fill_manual(values = palette, labels = fill_labels) +
    xlim(-5, 90) +
    theme_bw()
}

# Legend placed inside the panel near the top-right corner.
#   direction: "horizontal" or "vertical"
inside_legend_theme <- function(direction) {
  theme(
    legend.position = "inside",
    legend.position.inside = c(.93, .93),
    legend.justification = c("right", "top"),
    legend.direction = direction,
    legend.margin = margin(6, 6, 6, 6)
  )
}

# Word-count density per document type, legend on top.
#   docs:    data frame with word_count and type columns
#   alpha:   fill alpha
#   palette: optional fill colours; NULL keeps ggplot2's default fill scale
plot_word_count_density <- function(docs, alpha, palette = NULL) {
  p <- ggplot(docs, aes(x = word_count, group = as.factor(type))) +
    geom_density(
      aes(fill = as.factor(type)),
      color = NA, alpha = alpha, position = "identity"
    )
  if (!is.null(palette)) {
    p <- p + scale_fill_manual(values = palette)
  }
  p +
    xlim(-10, 500) +
    labs(
      x = "Word Count",
      y = "Density Across Documents",
      fill = "Document Type"
    ) +
    theme_bw() +
    theme(legend.position = "top")
}

# Smoothed weekly commit activity per document type with a vertical line at
# week_offset == 0.
#   actions:  data frame with week_offset, document_type and the y column
#   palette:  colours passed to scale_color_manual()
#   y_label:  y-axis title
#   y_col:    name of the y column, as a string
#   y_breaks: optional breaks on the log1p scale, labelled with expm1() values
plot_weekly_commits <- function(actions, palette, y_label,
                                y_col = "log1p_count", y_breaks = NULL) {
  p <- ggplot(
    actions,
    aes(x = week_offset, y = .data[[y_col]], color = factor(document_type))
  )
  if (!is.null(y_breaks)) {
    p <- p + scale_y_continuous(
      breaks = y_breaks,
      labels = round(expm1(y_breaks), 1)
    )
  }
  p +
    labs(x = "Weekly Offset", y = y_label, color = "Document Type") +
    scale_color_manual(values = palette) +
    geom_smooth() +
    geom_vline(xintercept = 0) +
    theme_bw() +
    theme(legend.position = "top")
}

# ---- RE grouping plots: palette exploration ----------------------------------
contrib_g <- plot_ranef_groupings(contrib_groupings, subdirColors)
contrib_g

subdirColors <- setNames(c("#942e55", "#78c58a", "#9b6e29"), c(0, 1, 2))
contrib_g <- plot_ranef_groupings(contrib_groupings, subdirColors)
contrib_g

subdirColors <- setNames(c("#a1a596", "#557784", "#2f6382"), c(0, 1, 2))
contrib_g <- plot_ranef_groupings(contrib_groupings, subdirColors)
contrib_g

subdirColors <- setNames(c("#a3b0c9", "#101f31", "#28578d"), c(0, 1, 2))
# README version: same geometry, default legend labels, legend hidden.
readme_g <- readme_groupings |>
  ggplot(aes(x = rank, y = estimate, col = as.factor(ranef_grouping))) +
  geom_linerange(aes(ymin = conf.low, ymax = conf.high)) +
  scale_color_manual(values = subdirColors) +
  guides(fill = "none", color = "none") +
  theme_bw() +
  labs(
    x = "RE Coefficient Rank",
    y = "RE Coefficient Estimate",
    color = "Estimate Grouping"
  )
readme_g
contrib_g <- plot_ranef_groupings(contrib_groupings, subdirColors)
contrib_g

subdirColors <- setNames(c("#f8f06b", "#ca7780", "#a13795"), c(0, 1, 2))
contrib_g <- plot_ranef_groupings(contrib_groupings, subdirColors)
contrib_g

# ---- Document readability analysis -------------------------------------------
library(tidyverse)
# NOTE(review): plyr is attached after tidyverse, so it masks same-named dplyr
# functions (mutate, summarise, ...). Nothing visible here depends on either
# version; confirm before reordering.
library(plyr)
library(gridExtra)
library(ggpubr)
# script for the analysis of document readability metrics
# readability metrics will be studied controlled by their length
# gaughan@u.northwestern.edu
# loading in the data
# Best effort: only resolves when run from an open RStudio document.
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../text_analysis/dwo_readability_readme.csv")
contributing_df <- read_csv("../text_analysis/dwo_readability_contributing.csv")
# establishing the color scheme
# NOTE(review): levels() is NULL unless `subdir` is a factor, and read_csv()
# does not create factors by default, so these palettes are probably unnamed
# and matched by position. Confirm before switching to levels(factor(...)).
subdirColors <- setNames(
  c("#f8f06b", "#ca7780", "#a13795"),
  levels(contributing_df$subdir)
)
readmeSubdirColors <- setNames(
  c("#ca7780", "#a13795"),
  levels(readme_df$subdir)
)
# plotting linsear scoring
# First draft: default colours, outline and fill both mapped, narrower x-range.
readme_linsear_plot <- ggplot(
  readme_df,
  aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  geom_density(
    aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.35, position = "identity"
  ) +
  xlim(-30, 30) +
  theme_bw()
# plotting readme reading ease
readme_reading_ease <- plot_subdir_density(
  readme_df, "flesch_reading_ease", readmeSubdirColors,
  alpha = 0.35
) +
  labs(x = "Flesch Reading Ease", y = "README Density") +
  guides(fill = "none", color = "none")
readme_reading_ease
# plotting readme reading time
readme_reading_time_plot <- plot_subdir_density(
  readme_df, "reading_time", readmeSubdirColors,
  position = "fill", alpha = 0.8
) +
  labs(x = "Reading Time (s)", y = NULL) +
  guides(fill = "none", color = "none")
readme_reading_time_plot
readme_reading_time_no_group <- ggplot(readme_df, aes(x = reading_time)) +
  geom_histogram(fill = "forestgreen") +
  xlim(-5, 190) +
  ylab("Count of README Files") +
  xlab("Reading Time (s)") +
  ggtitle("Reading Time for README files from FLOSS Projects (n=2280)") +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_time_no_group
readme_linsear_plot <- plot_subdir_density(
  readme_df, "linsear_write_formula", readmeSubdirColors,
  alpha = 0.35
) +
  labs(x = "Linsear Write Score", y = NULL) +
  guides(fill = "none", color = "none")
readme_linsear_plot
readme_mcalpine_eflaw <- plot_subdir_density(
  readme_df, "mcalpine_eflaw", readmeSubdirColors,
  alpha = 0.35
) +
  labs(x = "McAlpine EFLAW", y = NULL) +
  guides(fill = "none", color = "none")
#theme(axis.title.y=element_blank())
#plot of reading_ease
#readme_df <- readme_df |>
#  mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
# Per-subdir medians of columns 3 to 10.
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)
# plotting contributing linsear writing formula
contributing_linsear_plot <- plot_subdir_density(
  contributing_df, "linsear_write_formula", subdirColors,
  alpha = 0.35, fill_labels = re_group_labels
) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  guides(fill = "none", color = "none")
# plotting contributing reading time
contributing_reading_time_plot <- plot_subdir_density(
  contributing_df, "reading_time", subdirColors,
  alpha = 0.35, fill_labels = re_group_labels
) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  inside_legend_theme("horizontal")
# plotting contributing mcalpine eflaw
contributing_mcalpine_eflaw <- plot_subdir_density(
  contributing_df, "mcalpine_eflaw", subdirColors,
  alpha = 0.35, fill_labels = re_group_labels
) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  inside_legend_theme("vertical")
# plotting contributing reading ease
contributing_reading_ease <- plot_subdir_density(
  contributing_df, "flesch_reading_ease", subdirColors,
  alpha = 0.35, fill_labels = re_group_labels
) +
  labs(x = NULL, y = "CONTRIBUTING Density", fill = "RE Grouping") +
  guides(fill = "none", color = "none")
contributing_reading_ease
grid.arrange(
  contributing_reading_ease, contributing_linsear_plot,
  contributing_mcalpine_eflaw,
  readme_reading_ease, readme_linsear_plot, readme_mcalpine_eflaw,
  nrow = 2
)
readme_df$type <- "README"
contributing_df$type <- "CONTRIBUTING"
all_df <- rbind(readme_df, contributing_df)
length_plot_all <- plot_word_count_density(all_df, alpha = 0.4)
length_plot_all

# ---- Re-run of the grid and length plot --------------------------------------
grid.arrange(
  contributing_reading_ease, contributing_linsear_plot,
  contributing_mcalpine_eflaw,
  readme_reading_ease, readme_linsear_plot, readme_mcalpine_eflaw,
  nrow = 2
)
length_plot_all <- plot_word_count_density(all_df, alpha = 0.4)
length_plot_all

# ---- position = "fill" variants of the readability panels --------------------
# plotting contributing reading time
contributing_reading_time_plot <- plot_subdir_density(
  contributing_df, "reading_time", subdirColors,
  position = "fill", fill_labels = re_group_labels
) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  inside_legend_theme("horizontal")
contributing_reading_time_plot
grid.arrange(
  contributing_reading_ease, contributing_reading_time_plot,
  readme_reading_ease, readme_reading_time_plot,
  nrow = 2
)
# plotting readme reading time
readme_reading_time_plot <- plot_subdir_density(
  readme_df, "reading_time", readmeSubdirColors,
  position = "fill"
) +
  labs(x = "Reading Time (s)", y = NULL) +
  guides(fill = "none", color = "none")
# plotting readme reading ease
readme_reading_ease <- plot_subdir_density(
  readme_df, "flesch_reading_ease", readmeSubdirColors,
  position = "fill"
) +
  labs(x = "Flesch Reading Ease", y = "README Density") +
  guides(fill = "none", color = "none")
# plotting contributing reading ease
contributing_reading_ease <- plot_subdir_density(
  contributing_df, "flesch_reading_ease", subdirColors,
  position = "fill", fill_labels = re_group_labels
) +
  labs(x = NULL, y = "CONTRIBUTING Density", fill = "RE Grouping") +
  guides(fill = "none", color = "none")
grid.arrange(
  contributing_reading_ease, contributing_reading_time_plot,
  readme_reading_ease, readme_reading_time_plot,
  nrow = 2
)

# ---- Same four panels with a new palette -------------------------------------
# establishing the color scheme
# NOTE(review): same levels() caveat as the first palette definition above.
subdirColors <- setNames(
  c("#68293c", "#ffcf67", "#91d8f0"),
  levels(contributing_df$subdir)
)
readmeSubdirColors <- setNames(
  c("#ffcf67", "#91d8f0"),
  levels(readme_df$subdir)
)
# plotting readme reading ease
readme_reading_ease <- plot_subdir_density(
  readme_df, "flesch_reading_ease", readmeSubdirColors,
  position = "fill"
) +
  labs(x = "Flesch Reading Ease", y = "README Density") +
  guides(fill = "none", color = "none")
# plotting readme reading time
readme_reading_time_plot <- plot_subdir_density(
  readme_df, "reading_time", readmeSubdirColors,
  position = "fill"
) +
  labs(x = "Reading Time (s)", y = NULL) +
  guides(fill = "none", color = "none")
# plotting contributing reading time
contributing_reading_time_plot <- plot_subdir_density(
  contributing_df, "reading_time", subdirColors,
  position = "fill", fill_labels = re_group_labels
) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  inside_legend_theme("horizontal")
# plotting contributing reading ease
contributing_reading_ease <- plot_subdir_density(
  contributing_df, "flesch_reading_ease", subdirColors,
  position = "fill", fill_labels = re_group_labels
) +
  labs(x = NULL, y = "CONTRIBUTING Density", fill = "RE Grouping") +
  guides(fill = "none", color = "none")
grid.arrange(
  contributing_reading_ease, contributing_reading_time_plot,
  readme_reading_ease, readme_reading_time_plot,
  nrow = 2
)

# ---- RE grouping plot after re-sourcing the readability script ---------------
source("~/Desktop/git/24_deb_gov/R/documentReadabilityAnalysis.R")
subdirColors <- setNames(c("#31449c", "#4a7c85", "#c5db68"), c(0, 1, 2))
# NOTE(review): contrib_groupings is only (re)loaded two statements below, so
# this call needs it to exist already. Confirm the intended order.
contrib_g <- plot_ranef_groupings(contrib_groupings, subdirColors)
contrib_g
contrib_groupings <- read.csv("../final_data/deb_contrib_interaction_groupings.csv")
subdirColors <- setNames(c("#31449c", "#4a7c85", "#c5db68"), c(0, 1, 2))
contrib_g <- plot_ranef_groupings(contrib_groupings, subdirColors)
contrib_g

# ---- Weekly commit activity by document type ---------------------------------
source("~/Desktop/git/24_deb_gov/R/gam_plot_documents.R")
# Name the palette by the distinct document types. The original passed the
# whole column as names, which cannot match a two-colour vector.
doctypeColors <- setNames(
  c("#4a7c85", "#c5db68"),
  levels(factor(all_actions_data$document_type))
)
# The data viewer only makes sense in an interactive session.
if (interactive()) {
  View(all_actions_data)
}
doctypeColors <- setNames(c("#4a7c85", "#c5db68"), c("CONTRIBUTING", "README"))
# First attempt used scale_fill_manual(); the plot maps `color`, so the palette
# was never applied. Both this call and its re-run use the colour scale.
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count"
)
time_plot
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count"
)
time_plot

doctypeColors <- setNames(c("#ffcf67", "#91d8f0"), c("CONTRIBUTING", "README"))
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count"
)
time_plot

doctypeColors <- setNames(c("#7d1b16", "#263b90"), c("CONTRIBUTING", "README"))
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count"
)
time_plot

doctypeColors <- setNames(c("#995223", "#2464ad"), c("CONTRIBUTING", "README"))
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count"
)
time_plot

doctypeColors <- setNames(c("#ba6b44", "#5d7fbd"), c("CONTRIBUTING", "README"))
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count"
)
time_plot

doctypeColors <- setNames(c("#5da2d8", "#c7756a"), c("CONTRIBUTING", "README"))
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count"
)
time_plot

# Raw counts on a log1p-transformed axis. ggplot2 has no scale_y_log1p(); the
# transform is requested through scale_y_continuous() instead.
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count",
  y_col = "count"
) +
  scale_y_continuous(transform = "log1p")
# Raw counts on a linear axis (the axis title is unchanged from the original).
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count",
  y_col = "count"
)
time_plot
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count"
)
time_plot

# Axis-only draft: log1p data with tick labels back-transformed to raw counts
# (the original had the typo `exp,1(1.5)` for `expm1(1.5)`).
time_plot <- all_actions_data |>
  ggplot(aes(x = week_offset, y = log1p_count, color = factor(document_type))) +
  scale_y_continuous(
    breaks = log_count_breaks,
    labels = round(expm1(log_count_breaks), 1)
  )
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count",
  y_breaks = log_count_breaks
)
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Log Transformed Commit Count",
  y_breaks = log_count_breaks
)
time_plot
# With back-transformed tick labels the axis reads in counts, hence the title.
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Commit Count",
  y_breaks = log_count_breaks
)
time_plot
time_plot <- plot_weekly_commits(all_actions_data, doctypeColors, "Commit Count")
time_plot
time_plot <- plot_weekly_commits(
  all_actions_data, doctypeColors, "Commit Count",
  y_breaks = log_count_breaks
)
time_plot
time_plot <- plot_weekly_commits(all_actions_data, doctypeColors, "Commit Count")
time_plot

# ---- Word-count density with the document-type palette -----------------------
source("~/Desktop/git/24_deb_gov/R/documentReadabilityAnalysis.R")
length_plot_all
length_plot_all <- plot_word_count_density(
  all_df, alpha = 0.5, palette = doctypeColors
)
length_plot_all
length_plot_all <- plot_word_count_density(
  all_df, alpha = 0.6, palette = doctypeColors
)
length_plot_all