library(tidyverse)
# NOTE(review): plyr is attached after the tidyverse, so plyr functions mask
# same-named dplyr functions (e.g. mutate, summarise). Nothing in this section
# calls them unqualified; confirm before adding dplyr pipelines below.
library(plyr)
library(gridExtra)
library(ggpubr)

# Script for the analysis of document readability metrics.
# Readability metrics will be studied controlled by their length.
# gaughan@u.northwestern.edu

# Loading in the data ----

# Best-effort: when run from RStudio, switch to this script's directory so the
# relative paths below resolve; outside RStudio the call fails and try() lets
# the script continue from the current working directory.
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))

readme_df <- read_csv("../text_analysis/dwo_readability_readme.csv")
contributing_df <- read_csv("../text_analysis/dwo_readability_contributing.csv")

head(readme_df)
quantile(readme_df$reading_time)
quantile(contributing_df$reading_time)

# Per-subdir medians of columns 3 to 10.
# NOTE(review): columns are selected by position; presumably these are the
# numeric readability metrics -- confirm against the CSV header.
aggregate(readme_df[, 3:10], list(readme_df$subdir), median)
aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)

# Basic stats for the README readability ----
median(readme_df$flesch_reading_ease)
median(readme_df$linsear_write_formula)
median(readme_df$mcalpine_eflaw)
median(readme_df$reading_time)

# Establishing the color scheme ----

# read_csv() does not create factor columns by default, and levels() of a
# character vector is NULL, which would leave the palettes unnamed. Going
# through factor() names every color by the subdir value it belongs to, in the
# same order ggplot derives from as.factor(subdir) in the plots below.
subdirColors <- setNames(
  c("#31449c", "#4a7c85", "#c5db68"),
  levels(factor(contributing_df$subdir))
)
readmeSubdirColors <- setNames(
  c("#4a7c85", "#c5db68"),
  levels(factor(readme_df$subdir))
)

# README plots ----

# Plotting README reading ease (share of density per subdir).
readme_reading_ease <- ggplot(
  readme_df,
  aes(x = flesch_reading_ease, group = as.factor(subdir))
) +
  geom_density(aes(fill = as.factor(subdir)), position = "fill") +
  scale_fill_manual(values = readmeSubdirColors) +
  xlim(-5, 90) +
  labs(x = "Flesch Reading Ease", y = "README Density") +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_ease

# Plotting README reading time (share of density per subdir).
readme_reading_time_plot <- ggplot(
  readme_df,
  aes(x = reading_time, group = as.factor(subdir))
) +
  geom_density(aes(fill = as.factor(subdir)), position = "fill") +
  scale_fill_manual(values = readmeSubdirColors) +
  xlim(-5, 90) +
  labs(x = "Reading Time (s)", y = NULL) +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_time_plot

# Histogram of README reading time, not split by subdir.
# NOTE(review): the n in the title is hard-coded; confirm it still matches the
# data being plotted.
readme_reading_time_no_group <- ggplot(readme_df, aes(x = reading_time)) +
  geom_histogram(fill = "forestgreen") +
  xlim(-5, 190) +
  ylab("Count of README Files") +
  xlab("Reading Time (s)") +
  ggtitle("Reading Time for README files from FLOSS Projects (n=2280)") +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_time_no_group

# Plotting README Linsear Write scoring (overlaid densities per subdir).
readme_linsear_plot <- ggplot(
  readme_df,
  aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  geom_density(
    aes(fill = as.factor(subdir)),
    alpha = 0.35,
    position = "identity"
  ) +
  scale_fill_manual(values = readmeSubdirColors) +
  xlim(-5, 90) +
  labs(x = "Linsear Write Score", y = NULL) +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_linsear_plot

# Plotting README McAlpine EFLAW (built here but not displayed in this section).
readme_mcalpine_eflaw <- ggplot(
  readme_df,
  aes(x = mcalpine_eflaw, group = as.factor(subdir))
) +
  geom_density(
    aes(fill = as.factor(subdir)),
    alpha = 0.35,
    position = "identity"
  ) +
  scale_fill_manual(values = readmeSubdirColors) +
  xlim(-5, 90) +
  labs(x = "McAlpine EFLAW", y = NULL) +
  guides(fill = "none", color = "none") +
  theme_bw()
# theme(axis.title.y=element_blank())

# plot of reading_ease
# readme_df <- readme_df |>
#   mutate(coef_grouping <- as.factor(subdir))
# test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
# summary(test_lm)

# CONTRIBUTING plots ----

aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median)

# Basic stats for the CONTRIBUTING readability.
median(contributing_df$flesch_reading_ease)
median(contributing_df$mcalpine_eflaw)
median(contributing_df$reading_time)
median(contributing_df$linsear_write_formula)

# Plotting CONTRIBUTING Linsear Write formula (built here but not displayed in
# this section).
contributing_linsear_plot <- ggplot(
  contributing_df,
  aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  geom_density(
    aes(fill = as.factor(subdir)),
    alpha = 0.35,
    position = "identity"
  ) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  xlim(-5, 90) +
  theme_bw() +
  guides(fill = "none", color = "none")

# Plotting CONTRIBUTING reading time; this panel carries the shared legend,
# drawn inside the plot area.
contributing_reading_time_plot <- ggplot(
  contributing_df,
  aes(x = reading_time, group = as.factor(subdir))
) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  geom_density(aes(fill = as.factor(subdir)), position = "fill") +
  xlim(-5, 90) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  theme_bw() +
  theme(
    legend.position = "inside",
    legend.position.inside = c(.93, .93),
    legend.justification = c("right", "top"),
    legend.direction = "horizontal",
    legend.margin = margin(6, 6, 6, 6)
  )
contributing_reading_time_plot

# Plotting CONTRIBUTING McAlpine EFLAW (built here but not displayed in this
# section).
contributing_mcalpine_eflaw <- ggplot(
  contributing_df,
  aes(x = mcalpine_eflaw, group = as.factor(subdir))
) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  geom_density(
    aes(fill = as.factor(subdir)),
    alpha = 0.35,
    position = "identity"
  ) +
  xlim(-5, 90) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  theme_bw() +
  theme(
    legend.position = "inside",
    legend.position.inside = c(.93, .93),
    legend.justification = c("right", "top"),
    legend.direction = "vertical",
    legend.margin = margin(6, 6, 6, 6)
  )

# Plotting CONTRIBUTING reading ease (share of density per subdir).
contributing_reading_ease <- ggplot(
  contributing_df,
  aes(x = flesch_reading_ease, group = as.factor(subdir))
) +
  geom_density(aes(fill = as.factor(subdir)), position = "fill") +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  labs(x = NULL, y = "CONTRIBUTING Density", fill = "RE Grouping") +
  xlim(-5, 90) +
  theme_bw() +
  guides(fill = "none", color = "none")
contributing_reading_ease

# 2x2 panel: CONTRIBUTING on the top row, README on the bottom row.
grid.arrange(
  contributing_reading_ease,
  contributing_reading_time_plot,
  readme_reading_ease,
  readme_reading_time_plot,
  nrow = 2
)

# Document length across both document types ----

doctypeColors <- setNames(
  c("#5da2d8", "#c7756a"),
  c("CONTRIBUTING", "README")
)

# Tag each table with its document type, then stack them for a joint plot.
readme_df$type <- "README"
contributing_df$type <- "CONTRIBUTING"
all_df <- rbind(readme_df, contributing_df)

length_plot_all <- ggplot(
  all_df,
  aes(x = word_count, group = as.factor(type))
) +
  geom_density(
    aes(fill = as.factor(type)),
    color = NA,
    alpha = 0.6,
    position = "identity"
  ) +
  scale_fill_manual(values = doctypeColors) +
  xlim(-10, 500) +
  labs(
    x = "Word Count",
    y = "Density Across Documents",
    fill = "Document Type"
  ) +
  theme_bw() +
  theme(legend.position = "top")
length_plot_all

# grid.arrange(contributing_reading_time_plot, readme_reading_time_plot, nrow = 1)