24_deb_pkg_gov/R/documentReadabilityAnalysis.R
2024-08-24 17:04:46 -05:00

154 lines
6.5 KiB
R

library(tidyverse)
library(plyr)
library(gridExtra)
library(ggpubr)
# Script for analysing document readability metrics.
# Readability metrics are examined while controlling for document length.
# gaughan@u.northwestern.edu
# Load the data ----
# Try to make the directory of the document open in RStudio the working
# directory so the relative CSV paths below resolve. The call is wrapped in
# try() so that a failure here does not stop the script.
try(
  setwd(
    dirname(rstudioapi::getActiveDocumentContext()$path)
  )
)
readme_df <- read_csv(
  file = "../text_analysis/dwo_readability_readme.csv"
)
contributing_df <- read_csv(
  file = "../text_analysis/dwo_readability_contributing.csv"
)

# First rows of the README table, then the reading-time quantiles of both tables
head(readme_df)
quantile(readme_df$reading_time)
quantile(contributing_df$reading_time)

# Median of columns 3 through 10, split by `subdir`
aggregate(
  x = readme_df[, 3:10],
  by = list(readme_df$subdir),
  FUN = median
)
aggregate(
  x = contributing_df[, 3:10],
  by = list(contributing_df$subdir),
  FUN = median
)

# Overall (ungrouped) medians of the README readability scores
median(x = readme_df$flesch_reading_ease)
median(x = readme_df$linsear_write_formula)
median(x = readme_df$mcalpine_eflaw)
median(x = readme_df$reading_time)
# Colour scheme ----
# read_csv() does not produce factor columns, and levels() of a non-factor is
# NULL, so naming the palettes with levels(df$subdir) left them unnamed.
# Converting to a factor first yields the actual group values, so every colour
# is tied to a specific `subdir` value instead of merely to its position.
# The sorted level order matches the order ggplot2 derives from
# as.factor(subdir) in the plots, so the colour assignment itself is unchanged.
subdirColors <- setNames(
  c("#31449c", "#4a7c85", "#c5db68"),
  levels(as.factor(contributing_df$subdir))
)
readmeSubdirColors <- setNames(
  c("#4a7c85", "#c5db68"),
  levels(as.factor(readme_df$subdir))
)
# README: Flesch reading ease ----
# An earlier draft of `readme_linsear_plot` used to be built at this point. It
# was never drawn and was overwritten by the Linsear Write definition further
# down, so the dead assignment has been removed.
#
# position = "fill" stacks the per-group densities and rescales each stack to
# a constant height, so the plot shows the groups' relative share along x.
readme_reading_ease <- ggplot(
  readme_df,
  aes(x = flesch_reading_ease, group = as.factor(subdir))
) +
  geom_density(aes(fill = as.factor(subdir)), position = "fill") +
  scale_fill_manual(values = readmeSubdirColors) +
  xlim(-5, 90) +
  labs(x = "Flesch Reading Ease", y = "README Density") +
  # legend suppressed for this plot
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_ease
# README: reading time, filled density split by subdir ----
readme_reading_time_plot <-
  ggplot(
    data = readme_df,
    mapping = aes(
      x = reading_time,
      group = as.factor(subdir),
      fill = as.factor(subdir)
    )
  ) +
  geom_density(position = "fill") +
  xlim(-5, 90) +
  scale_fill_manual(values = readmeSubdirColors) +
  theme_bw() +
  # no y-axis title and no legend on this plot
  labs(x = "Reading Time (s)", y = NULL) +
  guides(fill = "none", color = "none")
readme_reading_time_plot
# README: overall reading-time histogram (no grouping) ----
# `bins = 30` spells out the value stat_bin() otherwise falls back to, so the
# picture is unchanged but ggplot2 no longer emits its "using `bins = 30`"
# message each time the plot is drawn.
# NOTE(review): the n=2280 in the title is hard-coded; confirm it still matches
# the number of README rows in the current CSV.
readme_reading_time_no_group <- ggplot(readme_df, aes(x = reading_time)) +
  geom_histogram(bins = 30, fill = "forestgreen") +
  xlim(-5, 190) +
  ylab("Count of README Files") +
  xlab("Reading Time (s)") +
  ggtitle("Reading Time for README files from FLOSS Projects (n=2280)") +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_time_no_group
# README: Linsear Write score, overlaid semi-transparent densities per subdir
readme_linsear_plot <-
  ggplot(
    data = readme_df,
    mapping = aes(
      x = linsear_write_formula,
      group = as.factor(subdir),
      fill = as.factor(subdir)
    )
  ) +
  geom_density(position = "identity", alpha = 0.35) +
  xlim(-5, 90) +
  scale_fill_manual(values = readmeSubdirColors) +
  theme_bw() +
  # no y-axis title and no legend on this plot
  labs(x = "Linsear Write Score", y = NULL) +
  guides(fill = "none", color = "none")
readme_linsear_plot
# README: McAlpine EFLAW, overlaid semi-transparent densities per subdir.
# The plot object is only built here; it is not printed.
readme_mcalpine_eflaw <-
  ggplot(
    data = readme_df,
    mapping = aes(
      x = mcalpine_eflaw,
      group = as.factor(subdir),
      fill = as.factor(subdir)
    )
  ) +
  geom_density(position = "identity", alpha = 0.35) +
  xlim(-5, 90) +
  scale_fill_manual(values = readmeSubdirColors) +
  theme_bw() +
  labs(x = "McAlpine EFLAW", y = NULL) +
  guides(fill = "none", color = "none")
#theme(axis.title.y=element_blank())
#plot of reading_ease
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
# CONTRIBUTING: median of columns 3 through 10, split by `subdir`
aggregate(
  x = contributing_df[, 3:10],
  by = list(contributing_df$subdir),
  FUN = median
)

# CONTRIBUTING: overall (ungrouped) medians of the readability scores
median(x = contributing_df$flesch_reading_ease)
median(x = contributing_df$mcalpine_eflaw)
median(x = contributing_df$reading_time)
median(x = contributing_df$linsear_write_formula)
# CONTRIBUTING: Linsear Write score ----
# Overlaid semi-transparent densities per subdir; axis titles and the legend
# are switched off. The plot object is only built here; it is not printed.
contributing_linsear_plot <-
  ggplot(
    data = contributing_df,
    mapping = aes(
      x = linsear_write_formula,
      group = as.factor(subdir),
      fill = as.factor(subdir)
    )
  ) +
  geom_density(position = "identity", alpha = 0.35) +
  xlim(-5, 90) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  theme_bw() +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  guides(fill = "none", color = "none")
# CONTRIBUTING: reading time ----
# Filled (stacked and rescaled) densities per subdir. This plot carries the
# horizontal "RE Grouping" legend, placed inside the panel near the top-right.
contributing_reading_time_plot <-
  ggplot(
    data = contributing_df,
    mapping = aes(
      x = reading_time,
      group = as.factor(subdir),
      fill = as.factor(subdir)
    )
  ) +
  geom_density(position = "fill") +
  xlim(-5, 90) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  # theme_bw() must come first: it replaces the whole theme, and the legend
  # settings below are applied on top of it
  theme_bw() +
  theme(
    legend.position = "inside",
    legend.position.inside = c(.93, .93),
    legend.justification = c("right", "top"),
    legend.direction = "horizontal",
    legend.margin = margin(6, 6, 6, 6)
  )
contributing_reading_time_plot
# CONTRIBUTING: McAlpine EFLAW ----
# Overlaid semi-transparent densities per subdir with a vertical "RE Grouping"
# legend inside the panel. The plot object is only built here; it is not
# printed.
contributing_mcalpine_eflaw <-
  ggplot(
    data = contributing_df,
    mapping = aes(
      x = mcalpine_eflaw,
      group = as.factor(subdir),
      fill = as.factor(subdir)
    )
  ) +
  geom_density(position = "identity", alpha = 0.35) +
  xlim(-5, 90) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  # theme_bw() must come first: it replaces the whole theme, and the legend
  # settings below are applied on top of it
  theme_bw() +
  theme(
    legend.position = "inside",
    legend.position.inside = c(.93, .93),
    legend.justification = c("right", "top"),
    legend.direction = "vertical",
    legend.margin = margin(6, 6, 6, 6)
  )
# CONTRIBUTING: Flesch reading ease ----
# Filled (stacked and rescaled) densities per subdir, legend switched off.
contributing_reading_ease <-
  ggplot(
    data = contributing_df,
    mapping = aes(
      x = flesch_reading_ease,
      group = as.factor(subdir),
      fill = as.factor(subdir)
    )
  ) +
  geom_density(position = "fill") +
  xlim(-5, 90) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  theme_bw() +
  labs(x = NULL, y = "CONTRIBUTING Density", fill = "RE Grouping") +
  guides(fill = "none", color = "none")
contributing_reading_ease

# 2 x 2 panel: CONTRIBUTING plots on the top row, README plots on the bottom;
# reading ease in the left column, reading time in the right column.
grid.arrange(
  contributing_reading_ease,
  contributing_reading_time_plot,
  readme_reading_ease,
  readme_reading_time_plot,
  nrow = 2
)
# Word-count comparison across document types ----
# One colour per document type, keyed by the `type` labels assigned below.
doctypeColors <- c(CONTRIBUTING = "#5da2d8", README = "#c7756a")

# Tag each table with its document type, then stack them into one data frame.
readme_df$type <- "README"
contributing_df$type <- "CONTRIBUTING"
all_df <- rbind(readme_df, contributing_df)

# Overlaid, outline-free densities of word count for the two document types,
# with the legend placed above the panel.
length_plot_all <-
  ggplot(
    data = all_df,
    mapping = aes(
      x = word_count,
      group = as.factor(type),
      fill = as.factor(type)
    )
  ) +
  geom_density(position = "identity", alpha = 0.6, color = NA) +
  xlim(-10, 500) +
  scale_fill_manual(values = doctypeColors) +
  labs(
    fill = "Document Type",
    x = "Word Count",
    y = "Density Across Documents"
  ) +
  theme_bw() +
  theme(legend.position = "top")
length_plot_all
#grid.arrange(contributing_reading_time_plot, readme_reading_time_plot, nrow = 1)