# 24_deb_pkg_gov/R/documentReadabilityAnalysis.R

library(tidyverse)
library(plyr)
2024-06-22 02:39:04 +00:00
library(gridExtra)
library(ggpubr)
# script for the analysis of document readability metrics
# readability metrics will be studied controlled by their length
# gaughan@u.northwestern.edu
# Load the README and CONTRIBUTING readability tables and print quick summaries.
# The working directory is first switched to the folder of the document that is
# active in RStudio so the relative CSV paths below resolve; try() reports a
# failure of that step without stopping the script.
try(
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
)
readme_df <- read_csv(file = "../text_analysis/dwo_readability_readme.csv")
contributing_df <- read_csv(
  file = "../text_analysis/dwo_readability_contributing.csv"
)
head(readme_df)

# Quantiles of the reading_time column for each document type.
quantile(x = readme_df$reading_time)
quantile(x = contributing_df$reading_time)

# Per-subdir medians of columns 3 through 10 of each table.
aggregate(
  readme_df[, 3:10],
  by = list(readme_df$subdir),
  FUN = median
)
aggregate(
  contributing_df[, 3:10],
  by = list(contributing_df$subdir),
  FUN = median
)
# Overall medians of the README readability metrics (all subdir groups pooled).
median(x = readme_df$flesch_reading_ease)
median(x = readme_df$linsear_write_formula)
median(x = readme_df$mcalpine_eflaw)
median(x = readme_df$reading_time)
# establishing the color scheme
# read_csv() does not create factor columns, so levels() on the raw subdir
# column returns NULL and the palettes would be left unnamed. Name each colour
# after the levels of as.factor(subdir) -- the same expression the plots map to
# fill -- so scale_fill_manual() pairs colours with groups by name rather than
# by position.
# NOTE(review): assumes three subdir groups in the CONTRIBUTING table and two
# in the README table -- confirm against the data.
subdirColors <- setNames(
  c("#31449c", "#4a7c85", "#c5db68"),
  levels(as.factor(contributing_df$subdir))
)
readmeSubdirColors <- setNames(
  c("#4a7c85", "#c5db68"),
  levels(as.factor(readme_df$subdir))
)
# Linsear Write density for READMEs: one translucent, overlaid curve per subdir
# group, with the x axis limited to [-30, 30].
# NOTE(review): readme_linsear_plot is assigned again further down in this
# script before it is ever printed, so this version is never rendered.
readme_linsear_plot <- ggplot(
  data = readme_df,
  mapping = aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(color = as.factor(subdir), fill = as.factor(subdir)),
    position = "identity",
    alpha = 0.35
  ) +
  scale_x_continuous(limits = c(-30, 30)) +
  theme_bw()
# README Flesch Reading Ease: density per subdir group drawn with
# position = "fill", coloured from readmeSubdirColors, legends switched off.
readme_reading_ease <- ggplot(
  data = readme_df,
  mapping = aes(x = flesch_reading_ease, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(fill = as.factor(subdir)),
    position = "fill"
  ) +
  scale_fill_manual(values = readmeSubdirColors) +
  scale_x_continuous(limits = c(-5, 90)) +
  xlab("Flesch Reading Ease") +
  ylab("README Density") +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_ease
# README reading time: density per subdir group drawn with position = "fill",
# x axis limited to [-5, 90], no y-axis title and no legends.
readme_reading_time_plot <- ggplot(
  data = readme_df,
  mapping = aes(x = reading_time, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(fill = as.factor(subdir)),
    position = "fill"
  ) +
  scale_fill_manual(values = readmeSubdirColors) +
  scale_x_continuous(limits = c(-5, 90)) +
  xlab("Reading Time (s)") +
  ylab(NULL) +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_time_plot
# Histogram of README reading time across all READMEs (no subdir grouping),
# x axis limited to [-5, 190].
# NOTE(review): the "n=2280" in the title is hard-coded text, not computed from
# readme_df -- confirm it still matches the data.
readme_reading_time_no_group <- ggplot(
  data = readme_df,
  mapping = aes(x = reading_time)
) +
  geom_histogram(fill = "forestgreen") +
  scale_x_continuous(limits = c(-5, 190)) +
  labs(
    x = "Reading Time (s)",
    y = "Count of README Files",
    title = "Reading Time for README files from FLOSS Projects (n=2280)"
  ) +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_reading_time_no_group
# README Linsear Write score: translucent overlaid densities per subdir group,
# coloured from readmeSubdirColors, x axis limited to [-5, 90], no legends.
readme_linsear_plot <- ggplot(
  data = readme_df,
  mapping = aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(fill = as.factor(subdir)),
    position = "identity",
    alpha = 0.35
  ) +
  scale_fill_manual(values = readmeSubdirColors) +
  scale_x_continuous(limits = c(-5, 90)) +
  xlab("Linsear Write Score") +
  ylab(NULL) +
  guides(fill = "none", color = "none") +
  theme_bw()
readme_linsear_plot
# README McAlpine EFLAW score: translucent overlaid densities per subdir group,
# coloured from readmeSubdirColors, x axis limited to [-5, 90], no legends.
readme_mcalpine_eflaw <- ggplot(
  data = readme_df,
  mapping = aes(x = mcalpine_eflaw, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(fill = as.factor(subdir)),
    position = "identity",
    alpha = 0.35
  ) +
  scale_fill_manual(values = readmeSubdirColors) +
  scale_x_continuous(limits = c(-5, 90)) +
  xlab("McAlpine EFLAW") +
  ylab(NULL) +
  guides(fill = "none", color = "none") +
  theme_bw()
# Disabled experiments kept for reference (not executed):
#theme(axis.title.y=element_blank())
#plot of reading_ease
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
# Per-subdir medians of columns 3 through 10 of the CONTRIBUTING table.
aggregate(
  contributing_df[, 3:10],
  by = list(contributing_df$subdir),
  FUN = median
)
# Overall medians of the CONTRIBUTING readability metrics (all groups pooled).
median(x = contributing_df$flesch_reading_ease)
median(x = contributing_df$mcalpine_eflaw)
median(x = contributing_df$reading_time)
median(x = contributing_df$linsear_write_formula)
# CONTRIBUTING Linsear Write score: translucent overlaid densities per subdir
# group, coloured from subdirColors, x axis limited to [-5, 90]. Axis titles are
# blank and the legend is suppressed by the final guides() call.
contributing_linsear_plot <- ggplot(
  data = contributing_df,
  mapping = aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(fill = as.factor(subdir)),
    position = "identity",
    alpha = 0.35
  ) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  scale_x_continuous(limits = c(-5, 90)) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  theme_bw() +
  guides(fill = "none", color = "none")
# CONTRIBUTING reading time: density per subdir group drawn with
# position = "fill", coloured from subdirColors, x axis limited to [-5, 90].
# This panel keeps its legend ("RE Grouping"), placed inside the plotting area
# near the top-right corner and laid out horizontally.
contributing_reading_time_plot <- ggplot(
  data = contributing_df,
  mapping = aes(x = reading_time, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(fill = as.factor(subdir)),
    position = "fill"
  ) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  scale_x_continuous(limits = c(-5, 90)) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  theme_bw() +
  theme(
    legend.position = "inside",
    legend.position.inside = c(.93, .93),
    legend.justification = c("right", "top"),
    legend.direction = "horizontal",
    legend.margin = margin(6, 6, 6, 6)
  )
contributing_reading_time_plot
# CONTRIBUTING McAlpine EFLAW score: translucent overlaid densities per subdir
# group, coloured from subdirColors, x axis limited to [-5, 90]. The legend
# ("RE Grouping") sits inside the plotting area near the top-right corner and
# is laid out vertically.
contributing_mcalpine_eflaw <- ggplot(
  data = contributing_df,
  mapping = aes(x = mcalpine_eflaw, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(fill = as.factor(subdir)),
    position = "identity",
    alpha = 0.35
  ) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  scale_x_continuous(limits = c(-5, 90)) +
  labs(x = NULL, y = NULL, fill = "RE Grouping") +
  theme_bw() +
  theme(
    legend.position = "inside",
    legend.position.inside = c(.93, .93),
    legend.justification = c("right", "top"),
    legend.direction = "vertical",
    legend.margin = margin(6, 6, 6, 6)
  )
# CONTRIBUTING Flesch Reading Ease: density per subdir group drawn with
# position = "fill", coloured from subdirColors, x axis limited to [-5, 90].
# Only the y-axis title is shown; the legend is suppressed by guides().
contributing_reading_ease <- ggplot(
  data = contributing_df,
  mapping = aes(x = flesch_reading_ease, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(fill = as.factor(subdir)),
    position = "fill"
  ) +
  scale_fill_manual(
    values = subdirColors,
    labels = c("CI < 0", "0 in CI", "0 < CI")
  ) +
  scale_x_continuous(limits = c(-5, 90)) +
  labs(x = NULL, y = "CONTRIBUTING Density", fill = "RE Grouping") +
  theme_bw() +
  guides(fill = "none", color = "none")
contributing_reading_ease
# Combined 2 x 2 figure: CONTRIBUTING panels first, README panels after;
# reading ease on the left, reading time on the right.
grid.arrange(
  contributing_reading_ease,
  contributing_reading_time_plot,
  readme_reading_ease,
  readme_reading_time_plot,
  nrow = 2
)
# Fill colours keyed by document type.
doctypeColors <- c(CONTRIBUTING = "#5da2d8", README = "#c7756a")

# Tag every row with its document type, then stack both tables into one.
readme_df$type <- "README"
contributing_df$type <- "CONTRIBUTING"
all_df <- rbind(readme_df, contributing_df)

# Word-count density per document type: outline-free, translucent, overlaid
# curves with the x axis limited to [-10, 500] and the legend on top.
length_plot_all <- ggplot(
  data = all_df,
  mapping = aes(x = word_count, group = as.factor(type))
) +
  geom_density(
    mapping = aes(fill = as.factor(type)),
    position = "identity",
    color = NA,
    alpha = 0.6
  ) +
  scale_fill_manual(values = doctypeColors) +
  scale_x_continuous(limits = c(-10, 500)) +
  xlab("Word Count") +
  ylab("Density Across Documents") +
  labs(fill = "Document Type") +
  theme_bw() +
  theme(legend.position = "top")
length_plot_all
#grid.arrange(contributing_reading_time_plot, readme_reading_time_plot, nrow = 1)