# 24_deb_pkg_gov/R/documentReadabilityAnalysis.R

# Package setup.
# plyr is attached before the tidyverse: plyr's own startup message warns that
# loading it after dplyr "is likely to cause problems", because plyr would then
# mask dplyr verbs of the same name (summarise, mutate, arrange, ...).
library(plyr)
library(tidyverse)
library(gridExtra)
library(ggpubr)
# script for the analysis of document readability metrics
# readability metrics will be studied controlled by their length
# gaughan@u.northwestern.edu
# loading in the data
# Resolve the relative paths below against this script's own folder when it is
# run from RStudio. If the rstudioapi call errors, try() reports the error and
# the script carries on from the current working directory.
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))

readme_df <- read_csv("../text_analysis/draft_readability_readme.csv")
contributing_df <- read_csv(
  "../text_analysis/draft_readability_contributing.csv"
)
head(readme_df)

# Median of columns 3 through 10, computed separately for each subdir value.
aggregate(x = readme_df[, 3:10], by = list(readme_df$subdir), FUN = median)

# getting basic stats for the readme readability (medians over all rows)
median(readme_df$flesch_reading_ease)
median(readme_df$linsear_write_formula)
median(readme_df$mcalpine_eflaw)
median(readme_df$reading_time)
# establishing the color scheme
# One colour per subdir value, keyed by name so that every plot maps the same
# subdir to the same colour. `subdir` is not converted to a factor anywhere
# before this point, and levels() on a non-factor returns NULL (which would
# leave the palette unnamed), so the column is coerced with as.factor() here,
# exactly as the plots do.
# NOTE(review): assumes contributing_df has exactly three distinct subdir
# values and that readme_df uses the same ones - confirm.
# NOTE(review): the plots apply this palette through scale_color_manual() only,
# so the fill aesthetic keeps ggplot2's default palette - confirm that is
# intended.
subdirColors <- setNames(
  c("firebrick1", "forestgreen", "cornflowerblue"),
  levels(as.factor(contributing_df$subdir))
)
# plotting linsear scoring
# Density of the Linsear Write score in readme_df, one curve per subdir, using
# ggplot2's default colours and an x range of [-30, 30].
# NOTE(review): readme_linsear_plot is assigned again further down the script,
# which replaces this version before it is ever drawn.
readme_linsear_plot <- ggplot(
  data = readme_df,
  mapping = aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  xlim(-30, 30) +
  theme_bw()
# plotting readme reading ease
# Density of Flesch reading-ease scores in readme_df, one semi-transparent
# curve per subdir. Outline colours come from subdirColors; both legends are
# hidden on this panel.
readme_reading_ease <- ggplot(
  readme_df,
  aes(x = flesch_reading_ease, group = as.factor(subdir))
) +
  geom_density(
    aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  scale_color_manual(values = subdirColors) +
  # xlim() sets scale limits: observations outside [-5, 90] are dropped
  # (with a warning) before the density is estimated.
  xlim(-5, 90) +
  ylab("readme density") +
  guides(fill = "none", color = "none") +
  theme_bw()

# Draw the plot on its own.
readme_reading_ease
# plotting readme reading time
# Density of reading_time in readme_df, one semi-transparent curve per subdir.
# Outline colours come from subdirColors; both legends are hidden.
readme_reading_time_plot <- ggplot(
  readme_df,
  aes(x = reading_time, group = as.factor(subdir))
) +
  geom_density(
    aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  scale_color_manual(values = subdirColors) +
  # Observations outside [-5, 90] are dropped by the scale limits.
  xlim(-5, 90) +
  ylab("readme density") +
  guides(fill = "none", color = "none") +
  theme_bw()
# Density of the Linsear Write score in readme_df, one semi-transparent curve
# per subdir, outlined with the subdirColors palette and with both legends
# hidden. The x axis is limited to [-5, 30].
readme_linsear_plot <- ggplot(
  data = readme_df,
  mapping = aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  geom_density(
    mapping = aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  scale_color_manual(values = subdirColors) +
  xlim(-5, 30) +
  ylab("readme density") +
  guides(fill = "none", color = "none") +
  theme_bw()
# Density of the McAlpine EFLAW score in readme_df, one semi-transparent curve
# per subdir. Outline colours come from subdirColors; both legends are hidden.
# No ylab() is set here, so the y axis keeps ggplot2's default title.
readme_mcalpine_eflaw <- ggplot(
  readme_df,
  aes(x = mcalpine_eflaw, group = as.factor(subdir))
) +
  scale_color_manual(values = subdirColors) +
  geom_density(
    aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  # Observations outside [-5, 60] are dropped by the scale limits.
  xlim(-5, 60) +
  guides(fill = "none", color = "none") +
  theme_bw()
#theme(axis.title.y=element_blank())
#plot of reading_ease
#readme_df <- readme_df |>
# mutate(coef_grouping <- as.factor(subdir))
#test_lm <- lm(mcalpine_eflaw ~ word_count + as.factor(subdir),data=readme_df)
#summary(test_lm)
# Median of columns 3 through 10 of contributing_df, computed separately for
# each subdir value.
aggregate(
  x = contributing_df[, 3:10],
  by = list(contributing_df$subdir),
  FUN = median
)

# basic stats for the contributing readability (medians over all rows)
median(contributing_df$flesch_reading_ease)
median(contributing_df$mcalpine_eflaw)
median(contributing_df$reading_time)
median(contributing_df$linsear_write_formula)
# plotting contributing linsear writing formula
# Density of the Linsear Write score in contributing_df, one semi-transparent
# curve per subdir. Outline colours come from subdirColors; both legends are
# hidden. No ylab() is set, so the y axis keeps ggplot2's default title.
contributing_linsear_plot <- ggplot(
  contributing_df,
  aes(x = linsear_write_formula, group = as.factor(subdir))
) +
  scale_color_manual(values = subdirColors) +
  geom_density(
    aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  # Observations outside [-5, 30] are dropped by the scale limits.
  xlim(-5, 30) +
  guides(fill = "none", color = "none") +
  theme_bw()
# plotting contributing reading time
# Density of reading_time in contributing_df, one semi-transparent curve per
# subdir. Outline colours come from subdirColors; both legends are hidden.
contributing_reading_time_plot <- ggplot(
  contributing_df,
  aes(x = reading_time, group = as.factor(subdir))
) +
  scale_color_manual(values = subdirColors) +
  geom_density(
    aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  # Observations outside [-5, 90] are dropped by the scale limits.
  xlim(-5, 90) +
  ylab("contributing density") +
  guides(fill = "none", color = "none") +
  theme_bw()
# plotting contributing mcalpine eflaw
# Density of the McAlpine EFLAW score in contributing_df, one semi-transparent
# curve per subdir. Outline colours come from subdirColors; both legends are
# hidden. No ylab() is set, so the y axis keeps ggplot2's default title.
contributing_mcalpine_eflaw <- ggplot(
  contributing_df,
  aes(x = mcalpine_eflaw, group = as.factor(subdir))
) +
  scale_color_manual(values = subdirColors) +
  geom_density(
    aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  # Observations outside [-5, 60] are dropped by the scale limits.
  xlim(-5, 60) +
  guides(fill = "none", color = "none") +
  theme_bw()
# plotting contributing reading ease
# Density of Flesch reading-ease scores in contributing_df, one
# semi-transparent curve per subdir, outlined with subdirColors. This is the
# only panel that keeps its legend, which is placed above the plot.
contributing_reading_ease <- ggplot(
  contributing_df,
  aes(x = flesch_reading_ease, group = as.factor(subdir))
) +
  geom_density(
    aes(color = as.factor(subdir), fill = as.factor(subdir)),
    alpha = 0.2,
    position = "identity"
  ) +
  scale_color_manual(values = subdirColors) +
  ylab("contributing density") +
  # Observations outside [-5, 90] are dropped by the scale limits.
  xlim(-5, 90) +
  theme_bw() +
  # Must follow theme_bw(), which would otherwise reset the legend position.
  theme(legend.position = "top")

# Draw the plot on its own.
contributing_reading_ease
# Draw all eight density plots on a single page. With nrow = 2 the eight
# panels are laid out four per row; presumably the contributing plots land on
# the first row and the readme plots on the second - confirm on the output.
grid.arrange(
  contributing_reading_ease,
  contributing_reading_time_plot,
  contributing_linsear_plot,
  contributing_mcalpine_eflaw,
  readme_reading_ease,
  readme_reading_time_plot,
  readme_linsear_plot,
  readme_mcalpine_eflaw,
  nrow = 2
)