# Count modal verbs in each comment of the unified dataset and plot the
# distribution of those counts split by isAuthorWMF.

# Packages ----
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)
library(ggdist)

# Load data ----
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Modal verb pattern ----
# Alternation is tried left to right, so a negated form that starts with a
# bare modal followed by a space or apostrophe (e.g. "will not", "can't") is
# already counted once via the bare modal. "cannot", "don't" and "won't" need
# their own entries because no word boundary follows the bare modal there.
modal_verb_list <- c(
  "will", "may", "can", "shall", "must", "ought", "do", "need", "dare",
  "will not", "may not", "cannot", "shall not", "must not", "do not",
  "don't", "need not", "dare not", "won't", "can't"
)
modal_regex <- paste0("\\b(", paste(modal_verb_list, collapse = "|"), ")\\b")

# Build the case-insensitive matcher once rather than inside mutate().
modal_pattern <- stringr::regex(modal_regex, ignore_case = TRUE)

# Per-comment counts ----
main_df <- main_df |>
  mutate(
    comment_text = dplyr::coalesce(comment_text, ""), # handle NA
    modal_verbs = stringr::str_count(comment_text, modal_pattern),
    log1p_mv = log1p(modal_verbs)
  )

table(main_df$modal_verbs)

# Distribution plot ----
# coord_cartesian() zooms the x axis to 0-5 without discarding rows; xlim()
# would set scale limits and drop comments with more than 5 modal verbs
# before the slab and interval statistics are computed.
# NOTE(review): assumes isAuthorWMF is read as a discrete (logical/character)
# column; wrap it in factor() if it turns out to be numeric -- confirm.
ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
  stat_slabinterval() +
  coord_cartesian(xlim = c(0, 5)) +
  labs(
    title = "Distribution of modal_verbs by isAuthorWMF",
    x = "Number of modal verbs in comment",
    y = "isAuthorWMF"
  ) +
  theme_minimal()