library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# Path of the input CSV and the data frame read from it.
# The first row of the file is treated as the column header.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- utils::read.csv(file = main_csv, header = TRUE)


# Modal verbs (and their negated / contracted variants) to search for.
modal_verb_list <- c(
  # bare forms
  "will", "may", "can", "shall", "must", "ought", "do", "need", "dare",
  # negated and contracted forms
  "will not", "may not", "cannot", "shall not", "must not", "do not",
  "don't", "need not", "dare not", "won't", "can't"
)

# One alternation of every entry above, anchored on word boundaries at both
# ends so that an entry is not matched inside a longer word.
modal_regex <- sprintf("\\b(%s)\\b", paste(modal_verb_list, collapse = "|"))

# Derive per-comment modal-verb features:
#   comment_text - missing comments become the empty string
#   modal_verbs  - number of case-insensitive matches of modal_regex
#   log1p_mv     - log(1 + modal_verbs)
main_df <- main_df |>
  mutate(comment_text = dplyr::coalesce(comment_text, "")) |>
  mutate(
    modal_verbs = stringr::str_count(
      string = comment_text,
      pattern = stringr::regex(modal_regex, ignore_case = TRUE)
    )
  ) |>
  mutate(log1p_mv = log1p(modal_verbs))


# Frequency table of the per-comment modal-verb counts.
table(main_df$modal_verbs)

library(ggdist)

# Slab + interval summary of modal-verb counts for each isAuthorWMF value.
# NOTE(review): assumes isAuthorWMF is read from the CSV as a discrete
# (logical / character) column so it defines the groups -- confirm.
ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
  stat_slabinterval() +
  # Zoom with coord_cartesian() instead of xlim(): scale limits turn rows with
  # more than 5 modal verbs into NA and drop them *before* the stat runs, so
  # the slab and intervals would describe a truncated sample. Zooming keeps
  # every comment in the computation and only restricts the visible range.
  coord_cartesian(xlim = c(0, 5)) +
  labs(
    title = "Distribution of modal_verbs by isAuthorWMF",
    x = "Number of modal verbs in comment",
    y = "isAuthorWMF"
  ) +
  theme_minimal()