37 lines
1.1 KiB
R
37 lines
1.1 KiB
R
library(tidyverse)
|
|
library(stringr)
|
|
library(tidyr)
|
|
library(dplyr)
|
|
library(purrr)
|
|
|
|
# Path of the input CSV (tilde is expanded to the home directory on read).
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"

# Read the file into a base data.frame; the first row supplies column names.
main_df <- utils::read.csv(file = main_csv, header = TRUE)
|
|
|
|
|
|
# Words and phrases treated as modal verbs, including negated and contracted
# forms. The order matters: it becomes the order of the alternatives in
# `modal_regex`, and the regex engine takes the first alternative that matches.
modal_verb_list <- c(
  "will", "may", "can", "shall", "must",
  "ought", "do", "need", "dare",
  "will not", "may not", "cannot", "shall not",
  "must not", "do not", "don't", "need not",
  "dare not", "won't", "can't"
)

# Whole-word pattern matching any entry above: \b(<alt 1>|<alt 2>|...)\b
modal_regex <- paste0(
  "\\b(",
  paste0(modal_verb_list, collapse = "|"),
  ")\\b"
)
|
|
|
|
# Add per-comment modal-verb features to main_df:
#   comment_text - NA replaced by "" so the count below is never NA
#   modal_verbs  - number of case-insensitive matches of modal_regex
#   log1p_mv     - log(1 + modal_verbs)
main_df <- dplyr::mutate(
  main_df,
  comment_text = dplyr::coalesce(comment_text, ""),
  modal_verbs = stringr::str_count(
    comment_text,
    stringr::regex(modal_regex, ignore_case = TRUE)
  ),
  log1p_mv = log1p(modal_verbs)
)
|
|
|
|
|
|
# Frequency table of the per-comment modal-verb counts.
table(main_df[["modal_verbs"]])
|
|
# ggdist supplies stat_slabinterval().
library(ggdist)

# Slab + interval summary of the per-comment modal-verb count, with
# isAuthorWMF on the y axis.
ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
  stat_slabinterval() +
  # Zoom to 0-5 on the coordinate system instead of using xlim(): scale
  # limits turn out-of-range values into NA and drop them before the stat
  # is computed, so the slab and interval would describe truncated data.
  coord_cartesian(xlim = c(0, 5)) +
  labs(
    title = "Distribution of modal_verbs by isAuthorWMF",
    x = "Number of modal verbs in comment",
    y = "isAuthorWMF"
  ) +
  theme_minimal()
|