From 0ed72af495a0b1e0e5c665198c4c963869c18a2e Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login03.hyak.local>
Date: Thu, 23 Oct 2025 13:50:27 -0700
Subject: [PATCH] add scripts for other aggregation and merge tasks

---
 analysis_data/data_verification_3.R  | 45 ++++++++++++++++++++++++++--
 analysis_data/style_dict_variables.R | 36 ++++++++++++++++++++++
 2 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 analysis_data/style_dict_variables.R

diff --git a/analysis_data/data_verification_3.R b/analysis_data/data_verification_3.R
index bc02f48..c60e722 100644
--- a/analysis_data/data_verification_3.R
+++ b/analysis_data/data_verification_3.R
@@ -11,6 +11,37 @@ main_df <- read.csv(main_csv, header = TRUE)
 main_df <- main_df |>
   select(-starts_with("olmo"))
 
+# Dedupe task with changed title and duplicate entries: keep the first row for
+# ids 20846 and 20847, and drop id 23366 (rows with NA id are dropped as well).
+first_rows <- main_df |>
+  filter(id %in% c(20846, 20847)) |>
+  distinct(id, .keep_all = TRUE)
+others <- main_df |>
+  filter(!(id %in% c(20846, 20847)), id != 23366)
+main_df <- bind_rows(others, first_rows)
+  
+
+# Author and close time of each task's description, one row per TaskPHID.
+desc_info <- main_df %>%
+  filter(comment_type == "task_description") %>%
+  distinct(TaskPHID, .keep_all = TRUE) %>% # first row only, so the left_join on TaskPHID cannot duplicate rows
+  transmute(
+    TaskPHID,
+    task_desc_author = AuthorPHID,
+    task_desc_dateClosed = as.POSIXct(date_closed, origin = "1970-01-01", tz = "UTC")
+  )
+
+# Identify comments in the ADAC set: same author as the task description and
+# created before task_desc_dateClosed. NOTE(review): ADAC can be NA (not 0)
+# when the close time is missing -- confirm that is intended.
+main_df <- main_df |>
+  mutate(created = as.POSIXct(date_created, origin = "1970-01-01", tz = "UTC")) |>
+  left_join(desc_info, by = "TaskPHID") |>
+  mutate(ADAC = as.integer(!is.na(task_desc_author) &
+                             AuthorPHID == task_desc_author &
+                             created < task_desc_dateClosed))
+
+
 pca_csv <- "~/analysis_data/102125_constituent_dfs/102025_total_pca_df.csv"
 pca_df <- read.csv(pca_csv, header = TRUE) 
 
@@ -45,6 +76,16 @@ large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
 small_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102125_human_info_sample.csv"
 small_human_labels_df <- read.csv(small_human_labels_csv, header = TRUE) 
 #TODO
-# [ ] collate the two samples into one 
-# [ ] aggregate sentence level rows into comment level 
+# [ x ] collate the two samples into one (same three columns from each)
+large_human_labels_df <- select(large_human_labels_df, id, cleaned_sentences, human_label)
+small_human_labels_df <- select(small_human_labels_df, id, cleaned_sentences, human_label)
+human_labels_df <- rbind(large_human_labels_df, small_human_labels_df)
+# [ x ] aggregate sentence level rows into comment level:
+# one row per id, with list-columns of its sentences and whitespace-squished labels
+human_labels_reduced <- summarise(
+  group_by(human_labels_df, id),
+  cleaned_sentences = list(cleaned_sentences),
+  human_labels = list(str_squish(human_label)),
+  .groups = "drop"
+)
 # [ ] merge into unified data set
\ No newline at end of file
diff --git a/analysis_data/style_dict_variables.R b/analysis_data/style_dict_variables.R
new file mode 100644
index 0000000..ff5c4cb
--- /dev/null
+++ b/analysis_data/style_dict_variables.R
@@ -0,0 +1,36 @@
+library(tidyverse)
+library(stringr)
+library(tidyr)
+library(dplyr)
+library(purrr)
+
+# Read the main data frame from CSV.
+main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+main_df <- read.csv(main_csv, header = TRUE)
+
+# Modal verbs to count (base forms, then negated/contracted forms), joined into
+# a single whole-word alternation pattern.
+modal_verb_list <- c(
+  "will", "may", "can", "shall", "must", "ought", "do", "need", "dare",
+  "will not", "may not", "cannot", "shall not", "must not", "do not",
+  "don't", "need not", "dare not", "won't", "can't")
+modal_regex <- sprintf("\\b(%s)\\b", paste(modal_verb_list, collapse = "|"))
+
+# Count case-insensitive modal-verb matches per comment; NA text counts as 0.
+main_df <- main_df |>
+  mutate(comment_text = dplyr::coalesce(comment_text, "")) |> # handle NA
+  mutate(
+    modal_verbs = stringr::str_count(comment_text, stringr::regex(modal_regex, ignore_case = TRUE)),
+    log1p_mv = log1p(modal_verbs))
+
+
+# Frequency table of per-comment modal verb counts.
+table(main_df$modal_verbs)
+library(ggdist)
+# Zoom to 0-5 with coord_cartesian(): xlim() would drop comments with more than
+# 5 modal verbs before the slab/interval statistics are computed.
+ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
+  stat_slabinterval() +
+  coord_cartesian(xlim = c(0, 5)) +
+  labs(title = "Distribution of modal_verbs by isAuthorWMF",
+       x = "Number of modal verbs in comment", y = "isAuthorWMF") +
+  theme_minimal()