From e955b4f50fa5642daa3e63bf17fa63e6312fab25 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login01.hyak.local>
Date: Fri, 24 Oct 2025 14:10:49 -0700
Subject: [PATCH] adding some analysis of modal terms and olmo labels

---
 analysis_data/style_dict_variables.R          | 195 +++++++++++++++---
 .../093025_power_dsl.csv                      |   0
 .../dsl_data_transform.R                      |   0
 dsl/dsl_aggregation.R                         |  16 ++
 4 files changed, 188 insertions(+), 23 deletions(-)
 rename dsl/{ => archived_dsl_data}/093025_power_dsl.csv (100%)
 rename dsl/{ => archived_dsl_data}/dsl_data_transform.R (100%)
 create mode 100644 dsl/dsl_aggregation.R

diff --git a/analysis_data/style_dict_variables.R b/analysis_data/style_dict_variables.R
index ff5c4cb..67e7385 100644
--- a/analysis_data/style_dict_variables.R
+++ b/analysis_data/style_dict_variables.R
@@ -4,33 +4,182 @@ library(tidyr)
 library(dplyr)
 library(purrr)
 
-main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
-main_df <- read.csv(main_csv, header = TRUE) 
+unified_csv <-"~/analysis_data/102425_unified.csv" # path of the unified CSV read on the next line
+unified_df <- read.csv(unified_csv, header = TRUE) # header = TRUE: first row supplies the column names
 
+BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR") # labels in the "BE" group
+SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE") # labels in the "SOL" group
 
-modal_verb_list <- c("will", "may", "can", "shall", "must", 
-                     "ought", "do", "need", "dare",
-                     "will not", "may not", "cannot", "shall not", 
-                     "must not", "do not", "don't", "need not",
-                     "dare not", "won't", "can't")
-modal_regex <- paste0("\\b(", paste(modal_verb_list, collapse = "|"), ")\\b")
+# Per-id share of human labels falling in the BE, SOL and VR label groups.
+# human_labels holds a single label or an R-style c("A", "B") literal.
+human_list_unified_df <- unified_df |>
+  # NA rows are dropped here, so the parser below never sees a missing value.
+  filter(!is.na(human_labels)) |>
+  mutate(list_human_labels = map(human_labels, \(raw_labels) {
+    if (str_detect(raw_labels, '^\\s*c\\(')) {
+      # NOTE(review): eval(parse()) executes whatever the CSV cell contains;
+      # safe only while the CSV is trusted, otherwise extract the quoted labels.
+      eval(parse(text = raw_labels))
+    } else {
+      raw_labels
+    }
+  })) |>
+  unnest(list_human_labels, keep_empty = TRUE) |>
+  filter(list_human_labels != "NA") |>
+  group_by(id) |>
+  summarise(
+    n_tags = sum(!is.na(list_human_labels)),
+    human_BE_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% BE_set)
+    ),
+    human_SOL_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% SOL_set)
+    ),
+    human_VR_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION"))
+    ),
+    .groups = "drop"
+  ) |>
+  select(-n_tags)
 
-main_df <- main_df |>
-  mutate(
-    comment_text = dplyr::coalesce(comment_text, ""), # handle NA
-    modal_verbs = stringr::str_count(comment_text, stringr::regex(modal_regex, ignore_case = TRUE)),
-    log1p_mv = log1p(modal_verbs)
-    )
+valid_categories <- c( # labels the OLMO pipelines below accept as-is
+  "EXPECTED BEHAVIOR", "MOTIVATION", "OBSERVED BUG BEHAVIOR", "BUG REPRODUCTION",
+  "INVESTIGATION AND EXPLORATION", "SOLUTION DISCUSSION", "CONTRIBUTION AND COMMITMENT",
+  "TASK PROGRESS", "TESTING", "FUTURE PLAN", "POTENTIAL NEW ISSUES AND REQUESTS",
+  "SOLUTION USAGE", "WORKAROUNDS", "ISSUE CONTENT MANAGEMENT", "ACTION ON ISSUE",
+  "SOCIAL CONVERSATION")
 
+# Sorted vector of the distinct OLMO labels that remain after normalisation.
+unique_olmo_labels <- unified_df |>
+  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) |>
+  # extract every run of text enclosed in single or double quotes
+  mutate(list_olmo_labels = str_extract_all(
+    olmo_sentence_labels,
+    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
+  )) |>
+  unnest(list_olmo_labels, keep_empty = TRUE) |>
+  # drop empty / NA / literal "NA" entries
+  filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") |>
+  # strip everything but letters and spaces, then collapse whitespace
+  mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) |>
+  filter(list_olmo_labels != "") |>
+  # Normalise to the spellings in valid_categories ("WORKAROUNDS", not
+  # "WORKAROUND", which is absent there and would become "INVALID LABEL").
+  mutate(olmo_label = case_when(
+    list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
+    list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~ "BUG REPRODUCTION",
+    !(list_olmo_labels %in% valid_categories) ~ "INVALID LABEL",
+    TRUE ~ list_olmo_labels
+  )) |>
+  pull(olmo_label) |> unique() |> sort()
 
-table(main_df$modal_verbs)
+print(unique_olmo_labels) # show the normalised label vocabulary for a manual check
+    
+# Per-id share of OLMO sentence labels in the BE, SOL and VR label groups.
+olmo_list_unified_df <- unified_df |>
+  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) |>
+  # extract every run of text enclosed in single or double quotes
+  mutate(list_olmo_labels = str_extract_all(
+    olmo_sentence_labels,
+    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
+  )) |>
+  unnest(list_olmo_labels, keep_empty = TRUE) |>
+  filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") |>
+  # strip everything but letters and spaces, then collapse whitespace
+  mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) |>
+  filter(list_olmo_labels != "") |>
+  # Normalise to the spellings in valid_categories ("WORKAROUNDS", not
+  # "WORKAROUND", which is absent there and would become "INVALID LABEL").
+  mutate(olmo_label = case_when(
+    list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
+    list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~ "BUG REPRODUCTION",
+    !(list_olmo_labels %in% valid_categories) ~ "INVALID LABEL",
+    TRUE ~ list_olmo_labels
+  )) |>
+  group_by(id) |>
+  summarise(
+    n_tags = sum(!is.na(olmo_label)),
+    olmo_BE_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% BE_set)
+    ),
+    olmo_SOL_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% SOL_set)
+    ),
+    olmo_VR_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION"))
+    ),
+    .groups = "drop"
+  ) |>
+  select(-n_tags)
+
+# Attach the per-id OLMO proportions to every row of unified_df.
+first_join <- left_join(
+  unified_df,
+  olmo_list_unified_df,
+  by = "id"
+)
+
+# Then attach the per-id human proportions on the same key.
+second_join <- left_join(
+  first_join, human_list_unified_df, by = "id"
+)
 library(ggdist)
-ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
-  stat_slabinterval() +
-  xlim(0, 5) + 
+# OLMO vs. human VR label proportion for each row, one panel per source.
+ggplot(second_join, aes(x = olmo_VR_prop, y = human_VR_prop,
+                        ymin = 0, ymax = 1)) +
+  facet_grid(~source, scales = "fixed") +
+  geom_point(shape = 21, alpha = 0.3, size = 2) +
+  geom_abline() +  # default intercept 0, slope 1: the line of perfect agreement
+  geom_smooth() +
+  xlim(0, 1) +
+  ylim(0, 1) +
+  scale_fill_viridis_d() +
+  theme_minimal() +
   labs(
-    title = "Distribution of modal_verbs by isAuthorWMF",
-    x = "Number of modal verbs in comment",
-    y = "isAuthorWMF"
-  ) +
-  theme_minimal()
+    # labels name the plotted *_VR_prop columns; map *_SOL_prop in aes() for solution share
+    title = "OLMO vs. human VR label proportion",
+    x = "OLMO VR label proportion",
+    y = "Human VR label proportion")
+
+# NOTE(review): if PC3 was intended, change y = PC1 in aes() and both labels together.
+ggplot(second_join, aes(x = modal_verbs, y = PC1, color = comment_type)) +
+  facet_grid(~source, scales = "fixed") +
+  geom_point(shape = 19, alpha = 0.3, size = 2) +
+  scale_fill_viridis_d() +  # NOTE(review): no fill is mapped here; colour scale intended?
+  xlim(0, 20) +
+  theme_minimal() +
+  labs(
+    title = "Modal Verbs v. PC1",
+    x = "modal verb count",
+    y = "PC1")
+
+
+# Boxplots of olmo_VR_prop by comment_type, filled by isAuthorWMF, per source.
+# NOTE(review): if modal-verb counts were intended, map y = modal_verbs instead.
+ggplot(second_join, aes(
+  x = as.factor(comment_type), y = olmo_VR_prop, fill = isAuthorWMF
+)) +
+  ylim(0, 1) +  # olmo_VR_prop is a mean of a logical, so it lies in [0, 1]
+  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
+  facet_grid(. ~ source, scales = "fixed") +
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "OLMO VR label proportion by comment type",
+    x = "Comment_type",
+    y = "OLMO VR label proportion",
+    fill = "isAuthorWMF?"
+  )
+
diff --git a/dsl/093025_power_dsl.csv b/dsl/archived_dsl_data/093025_power_dsl.csv
similarity index 100%
rename from dsl/093025_power_dsl.csv
rename to dsl/archived_dsl_data/093025_power_dsl.csv
diff --git a/dsl/dsl_data_transform.R b/dsl/archived_dsl_data/dsl_data_transform.R
similarity index 100%
rename from dsl/dsl_data_transform.R
rename to dsl/archived_dsl_data/dsl_data_transform.R
diff --git a/dsl/dsl_aggregation.R b/dsl/dsl_aggregation.R
new file mode 100644
index 0000000..ca8b6af
--- /dev/null
+++ b/dsl/dsl_aggregation.R
@@ -0,0 +1,16 @@
+library(tidyverse)
+
+unified_csv <- "~/analysis_data/102125_unified.csv"  # NOTE(review): style_dict_variables.R reads 102425_unified.csv; confirm the date
+unified_df <- read.csv(unified_csv, header = TRUE)  # load the CSV path declared on the previous line
+
+# 1. aggregate to the task level 
+#   1a. create human info proportions (ADAC/general)
+#   1b. create OLMO info proportions (ADAC/general)
+#   1c. 
+
+# 2. assign sampling prob for different tasks
+# refer to DSL specification sheet 
+
+# 3. check validity of different aggregate variables 
+
+# 4. save
\ No newline at end of file