adding some analysis of modal terms and olmo labels

2025-10-24 14:10:49 -07:00 · 2025-10-24 14:10:49 -07:00 · e955b4f50f
commit e955b4f50f
parent e5ca779900
4 changed files with 188 additions and 23 deletions
--- a/analysis_data/style_dict_variables.R
+++ b/analysis_data/style_dict_variables.R
@ -4,33 +4,182 @@ library(tidyr)
 library(dplyr)
 library(purrr)
-main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+unified_csv <-"~/analysis_data/102425_unified.csv"
-main_df <- read.csv(main_csv, header = TRUE) 
+unified_df <- read.csv(unified_csv, header = TRUE) 
 BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
 SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
-modal_verb_list <- c("will", "may", "can", "shall", "must", 
+human_list_unified_df <- unified_df %>%
-                     "ought", "do", "need", "dare",
+  filter(!is.na(human_labels))|>
-                     "will not", "may not", "cannot", "shall not", 
+  mutate(human_labels = tidyr::replace_na(human_labels, ""))|>
-                     "must not", "do not", "don't", "need not",
+  mutate(list_human_labels = map(human_labels, ~ {
-                     "dare not", "won't", "can't")
+    if (is.na(.x)) {
-modal_regex <- paste0("\\b(", paste(modal_verb_list, collapse = "|"), ")\\b")
+      NA_character_
    } else if (str_detect(.x, '^\\s*c\\(')) {
      eval(parse(text = .x))
    } else {
      .x
    }
  })) %>%
  unnest(list_human_labels, keep_empty = TRUE) |>
  filter(list_human_labels != "NA") |>
  group_by(id)|>
  summarise(
    n_tags = sum(!is.na(list_human_labels)),         
    human_BE_prop = if_else(
      n_tags == 0L, 
      NA_real_, 
      mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
    ),
    human_SOL_prop = if_else(
      n_tags == 0L, 
      NA_real_, 
      mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
    ),
    human_VR_prop = if_else(
      n_tags == 0L, 
      NA_real_, 
      mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR", 
                                    "SOLUTION DISCUSSION", "SOLUTION USAGE", 
                                    "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
    .groups = "drop"
  ) |>
  select(-n_tags)
-main_df <- main_df |>
+valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
-  mutate(
+                      'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
-    comment_text = dplyr::coalesce(comment_text, ""), # handle NA
+                      'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
-    modal_verbs = stringr::str_count(comment_text, stringr::regex(modal_regex, ignore_case = TRUE)),
+                      'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
-    log1p_mv = log1p(modal_verbs)
+                      'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
                      'SOCIAL CONVERSATION')
 # Sorted vector of the distinct OLMO sentence labels present in `unified_df`
 # after cleaning and mapping onto `valid_categories`. Printed as a sanity
 # check: anything outside the taxonomy shows up as "INVALID LABEL".
 unique_olmo_labels <- unified_df %>%
  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) %>%
  # Extract every single- or double-quoted token from the serialized label list.
  mutate(list_olmo_labels = str_extract_all(
    olmo_sentence_labels,
    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
  )) %>%
  unnest(list_olmo_labels, keep_empty = TRUE) %>%
  # drop empty / NA / literal "NA" entries
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) %>%
  # Keep letters and spaces only. The quote-to-quote pattern above can also
  # capture the separators between items (e.g. ", "); those collapse to ""
  # here and are removed by the next filter.
  mutate(
    list_olmo_labels = str_squish(
      str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )
  ) %>%
  filter(list_olmo_labels != "") %>%
  # Canonicalise spelling variants, then flag anything outside the taxonomy.
  # The canonical workaround label must be "WORKAROUNDS" (the spelling used in
  # `valid_categories`); normalising to "WORKAROUND" made every workaround
  # label fall through to "INVALID LABEL".
  mutate(olmo_label = case_when(
    list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
    list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~ "BUG REPRODUCTION",
    list_olmo_labels %in% valid_categories ~ list_olmo_labels,
    TRUE ~ "INVALID LABEL"
  )) %>%
  pull(olmo_label) %>%
  unique() %>%
  sort()
 print(unique_olmo_labels)
 # One row per `id` with the share of that id's OLMO sentence labels falling in
 # the BE (observed/expected behavior), SOL (solution discussion/usage) and VR
 # (BE + SOL + investigation) label sets. Labels outside `valid_categories`
 # are kept as "INVALID LABEL" and therefore still count in the denominator.
 olmo_list_unified_df <- unified_df %>%
  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) %>%
  # Extract every single- or double-quoted token from the serialized label list.
  mutate(list_olmo_labels = str_extract_all(
    olmo_sentence_labels,
    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
  )) %>%
  unnest(list_olmo_labels, keep_empty = TRUE) %>%
  # drop empty / NA / literal "NA" entries
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) %>%
  # Keep letters and spaces only; separator fragments captured by the regex
  # collapse to "" and are removed by the next filter.
  mutate(
    list_olmo_labels = str_squish(
      str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )
  ) %>%
  filter(list_olmo_labels != "") %>%
  # Canonicalise spelling variants, then flag anything outside the taxonomy.
  # "WORKAROUNDS" is the spelling used in `valid_categories`; normalising to
  # "WORKAROUND" turned every workaround label into "INVALID LABEL".
  mutate(olmo_label = case_when(
    list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
    list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~ "BUG REPRODUCTION",
    list_olmo_labels %in% valid_categories ~ list_olmo_labels,
    TRUE ~ "INVALID LABEL"
  )) %>%
  group_by(id) %>%
  summarise(
    n_tags = sum(!is.na(olmo_label)),
    # Guard against ids with no usable labels: report NA instead of NaN.
    olmo_BE_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
    ),
    olmo_SOL_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
    ),
    olmo_VR_prop = if_else(
      n_tags == 0L,
      NA_real_,
      mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
                             "SOLUTION DISCUSSION", "SOLUTION USAGE",
                             "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
    ),
    .groups = "drop"
  ) %>%
  # n_tags was only needed for the empty-group guard above.
  select(-n_tags)
 # Attach the per-id OLMO proportions to every row of the unified data ...
 first_join <- left_join(unified_df, olmo_list_unified_df, by = "id")
 # ... then the per-id human proportions. Rows whose id has no summary on the
 # right-hand side keep NA in the added columns (left join).
 second_join <- left_join(first_join, human_list_unified_df, by = "id")
 library(ggdist)
 # Scatter of OLMO vs. human VR tag proportions (share of labels in the BE,
 # SOL or investigation sets), one panel per `source`, one point per row of
 # `second_join`.
 # Labels name the VR columns that are actually plotted; they previously said
 # "solution %", which describes the *_SOL_prop columns instead.
 ggplot(second_join, aes(x = olmo_VR_prop,
                         y = human_VR_prop,
                         ymin = 0, ymax = 1)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  # Default abline (intercept 0, slope 1): the line of perfect agreement.
  geom_abline() +
  geom_smooth() +
  xlim(0, 1) +
  ylim(0, 1) +
  # NOTE(review): no fill aesthetic is mapped, so this scale currently has no
  # visible effect; kept in case a fill mapping is added.
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Tags of OLMO VR % and Human VR %",
    x = "OLMO VR % tag",
    y = "Human VR % tag"
  )
 # Scatter of modal verb count against PC1, coloured by `comment_type`, one
 # panel per `source`.
 # The title and y label previously said "PC3" while the plotted column is PC1;
 # labels now match the mapped variable.
 # NOTE(review): if PC3 was the intended component, change `y = PC1` and these
 # labels together.
 ggplot(second_join, aes(x = modal_verbs, y = PC1, color = comment_type)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 19, alpha = 0.3, size = 2) +
  # NOTE(review): only `color` is mapped, so this fill scale has no visible
  # effect; scale_colour_viridis_d() is presumably what was meant. Confirm
  # that comment_type is discrete before switching.
  scale_fill_viridis_d() +
  # xlim() drops rows with more than 20 modal verbs (ggplot2 warns about it).
  xlim(0, 20) +
  theme_minimal() +
  labs(
    title = "Modal Verbs v. PC1",
    x = "modal verb count",
    y = "PC1"
  )
-table(main_df$modal_verbs)
+ggplot(second_join, aes(
-library(ggdist)
+  x = as.factor(comment_type),    # x-axis grouping
-ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
+  y = olmo_VR_prop,
-  stat_slabinterval() +
+  fill = isAuthorWMF
-  xlim(0, 5) + 
+)) +
  ylim(0, 3) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") +   # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
-    title = "Distribution of modal_verbs by isAuthorWMF",
+    title = "Boxplot of modal verb usage",
-    x = "Number of modal verbs in comment",
+    x = "Comment_type",
-    y = "isAuthorWMF"
+    y = "Count of modal verbs",
-  ) +
+    fill = "isAuthorWMF?"
-  theme_minimal()
+  )
--- a/dsl/archived_dsl_data/093025_power_dsl.csv
+++ b/dsl/archived_dsl_data/093025_power_dsl.csv
--- a/dsl/archived_dsl_data/dsl_data_transform.R
+++ b/dsl/archived_dsl_data/dsl_data_transform.R
--- a/dsl/dsl_aggregation.R
+++ b/dsl/dsl_aggregation.R
@ -0,0 +1,16 @@
 library(tidyverse)
 # Path to the unified comment-level dataset and the data frame read from it.
 # NOTE(review): analysis_data/style_dict_variables.R reads 102425_unified.csv;
 # confirm which snapshot this script should use.
 unified_csv <- "~/analysis_data/102125_unified.csv"
 # Read from `unified_csv`; the previous `human_csv` is not defined in this
 # script, so the call failed with "object 'human_csv' not found".
 unified_df <- read.csv(unified_csv, header = TRUE)
 # 1. aggregate to the task level 
 #   1a. create human info proportions (ADAC/general)
 #   1b. create OLMO info proportions (ADAC/general)
 #   1c. 
 # 2. assign sampling prob for different tasks
 # refer to DSL specification sheet 
 # 3. check validity of different aggregate variables 
 # 4. save