From e955b4f50fa5642daa3e63bf17fa63e6312fab25 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Fri, 24 Oct 2025 14:10:49 -0700
Subject: [PATCH] adding some analysis of modal terms and olmo labels

---
Review notes (text between the "---" line and the first "diff --git" is not
part of the commit message):
 * dsl/dsl_aggregation.R: read.csv() now reads `unified_csv`; the previous
   argument `human_csv` is never defined in that file.
 * analysis_data/style_dict_variables.R: workaround labels are normalised to
   "WORKAROUNDS", the spelling listed in `valid_categories`. Normalising to
   "WORKAROUND" caused the validity check that follows to relabel every
   workaround tag as "INVALID LABEL".
 * Line counts are unchanged, so the hunk headers and diffstat still hold.
   The commit id and the post-image blob ids on the `index` lines predate
   these edits.
 * Leading indentation and blank context lines were reconstructed from a
   whitespace-collapsed copy; if they do not match the target file exactly,
   `git apply --ignore-whitespace` may be needed.
 * TODO confirm: the second plot maps y = PC1 but is titled/labelled "PC3";
   the third plot maps y = olmo_VR_prop but is labelled as a modal-verb count
   and uses ylim(0, 3).

 analysis_data/style_dict_variables.R          | 195 +++++++++++++++---
 .../093025_power_dsl.csv                      |   0
 .../dsl_data_transform.R                      |   0
 dsl/dsl_aggregation.R                         |  16 ++
 4 files changed, 188 insertions(+), 23 deletions(-)
 rename dsl/{ => archived_dsl_data}/093025_power_dsl.csv (100%)
 rename dsl/{ => archived_dsl_data}/dsl_data_transform.R (100%)
 create mode 100644 dsl/dsl_aggregation.R

diff --git a/analysis_data/style_dict_variables.R b/analysis_data/style_dict_variables.R
index ff5c4cb..67e7385 100644
--- a/analysis_data/style_dict_variables.R
+++ b/analysis_data/style_dict_variables.R
@@ -4,33 +4,182 @@ library(tidyr)
 library(dplyr)
 library(purrr)
 
-main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
-main_df <- read.csv(main_csv, header = TRUE)
+unified_csv <-"~/analysis_data/102425_unified.csv"
+unified_df <- read.csv(unified_csv, header = TRUE)
+BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
+SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
 
-modal_verb_list <- c("will", "may", "can", "shall", "must",
-                     "ought", "do", "need", "dare",
-                     "will not", "may not", "cannot", "shall not",
-                     "must not", "do not", "don't", "need not",
-                     "dare not", "won't", "can't")
-modal_regex <- paste0("\\b(", paste(modal_verb_list, collapse = "|"), ")\\b")
+human_list_unified_df <- unified_df %>%
+  filter(!is.na(human_labels))|>
+  mutate(human_labels = tidyr::replace_na(human_labels, ""))|>
+  mutate(list_human_labels = map(human_labels, ~ {
+    if (is.na(.x)) {
+      NA_character_
+    } else if (str_detect(.x, '^\\s*c\\(')) {
+      eval(parse(text = .x))  # NOTE(review): executes CSV text as R code; only safe on trusted data
+    } else {
+      .x
+    }
+  })) %>%
+  unnest(list_human_labels, keep_empty = TRUE) |>
+  filter(list_human_labels != "NA") |>
+  group_by(id)|>
+  summarise(
+    n_tags = sum(!is.na(list_human_labels)),
+    human_BE_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
+    ),
+    human_SOL_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
+    ),
+    human_VR_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
+                                    "SOLUTION DISCUSSION", "SOLUTION USAGE",
+                                    "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    .groups = "drop"
+  ) |>
+  select(-n_tags)
 
-main_df <- main_df |>
-  mutate(
-    comment_text = dplyr::coalesce(comment_text, ""), # handle NA
-    modal_verbs = stringr::str_count(comment_text, stringr::regex(modal_regex, ignore_case = TRUE)),
-    log1p_mv = log1p(modal_verbs)
-  )
+valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
+                      'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
+                      'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
+                      'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
+                      'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
+                      'SOCIAL CONVERSATION')
+unique_olmo_labels <- unified_df %>%
+  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) %>%
+  mutate(list_olmo_labels = str_extract_all(
+    olmo_sentence_labels,
+    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
+  )) %>%
+  unnest(list_olmo_labels, keep_empty = TRUE) %>%
+  # drop empty / NA / literal "NA" entries
+  filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") %>%
+  mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) %>%
+  filter(list_olmo_labels != "") %>%
+  mutate(olmo_label = ifelse(list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND"),
+                             "WORKAROUNDS",
+                             list_olmo_labels))|>
+  mutate(olmo_label = ifelse(olmo_label %in% c("BUG REPORT", "BUG REPRODUCTION"),
+                             "BUG REPRODUCTION",
+                             olmo_label))|>
+  mutate(olmo_label = ifelse(!(olmo_label %in% valid_categories),
+                             "INVALID LABEL",
+                             olmo_label))|>
+  pull(olmo_label) %>%
+  unique() %>%
+  sort()
 
-table(main_df$modal_verbs)
+print(unique_olmo_labels)
+
+olmo_list_unified_df <- unified_df %>%
+  mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""))|>
+  mutate(list_olmo_labels = str_extract_all(
+    olmo_sentence_labels,
+    "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
+  )) %>%
+  unnest(list_olmo_labels, keep_empty = TRUE) |>
+  filter(!is.na(list_olmo_labels), list_olmo_labels != "", list_olmo_labels != "NA") %>%
+  mutate(list_olmo_labels = str_squish(str_replace_all(list_olmo_labels, "[^\\p{L} ]+", ""))) %>%
+  filter(list_olmo_labels != "") %>%
+  mutate(olmo_label = ifelse(list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND"),
+                             "WORKAROUNDS",
+                             list_olmo_labels))|>
+  mutate(olmo_label = ifelse(olmo_label %in% c("BUG REPORT", "BUG REPRODUCTION"),
+                             "BUG REPRODUCTION",
+                             olmo_label))|>
+  mutate(olmo_label = ifelse(!(olmo_label %in% valid_categories),
+                             "INVALID LABEL",
+                             olmo_label))|>
+  group_by(id)|>
+  summarise(
+    n_tags = sum(!is.na(olmo_label)),
+    olmo_BE_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
+    ),
+    olmo_SOL_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
+    ),
+    olmo_VR_prop = if_else(
+      n_tags == 0L,
+      NA_real_,
+      mean(olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
+                             "SOLUTION DISCUSSION", "SOLUTION USAGE",
+                             "INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
+    ),
+    .groups = "drop"
+  ) |>
+  select(-n_tags)
+
+first_join <- unified_df |>
+  left_join(
+    olmo_list_unified_df,
+    by="id"
+  )
+
+second_join <- first_join |>
+  left_join(
+    human_list_unified_df,
+    by="id"
+  )
 
 library(ggdist)
 
-ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
-  stat_slabinterval() +
-  xlim(0, 5) +
+ggplot(second_join, aes(x = olmo_VR_prop,
+                        y = human_VR_prop,
+                        ymin = 0, ymax = 1)) +
+  facet_grid(~source, scales="fixed") +
+  geom_point(shape = 21, alpha=0.3, size=2) +
+  geom_abline() +
+  geom_smooth()+
+  xlim(0, 1) +
+  ylim(0, 1) +
+  scale_fill_viridis_d() +
+  theme_minimal() +
   labs(
-    title = "Distribution of modal_verbs by isAuthorWMF",
-    x = "Number of modal verbs in comment",
-    y = "isAuthorWMF"
-  ) +
-  theme_minimal()
+    title = "Tags of OLMO solution % and Human solution %",
+    x = "OLMO solution % tag",
+    y = "Human solution % tag",
+  )
+
+ggplot(second_join, aes(x = modal_verbs, y = PC1, color=comment_type)) +
+  facet_grid(~source, scales="fixed") +
+  geom_point(shape = 19, alpha=0.3, size=2) +
+  scale_fill_viridis_d() +
+  xlim(0, 20) +
+  theme_minimal() +
+  labs(
+    title = "Modal Verbs v. PC3",
+    x = "modal verb count",
+    y = "PC3",
+  )
+
+
+ggplot(second_join, aes(
+  x = as.factor(comment_type), # x-axis grouping
+  y = olmo_VR_prop,
+  fill = isAuthorWMF
+)) +
+  ylim(0, 3) +
+  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
+  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
+  scale_fill_viridis_d() +
+  theme_minimal() +
+  labs(
+    title = "Boxplot of modal verb usage",
+    x = "Comment_type",
+    y = "Count of modal verbs",
+    fill = "isAuthorWMF?"
+  )
+
diff --git a/dsl/093025_power_dsl.csv b/dsl/archived_dsl_data/093025_power_dsl.csv
similarity index 100%
rename from dsl/093025_power_dsl.csv
rename to dsl/archived_dsl_data/093025_power_dsl.csv
diff --git a/dsl/dsl_data_transform.R b/dsl/archived_dsl_data/dsl_data_transform.R
similarity index 100%
rename from dsl/dsl_data_transform.R
rename to dsl/archived_dsl_data/dsl_data_transform.R
diff --git a/dsl/dsl_aggregation.R b/dsl/dsl_aggregation.R
new file mode 100644
index 0000000..ca8b6af
--- /dev/null
+++ b/dsl/dsl_aggregation.R
@@ -0,0 +1,16 @@
+library(tidyverse)
+
+unified_csv <-"~/analysis_data/102125_unified.csv"
+unified_df <- read.csv(unified_csv, header = TRUE)
+
+# 1. aggregate to the task level
+# 1a. create human info proportions (ADAC/general)
+# 1b. create OLMO info proportions (ADAC/general)
+# 1c.
+
+# 2. assign sampling prob for different tasks
+# refer to DSL specification sheet
+
+# 3. check validity of different aggregate variables
+
+# 4. save
\ No newline at end of file