1
0

adding some analysis of modal terms and olmo labels

This commit is contained in:
Matthew Gaughan 2025-10-24 14:10:49 -07:00
parent e5ca779900
commit e955b4f50f
4 changed files with 188 additions and 23 deletions

View File

@ -4,33 +4,182 @@ library(tidyr)
library(dplyr)
library(purrr)
# Locations of the two input tables.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
unified_csv <- "~/analysis_data/102425_unified.csv"

# Load both tables; the first row of each file supplies the column names.
main_df <- read.csv(file = main_csv, header = TRUE)
unified_df <- read.csv(file = unified_csv, header = TRUE)
# Label groups that are reused when computing per-id label proportions.
BE_set <- c(
  "OBSERVED BUG BEHAVIOR",
  "EXPECTED BEHAVIOR"
)
SOL_set <- c(
  "SOLUTION DISCUSSION",
  "SOLUTION USAGE"
)

# Words and phrases counted as modal verbs, affirmative forms first and then
# their negated / contracted forms.
modal_verb_list <- c(
  "will", "may", "can", "shall", "must", "ought", "do", "need", "dare",
  "will not", "may not", "cannot", "shall not", "must not", "do not",
  "don't", "need not", "dare not", "won't", "can't"
)

# One alternation of every entry, anchored on word boundaries at both ends.
modal_regex <- sprintf("\\b(%s)\\b", paste(modal_verb_list, collapse = "|"))
# Per-id proportions of human-assigned labels that fall into the BE, SOL and
# VR label groups.
#
# Reads `unified_df$human_labels`, where each cell is either a single label or
# the text of an R `c("A", "B", ...)` call, expands it to one row per label,
# and summarises by `id`. The result has the columns `id`, `human_BE_prop`,
# `human_SOL_prop` and `human_VR_prop`. Ids with no usable human label are
# absent from the result, so they become NA after a left join.
human_list_unified_df <- unified_df |>
  filter(!is.na(human_labels)) |>
  mutate(list_human_labels = map(human_labels, \(raw_labels) {
    if (str_detect(raw_labels, "^\\s*c\\(")) {
      # NOTE(review): eval(parse()) executes whatever text is stored in the
      # CSV cell. This is only acceptable while the file is trusted; consider
      # extracting the quoted labels with a regex instead.
      eval(parse(text = raw_labels))
    } else {
      raw_labels
    }
  })) |>
  unnest(list_human_labels, keep_empty = TRUE) |>
  # Drop missing, blank and literal "NA" entries. Blank cells arrive from
  # read.csv() as "" rather than NA; without this filter they would count as a
  # tag and give the id a proportion of 0 instead of leaving it unlabelled.
  filter(
    !is.na(list_human_labels),
    list_human_labels != "",
    list_human_labels != "NA"
  ) |>
  group_by(id) |>
  summarise(
    # Every remaining group has at least one non-missing label, and %in%
    # never returns NA, so the means are always defined.
    human_BE_prop = mean(list_human_labels %in% BE_set),
    human_SOL_prop = mean(list_human_labels %in% SOL_set),
    human_VR_prop = mean(
      list_human_labels %in% c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION")
    ),
    .groups = "drop"
  )
# Count modal verbs in every comment of main_df.
main_df <- main_df |>
  # Missing comment text is treated as an empty comment.
  mutate(comment_text = dplyr::coalesce(comment_text, "")) |>
  # Case-insensitive count of modal_regex matches per comment.
  mutate(
    modal_verbs = stringr::str_count(
      comment_text,
      stringr::regex(modal_regex, ignore_case = TRUE)
    )
  ) |>
  # log(1 + count) of the modal verb count.
  mutate(log1p_mv = log1p(modal_verbs))
# Labels accepted as valid OLMO sentence labels.
valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
                      'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
                      'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
                      'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
                      'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
                      'SOCIAL CONVERSATION')

# Expand `olmo_sentence_labels` into one row per cleaned, canonical label.
#
# @param df Data frame with a character column `olmo_sentence_labels` whose
#   cells hold quoted labels (single or double quotes).
# @param valid Character vector of accepted labels.
# @return `df` in long format with two added columns: `list_olmo_labels` (the
#   cleaned label text) and `olmo_label` (the canonical label, or
#   "INVALID LABEL" when the cleaned text is not in `valid`). Rows without any
#   usable label are dropped.
extract_olmo_labels <- function(df, valid = valid_categories) {
  df |>
    mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) |>
    mutate(list_olmo_labels = str_extract_all(
      olmo_sentence_labels,
      "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
    )) |>
    unnest(list_olmo_labels, keep_empty = TRUE) |>
    # drop empty / NA / literal "NA" entries
    filter(
      !is.na(list_olmo_labels),
      list_olmo_labels != "",
      list_olmo_labels != "NA"
    ) |>
    # The quote-delimited regex also captures the separators between two
    # labels (e.g. ", "); removing everything except letters and spaces turns
    # those into "" so the next filter discards them.
    mutate(
      list_olmo_labels = str_squish(
        str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
      )
    ) |>
    filter(list_olmo_labels != "") |>
    mutate(
      olmo_label = case_when(
        # Canonical spelling must be the one in `valid` ("WORKAROUNDS");
        # mapping to "WORKAROUND" made every workaround label invalid.
        list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
        list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~ "BUG REPRODUCTION",
        !(list_olmo_labels %in% valid) ~ "INVALID LABEL",
        TRUE ~ list_olmo_labels
      )
    )
}

# Sorted set of distinct canonical labels OLMO produced.
unique_olmo_labels <- extract_olmo_labels(unified_df) |>
  pull(olmo_label) |>
  unique() |>
  sort()

table(main_df$modal_verbs)
print(unique_olmo_labels)

# Per-id proportions of OLMO labels that fall into the BE, SOL and VR label
# groups. Ids with no usable OLMO label are absent from the result, so they
# become NA after a left join.
olmo_list_unified_df <- extract_olmo_labels(unified_df) |>
  group_by(id) |>
  summarise(
    # Every group has at least one non-missing label, and %in% never returns
    # NA, so the means are always defined.
    olmo_BE_prop = mean(olmo_label %in% BE_set),
    olmo_SOL_prop = mean(olmo_label %in% SOL_set),
    olmo_VR_prop = mean(
      olmo_label %in% c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION")
    ),
    .groups = "drop"
  )
# Attach the per-id OLMO proportions to every row of unified_df.
first_join <- left_join(unified_df, olmo_list_unified_df, by = "id")

# Then attach the per-id human proportions as well.
second_join <- left_join(first_join, human_list_unified_df, by = "id")
library(ggdist)
# NOTE(review): the original lines fused two ggplot() calls into one `+` chain
# and left the second plot's labs() arguments dangling, which does not parse.
# They are separated here by data source and label content; confirm the split
# matches what was intended.

# Plot 1: distribution of modal-verb counts for each value of isAuthorWMF.
ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
  stat_slabinterval() +
  xlim(0, 5) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Distribution of modal_verbs by isAuthorWMF",
    x = "Number of modal verbs in comment",
    y = "isAuthorWMF"
  )

# Plot 2: OLMO vs. human VR proportion per row, one facet per source, with
# the identity line (geom_abline) and a smoothed trend for comparison.
# NOTE(review): the labels say "solution %" while the plotted columns are the
# *_VR_prop values; confirm which is intended.
ggplot(second_join, aes(x = olmo_VR_prop,
                        y = human_VR_prop,
                        ymin = 0, ymax = 1)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  geom_abline() +
  geom_smooth() +
  xlim(0, 1) +
  ylim(0, 1) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Tags of OLMO solution % and Human solution %",
    x = "OLMO solution % tag",
    y = "Human solution % tag"
  )
# Scatter of modal-verb count against PC1, coloured by comment type, with one
# facet per source.
# NOTE(review): the labels previously said "PC3" while the plotted column is
# PC1; the labels now match the data. If PC3 was intended, change the `y`
# mapping and the labels together.
ggplot(second_join, aes(x = modal_verbs, y = PC1, color = comment_type)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 19, alpha = 0.3, size = 2) +
  # The mapped aesthetic is `color`, so the colour scale (not the fill scale)
  # is the one that takes effect. Assumes comment_type is discrete.
  scale_color_viridis_d() +
  xlim(0, 20) +
  theme_minimal() +
  labs(
    title = "Modal Verbs v. PC1",
    x = "modal verb count",
    y = "PC1"
  )
# Boxplots of olmo_VR_prop for each comment type, split by isAuthorWMF, with
# one facet per source.
# The title and y label previously described modal-verb counts although the
# plotted column is olmo_VR_prop; they now name the plotted column.
ggplot(second_join, aes(
  x = as.factor(comment_type), # x-axis grouping
  y = olmo_VR_prop,
  fill = isAuthorWMF
)) +
  # olmo_VR_prop is a mean of logical values, so it lies in [0, 1].
  ylim(0, 1) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of olmo_VR_prop by comment type",
    x = "Comment_type",
    y = "olmo_VR_prop",
    fill = "isAuthorWMF?"
  )

16
dsl/dsl_aggregation.R Normal file
View File

@ -0,0 +1,16 @@
library(tidyverse)
# Path of the unified input table.
unified_csv <- "~/analysis_data/102125_unified.csv"
# Read from the path defined above; `human_csv` is never defined in this file.
unified_df <- read.csv(unified_csv, header = TRUE)
# 1. aggregate to the task level
# 1a. create human info proportions (ADAC/general)
# 1b. create OLMO info proportions (ADAC/general)
# 1c.
# 2. assign sampling prob for different tasks
# refer to DSL specification sheet
# 3. check validity of different aggregate variables
# 4. save