1
0

adding some analysis of modal terms and olmo labels

This commit is contained in:
Matthew Gaughan 2025-10-24 14:10:49 -07:00
parent e5ca779900
commit e955b4f50f
4 changed files with 188 additions and 23 deletions

View File

@ -4,33 +4,182 @@ library(tidyr)
library(dplyr) library(dplyr)
library(purrr) library(purrr)
main_csv <- "~/analysis_data/100625_unified_w_affil.csv" unified_csv <-"~/analysis_data/102425_unified.csv"
main_df <- read.csv(main_csv, header = TRUE) unified_df <- read.csv(unified_csv, header = TRUE)
BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")
modal_verb_list <- c("will", "may", "can", "shall", "must", human_list_unified_df <- unified_df %>%
"ought", "do", "need", "dare", filter(!is.na(human_labels))|>
"will not", "may not", "cannot", "shall not", mutate(human_labels = tidyr::replace_na(human_labels, ""))|>
"must not", "do not", "don't", "need not", mutate(list_human_labels = map(human_labels, ~ {
"dare not", "won't", "can't") if (is.na(.x)) {
modal_regex <- paste0("\\b(", paste(modal_verb_list, collapse = "|"), ")\\b") NA_character_
} else if (str_detect(.x, '^\\s*c\\(')) {
eval(parse(text = .x))
} else {
.x
}
})) %>%
unnest(list_human_labels, keep_empty = TRUE) |>
filter(list_human_labels != "NA") |>
group_by(id)|>
summarise(
n_tags = sum(!is.na(list_human_labels)),
human_BE_prop = if_else(
n_tags == 0L,
NA_real_,
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR"), na.rm = TRUE)
),
human_SOL_prop = if_else(
n_tags == 0L,
NA_real_,
mean(list_human_labels %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE"), na.rm = TRUE)
),
human_VR_prop = if_else(
n_tags == 0L,
NA_real_,
mean(list_human_labels %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
"SOLUTION DISCUSSION", "SOLUTION USAGE",
"INVESTIGATION AND EXPLORATION"), na.rm = TRUE)
),
.groups = "drop"
) |>
select(-n_tags)
main_df <- main_df |> valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
mutate( 'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
comment_text = dplyr::coalesce(comment_text, ""), # handle NA 'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
modal_verbs = stringr::str_count(comment_text, stringr::regex(modal_regex, ignore_case = TRUE)), 'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
log1p_mv = log1p(modal_verbs) 'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
'SOCIAL CONVERSATION')
# Explode the stringified OLMO label lists into one row per label and map each
# label onto the category vocabulary.
#
# df           data frame with an `olmo_sentence_labels` character column
# valid_labels character vector of accepted category names
#
# Returns `df` with one row per extracted label and two added columns:
# `list_olmo_labels` (the cleaned raw label) and `olmo_label` (the normalised
# label, or "INVALID LABEL" when it is not in `valid_labels`). Rows without
# any usable label are dropped.
tidy_olmo_labels <- function(df, valid_labels) {
  df |>
    mutate(
      olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, ""),
      # Pull out everything that sits between a pair of single or double quotes.
      # The pattern also captures the separators between adjacent quoted labels
      # (e.g. ", "); the cleanup below reduces those to "" and they are dropped.
      list_olmo_labels = str_extract_all(
        olmo_sentence_labels,
        "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
      )
    ) |>
    unnest(list_olmo_labels, keep_empty = TRUE) |>
    # drop empty / NA / literal "NA" entries
    filter(
      !is.na(list_olmo_labels),
      list_olmo_labels != "",
      list_olmo_labels != "NA"
    ) |>
    # keep letters and spaces only, then collapse runs of whitespace
    mutate(
      list_olmo_labels = str_squish(
        str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
      )
    ) |>
    filter(list_olmo_labels != "") |>
    mutate(
      olmo_label = case_when(
        # The vocabulary spells this category "WORKAROUNDS"; normalising to the
        # singular form would make every workaround label fail the validity
        # check below and be reported as "INVALID LABEL".
        list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND") ~ "WORKAROUNDS",
        list_olmo_labels %in% c("BUG REPORT", "BUG REPRODUCTION") ~
          "BUG REPRODUCTION",
        !(list_olmo_labels %in% valid_labels) ~ "INVALID LABEL",
        TRUE ~ list_olmo_labels
      )
    )
}

olmo_labels_long <- tidy_olmo_labels(unified_df, valid_categories)

# Sorted set of distinct normalised labels, printed as a sanity check on what
# the model actually produced.
unique_olmo_labels <- olmo_labels_long |>
  pull(olmo_label) |>
  unique() |>
  sort()
print(unique_olmo_labels)

# Per-id share of OLMO labels falling into each label group. Every group has at
# least one non-NA label at this point (NA/empty labels were filtered out
# above), so no zero-tag guard is needed; ids with no usable label are simply
# absent from the result.
olmo_list_unified_df <- olmo_labels_long |>
  group_by(id) |>
  summarise(
    olmo_BE_prop = mean(
      olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
    ),
    olmo_SOL_prop = mean(
      olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE")
    ),
    olmo_VR_prop = mean(
      olmo_label %in% c(
        "OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
        "SOLUTION DISCUSSION", "SOLUTION USAGE",
        "INVESTIGATION AND EXPLORATION"
      )
    ),
    .groups = "drop"
  )
# Attach the per-id OLMO proportions to the unified data, then the per-id
# human proportions. Left joins keep every row of `unified_df`; ids missing
# from either summary table get NA in that table's columns.
first_join <- left_join(unified_df, olmo_list_unified_df, by = "id")
second_join <- left_join(first_join, human_list_unified_df, by = "id")
library(ggdist)
# Scatter of the per-id OLMO VR tag proportion against the human VR tag
# proportion, one panel per `source`. The labels name the VR proportions
# because those are the columns mapped to the axes (the values are
# proportions in [0, 1], not percentages).
ggplot(second_join, aes(x = olmo_VR_prop,
                        y = human_VR_prop,
                        ymin = 0, ymax = 1)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  # default abline (intercept 0, slope 1): the line of perfect agreement
  geom_abline() +
  geom_smooth() +
  xlim(0, 1) +
  ylim(0, 1) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "OLMO VR tag proportion vs. human VR tag proportion",
    x = "OLMO VR tag proportion",
    y = "Human VR tag proportion"
  )
ggplot(second_join, aes(x = modal_verbs, y = PC1, color=comment_type)) +
facet_grid(~source, scales="fixed") +
geom_point(shape = 19, alpha=0.3, size=2) +
scale_fill_viridis_d() +
xlim(0, 20) +
theme_minimal() +
labs(
title = "Modal Verbs v. PC3",
x = "modal verb count",
y = "PC3",
) )
table(main_df$modal_verbs) ggplot(second_join, aes(
library(ggdist) x = as.factor(comment_type), # x-axis grouping
ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) + y = olmo_VR_prop,
stat_slabinterval() + fill = isAuthorWMF
xlim(0, 5) + )) +
ylim(0, 3) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
scale_fill_viridis_d() +
theme_minimal() +
labs( labs(
title = "Distribution of modal_verbs by isAuthorWMF", title = "Boxplot of modal verb usage",
x = "Number of modal verbs in comment", x = "Comment_type",
y = "isAuthorWMF" y = "Count of modal verbs",
) + fill = "isAuthorWMF?"
theme_minimal() )

16
dsl/dsl_aggregation.R Normal file
View File

@ -0,0 +1,16 @@
library(tidyverse)
# Load the unified dataset. The path is held in `unified_csv`; the previous
# call passed `human_csv`, which is never defined in this script.
unified_csv <- "~/analysis_data/102125_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
# 1. aggregate to the task level
# 1a. create human info proportions (ADAC/general)
# 1b. create OLMO info proportions (ADAC/general)
# 1c.
# 2. assign sampling prob for different tasks
# refer to DSL specification sheet
# 3. check validity of different aggregate variables
# 4. save