adding some analysis of modal terms and olmo labels
This commit is contained in:
parent
e5ca779900
commit
e955b4f50f
@ -4,33 +4,182 @@ library(tidyr)
|
||||
library(dplyr)
|
||||
library(purrr)
|
||||
|
||||
# Load the two CSV inputs used by the rest of the script.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
unified_csv <- "~/analysis_data/102425_unified.csv"

main_df <- read.csv(file = main_csv, header = TRUE)
unified_df <- read.csv(file = unified_csv, header = TRUE)
|
||||
|
||||
# Label groups used when computing per-id label proportions.
BE_set <- c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
SOL_set <- c("SOLUTION DISCUSSION", "SOLUTION USAGE")

# Modal verbs together with their negated and contracted forms.
modal_verb_list <- c(
  "will", "may", "can", "shall", "must",
  "ought", "do", "need", "dare",
  "will not", "may not", "cannot", "shall not",
  "must not", "do not", "don't", "need not",
  "dare not", "won't", "can't"
)

# Single alternation over every entry, anchored on word boundaries.
modal_regex <- sprintf("\\b(%s)\\b", paste(modal_verb_list, collapse = "|"))
|
||||
# Per-id proportions of human-assigned labels that fall in the behaviour (BE),
# solution (SOL) and combined VR label groups. The result has one row per id
# that has at least one usable human label.
#
# A human_labels cell is either a single label or the text of an R character
# vector such as c("A", "B"). The vector form is split by pulling out the
# quoted strings with a regex instead of eval(parse()), so cell contents read
# from the CSV are never executed as code.
human_list_unified_df <- unified_df |>
  filter(!is.na(human_labels)) |>
  mutate(list_human_labels = map(human_labels, \(raw) {
    if (stringr::str_detect(raw, "^\\s*c\\(")) {
      # Column 2 of the match matrix is the capture group (text between quotes).
      stringr::str_match_all(raw, "[\"']([^\"']*)[\"']")[[1]][, 2]
    } else {
      raw
    }
  })) |>
  unnest(list_human_labels, keep_empty = TRUE) |>
  # Drop missing, empty and literal "NA" entries so that unlabelled rows do
  # not count as tags (blank CSV cells arrive as "" rather than NA).
  filter(
    !is.na(list_human_labels),
    list_human_labels != "",
    list_human_labels != "NA"
  ) |>
  group_by(id) |>
  summarise(
    # Every remaining group has at least one label, so mean() is well defined.
    human_BE_prop = mean(list_human_labels %in% BE_set),
    human_SOL_prop = mean(list_human_labels %in% SOL_set),
    human_VR_prop = mean(
      list_human_labels %in% c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION")
    ),
    .groups = "drop"
  )
|
||||
|
||||
# Count modal-verb matches per comment and add a log1p-transformed copy.
main_df <- main_df |>
  # Missing comment text is treated as an empty string.
  mutate(comment_text = dplyr::coalesce(comment_text, "")) |>
  mutate(
    modal_verbs = stringr::str_count(
      comment_text,
      stringr::regex(modal_regex, ignore_case = TRUE)
    )
  ) |>
  mutate(log1p_mv = log1p(modal_verbs))
|
||||
# Labels accepted for OLMO sentence tags; anything else is reported as
# "INVALID LABEL".
valid_categories <- c(
  "EXPECTED BEHAVIOR", "MOTIVATION", "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION", "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT", "TASK PROGRESS", "TESTING", "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS", "SOLUTION USAGE",
  "WORKAROUNDS", "ISSUE CONTENT MANAGEMENT", "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)

# Map cleaned OLMO label strings onto valid_categories.
# Known variants are folded into the spelling used in valid_categories
# ("WORKAROUND" -> "WORKAROUNDS", "BUG REPORT" -> "BUG REPRODUCTION"); every
# other string that is not a valid category becomes "INVALID LABEL".
canonical_olmo_label <- function(label) {
  label <- dplyr::case_when(
    label == "WORKAROUND" ~ "WORKAROUNDS",
    label == "BUG REPORT" ~ "BUG REPRODUCTION",
    TRUE ~ label
  )
  dplyr::if_else(label %in% valid_categories, label, "INVALID LABEL")
}

# Expand olmo_sentence_labels into one row per label, adding the columns
# list_olmo_labels (cleaned text) and olmo_label (canonical category).
olmo_labels_long <- function(df) {
  df |>
    mutate(olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")) |>
    # Take every run of text that sits between a pair of single or double quotes.
    mutate(list_olmo_labels = stringr::str_extract_all(
      olmo_sentence_labels,
      "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
    )) |>
    unnest(list_olmo_labels, keep_empty = TRUE) |>
    # drop empty / NA / literal "NA" entries
    filter(
      !is.na(list_olmo_labels),
      list_olmo_labels != "",
      list_olmo_labels != "NA"
    ) |>
    # Keep letters and spaces only. The separators between quoted items are
    # also captured by the pattern above; they become "" here and are removed.
    mutate(list_olmo_labels = stringr::str_squish(
      stringr::str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )) |>
    filter(list_olmo_labels != "") |>
    mutate(olmo_label = canonical_olmo_label(list_olmo_labels))
}

olmo_long <- olmo_labels_long(unified_df)

# Sorted set of canonical labels that occur in the data.
unique_olmo_labels <- sort(unique(olmo_long$olmo_label))

table(main_df$modal_verbs)
print(unique_olmo_labels)

# Per-id proportions of OLMO labels in the behaviour (BE), solution (SOL) and
# combined VR label groups. Ids without any usable label have no row here.
olmo_list_unified_df <- olmo_long |>
  group_by(id) |>
  summarise(
    # Every group has at least one label after the filters above.
    olmo_BE_prop = mean(
      olmo_label %in% c("OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR")
    ),
    olmo_SOL_prop = mean(
      olmo_label %in% c("SOLUTION DISCUSSION", "SOLUTION USAGE")
    ),
    olmo_VR_prop = mean(
      olmo_label %in% c(
        "OBSERVED BUG BEHAVIOR", "EXPECTED BEHAVIOR",
        "SOLUTION DISCUSSION", "SOLUTION USAGE",
        "INVESTIGATION AND EXPLORATION"
      )
    ),
    .groups = "drop"
  )
|
||||
|
||||
# Attach the per-id OLMO proportions, then the per-id human proportions, to
# the unified table. Left joins keep every row of unified_df.
first_join <- left_join(unified_df, olmo_list_unified_df, by = "id")
second_join <- left_join(first_join, human_list_unified_df, by = "id")
|
||||
library(ggdist)

# The statements of the two plots below were interleaved, which neither parses
# nor is valid ggplot2 (a ggplot object cannot be added to another one). They
# are written here as two independent plots.
# NOTE(review): confirm that both plots are still wanted.

# Modal-verb counts per comment, split by isAuthorWMF.
ggplot(main_df, aes(x = modal_verbs, y = isAuthorWMF)) +
  stat_slabinterval() +
  xlim(0, 5) +
  labs(
    title = "Distribution of modal_verbs by isAuthorWMF",
    x = "Number of modal verbs in comment",
    y = "isAuthorWMF"
  ) +
  theme_minimal()

# OLMO proportion against human proportion per id, one panel per source;
# geom_abline() with its defaults draws the y = x reference line.
# NOTE(review): the labels say "solution %" but the plotted columns are the
# VR proportions - confirm which one is intended.
ggplot(second_join, aes(
  x = olmo_VR_prop,
  y = human_VR_prop,
  ymin = 0, ymax = 1
)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  geom_abline() +
  geom_smooth() +
  xlim(0, 1) +
  ylim(0, 1) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Tags of OLMO solution % and Human solution %",
    x = "OLMO solution % tag",
    y = "Human solution % tag"
  )
|
||||
|
||||
# Modal-verb count against PC1, coloured by comment_type, one panel per source.
# The title and y label name PC1 so that they match the plotted column.
ggplot(second_join, aes(x = modal_verbs, y = PC1, color = comment_type)) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 19, alpha = 0.3, size = 2) +
  # NOTE(review): only `color` is mapped, so this fill scale has no visible
  # effect; scale_color_viridis_d() may be what was intended.
  scale_fill_viridis_d() +
  xlim(0, 20) +
  theme_minimal() +
  labs(
    title = "Modal Verbs v. PC1",
    x = "modal verb count",
    y = "PC1"
  )
|
||||
|
||||
|
||||
# Boxplots of the per-id OLMO VR proportion for each comment type, filled by
# isAuthorWMF and faceted by source. The title and y label describe the
# plotted column (olmo_VR_prop), and the y range matches a proportion.
ggplot(second_join, aes(
  x = as.factor(comment_type), # x-axis grouping
  y = olmo_VR_prop,
  fill = isAuthorWMF
)) +
  ylim(0, 1) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of OLMO VR proportion",
    x = "Comment_type",
    y = "OLMO VR proportion",
    fill = "isAuthorWMF?"
  )
|
||||
|
||||
|
||||
16
dsl/dsl_aggregation.R
Normal file
16
dsl/dsl_aggregation.R
Normal file
@ -0,0 +1,16 @@
|
||||
library(tidyverse)
|
||||
|
||||
# Load the unified table; the path variable defined here is the one that is
# read (the previous call referenced an undefined `human_csv`).
unified_csv <- "~/analysis_data/102125_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
|
||||
|
||||
# 1. aggregate to the task level
|
||||
# 1a. create human info proportions (ADAC/general)
|
||||
# 1b. create OLMO info proportions (ADAC/general)
|
||||
# 1c.
|
||||
|
||||
# 2. assign sampling prob for different tasks
|
||||
# refer to DSL specification sheet
|
||||
|
||||
# 3. check validity of different aggregate variables
|
||||
|
||||
# 4. save
|
||||
Loading…
Reference in New Issue
Block a user