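# Exploratory analysis script: loads the unified CSV, summarises human and
# OLMO label proportions per id, joins them back onto the data, and draws
# comparison plots.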
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# Path of the unified dataset (a CSV file with a header row).
unified_csv <- "~/analysis_data/102725_unified.csv"

# Read the whole file into a data frame; the first row supplies column names.
unified_df <- read.csv(file = unified_csv, header = TRUE)

# Boxplots of `leng` (x axis) for each level of `isAuthorWMF` (y axis),
# drawn in one panel per `source` with shared axis scales.
# No fill aesthetic is mapped in this plot, so the fill scale has nothing
# to act on; it is kept as in the original.
ggplot(
  data = unified_df,
  mapping = aes(x = leng, y = as.factor(isAuthorWMF))
) +
  geom_boxplot(position = position_dodge(width = 0.9), alpha = 0.7) +
  facet_grid(cols = vars(source), scales = "fixed") +
  scale_fill_viridis_d() +
  theme_minimal()

# Named groups of label categories.
BE_set <- c(
  "OBSERVED BUG BEHAVIOR",
  "EXPECTED BEHAVIOR"
)
SOL_set <- c(
  "SOLUTION DISCUSSION",
  "SOLUTION USAGE"
)

# Per-id share of human-assigned labels that fall in each label group.
#
# A `human_labels` cell is treated either as the text of an R `c("A", "B")`
# call (split into its quoted items) or as one single label (kept as is).
# Rows with a missing `human_labels` are dropped, every remaining cell is
# expanded to one row per label, and the shares are computed per `id`:
#   human_BE_prop  - labels in BE_set
#   human_SOL_prop - labels in SOL_set
#   human_VR_prop  - labels in BE_set, SOL_set, or
#                    "INVESTIGATION AND EXPLORATION"
# Ids left without any usable label do not appear in the result.
human_list_unified_df <- unified_df |>
  filter(!is.na(human_labels)) |>
  mutate(
    list_human_labels = map(human_labels, \(raw) {
      if (str_detect(raw, "^\\s*c\\(")) {
        # Pull the quoted items out of the `c(...)` text rather than running
        # eval(parse()) on it, which would execute whatever code the CSV cell
        # happens to contain. Backslash escapes inside the quotes are not
        # interpreted.
        quoted <- str_extract_all(raw, "\"[^\"]*\"|'[^']*'")[[1]]
        str_sub(quoted, 2L, -2L)
      } else {
        raw
      }
    })
  ) |>
  unnest(list_human_labels, keep_empty = TRUE) |>
  # read.csv() leaves blank character cells as "" rather than NA, so empty
  # labels are dropped too (as the OLMO pipeline does); otherwise an
  # unlabelled row would count as one tag and get proportions of 0.
  filter(
    !is.na(list_human_labels),
    list_human_labels != "",
    list_human_labels != "NA"
  ) |>
  group_by(id) |>
  # After the filter every group holds at least one non-missing label, so a
  # plain mean of the membership test is the proportion.
  summarise(
    human_BE_prop = mean(list_human_labels %in% BE_set),
    human_SOL_prop = mean(list_human_labels %in% SOL_set),
    human_VR_prop = mean(
      list_human_labels %in%
        c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION")
    ),
    .groups = "drop"
  )

# Label categories accepted as valid; any other label is treated as invalid
# by the code that checks against this vector.
valid_categories <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)

# Sorted vector of the distinct OLMO labels found in `olmo_sentence_labels`
# after cleaning and normalisation; printed for inspection.
#
# Labels are the quoted strings inside each cell. Aliases are folded onto the
# spelling used in `valid_categories`, and anything still not in
# `valid_categories` is reported as "INVALID LABEL".
unique_olmo_labels <- unified_df |>
  mutate(
    olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")
  ) |>
  mutate(
    # The lookarounds leave the quote characters unconsumed, so the text
    # between one item's closing quote and the next item's opening quote
    # (e.g. ", ") can be matched as well; the clean-up below reduces such
    # separator matches to "" and drops them.
    list_olmo_labels = str_extract_all(
      olmo_sentence_labels,
      "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
    )
  ) |>
  unnest(list_olmo_labels, keep_empty = TRUE) |>
  # Drop empty / NA / literal "NA" entries.
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) |>
  # Keep only letters and spaces, then collapse runs of whitespace.
  mutate(
    list_olmo_labels = str_squish(
      str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )
  ) |>
  filter(list_olmo_labels != "") |>
  # Normalise to "WORKAROUNDS", the spelling listed in `valid_categories`;
  # normalising to "WORKAROUND" made every workaround label fail the
  # validity check below and show up as "INVALID LABEL".
  mutate(
    olmo_label = ifelse(
      list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND"),
      "WORKAROUNDS",
      list_olmo_labels
    )
  ) |>
  mutate(
    olmo_label = ifelse(
      olmo_label %in% c("BUG REPORT", "BUG REPRODUCTION"),
      "BUG REPRODUCTION",
      olmo_label
    )
  ) |>
  mutate(
    olmo_label = ifelse(
      olmo_label %in% valid_categories,
      olmo_label,
      "INVALID LABEL"
    )
  ) |>
  pull(olmo_label) |>
  unique() |>
  sort()

print(unique_olmo_labels)

# Per-id share of OLMO labels that fall in each label group.
#
# Labels are the quoted strings inside each `olmo_sentence_labels` cell,
# cleaned and normalised, with anything outside `valid_categories` replaced
# by "INVALID LABEL" (such labels still count in the denominator).
#   olmo_BE_prop  - labels in BE_set
#   olmo_SOL_prop - labels in SOL_set
#   olmo_VR_prop  - labels in BE_set, SOL_set, or
#                   "INVESTIGATION AND EXPLORATION"
# Ids left without any usable label do not appear in the result.
olmo_list_unified_df <- unified_df |>
  mutate(
    olmo_sentence_labels = tidyr::replace_na(olmo_sentence_labels, "")
  ) |>
  mutate(
    # The lookarounds leave the quote characters unconsumed, so the text
    # between one item's closing quote and the next item's opening quote
    # (e.g. ", ") can be matched as well; the clean-up below reduces such
    # separator matches to "" and drops them.
    list_olmo_labels = str_extract_all(
      olmo_sentence_labels,
      "(?<=')[^']+(?=')|(?<=\")[^\"]+(?=\")"
    )
  ) |>
  unnest(list_olmo_labels, keep_empty = TRUE) |>
  # Drop empty / NA / literal "NA" entries.
  filter(
    !is.na(list_olmo_labels),
    list_olmo_labels != "",
    list_olmo_labels != "NA"
  ) |>
  # Keep only letters and spaces, then collapse runs of whitespace.
  mutate(
    list_olmo_labels = str_squish(
      str_replace_all(list_olmo_labels, "[^\\p{L} ]+", "")
    )
  ) |>
  filter(list_olmo_labels != "") |>
  # Normalise to "WORKAROUNDS", the spelling listed in `valid_categories`;
  # normalising to "WORKAROUND" made every workaround label fail the
  # validity check below.
  mutate(
    olmo_label = ifelse(
      list_olmo_labels %in% c("WORKAROUNDS", "WORKAROUND"),
      "WORKAROUNDS",
      list_olmo_labels
    )
  ) |>
  mutate(
    olmo_label = ifelse(
      olmo_label %in% c("BUG REPORT", "BUG REPRODUCTION"),
      "BUG REPRODUCTION",
      olmo_label
    )
  ) |>
  mutate(
    olmo_label = ifelse(
      olmo_label %in% valid_categories,
      olmo_label,
      "INVALID LABEL"
    )
  ) |>
  group_by(id) |>
  # Every group holds at least one non-missing label at this point, so a
  # plain mean of the membership test is the proportion.
  summarise(
    olmo_BE_prop = mean(olmo_label %in% BE_set),
    olmo_SOL_prop = mean(olmo_label %in% SOL_set),
    olmo_VR_prop = mean(
      olmo_label %in% c(BE_set, SOL_set, "INVESTIGATION AND EXPLORATION")
    ),
    .groups = "drop"
  )

# Attach the per-id OLMO proportions to every row of the unified data;
# rows whose id has no OLMO summary get NA in the added columns.
first_join <- left_join(unified_df, olmo_list_unified_df, by = "id")

# Then attach the per-id human proportions in the same way.
second_join <- left_join(first_join, human_list_unified_df, by = "id")

library(ggdist)

# Scatter of the OLMO VR proportion (x) against the human VR proportion (y),
# one panel per `source`, with a y = x reference line and a smoother.
# The title and axis labels name the VR proportions because those are the
# columns being plotted; they previously said "solution %".
# NOTE(review): if the solution proportions were the intended comparison,
# map `olmo_SOL_prop` / `human_SOL_prop` instead and restore the old labels.
ggplot(
  second_join,
  aes(x = olmo_VR_prop, y = human_VR_prop, ymin = 0, ymax = 1)
) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  geom_abline() +
  geom_smooth() +
  xlim(0, 1) +
  ylim(0, 1) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "OLMO VR tag proportion vs. human VR tag proportion",
    x = "OLMO VR tag proportion",
    y = "Human VR tag proportion"
  )

# Scatter of `modal_verbs` (x, limited to 0-20) against `PC1` (y), coloured
# by `comment_type`, one panel per `source`.
# The title and y label name PC1 because that is the column mapped to y;
# they previously said "PC3".
# NOTE(review): if PC3 was the intended axis, change the `y` mapping instead.
# NOTE(review): only `color` is mapped, so the fill scale below has nothing
# to act on; `scale_color_viridis_d()` may have been intended - confirm.
ggplot(
  second_join,
  aes(x = modal_verbs, y = PC1, color = comment_type)
) +
  facet_grid(~source, scales = "fixed") +
  geom_point(shape = 19, alpha = 0.3, size = 2) +
  scale_fill_viridis_d() +
  xlim(0, 20) +
  theme_minimal() +
  labs(
    title = "Modal Verbs v. PC1",
    x = "modal verb count",
    y = "PC1"
  )

# Boxplots of `modal_verbs` for each `comment_type`, filled by `isAuthorWMF`,
# one panel per `source`.
ggplot(
  second_join,
  aes(
    x = as.factor(comment_type), # x-axis grouping
    y = modal_verbs,
    fill = isAuthorWMF
  )
) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  # Zoom the view to 0-3 without discarding data: `ylim(0, 3)` removes every
  # observation above 3 before the boxplot statistics are computed, which
  # changes the medians, quartiles and whiskers that get drawn.
  coord_cartesian(ylim = c(0, 3)) +
  theme_minimal() +
  labs(
    title = "Boxplot of modal verb usage",
    x = "Comment_type",
    y = "Count of modal verbs",
    fill = "isAuthorWMF?"
  )