82 lines
3.1 KiB
R
82 lines
3.1 KiB
R
library(tidyverse)
|
|
library(stringr)
|
|
library(tidyr)
|
|
library(dplyr)
|
|
library(purrr)
|
|
|
|
# Canonical information-type labels. A sentence label is accepted as-is only
# when it appears in this vector or in `url_extensions`.
information_typology <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)

# URL placeholder labels that are accepted alongside the typology labels.
url_extensions <- c("GERRIT_URL", "URL")
|
|
|
|
# Path of the input CSV that is loaded immediately below.
olmo_categorization_csv <- "~/p2/quest/090425_olmo_batched_categorized.csv"

# Read the file into a data frame, treating its first row as column names.
olmo_categorization_df <- read.csv(
  file = olmo_categorization_csv,
  header = TRUE
)
|
|
|
|
# Build `sentence_categories_list`, a list-column holding the pieces of
# `sentence_categories` that sit between single quotes. Each piece is a run of
# one or more non-quote characters; pieces that are empty after trimming are
# dropped.
# NOTE(review): the lookaround pattern can also match the separator text lying
# between two quoted labels (e.g. ", "); confirm that later steps remove it.
olmo_categorization_df <- olmo_categorization_df |>
  mutate(
    sentence_categories_list = map(
      str_extract_all(sentence_categories, "(?<=')[^']+(?=')"),
      \(fragments) fragments[str_trim(fragments) != ""]
    )
  )
|
|
|
|
# Expand the list-column so every extracted fragment gets its own row, store
# the trimmed fragment as `sent_cat_label`, and discard rows whose trimmed
# label is a bare comma. filter() also drops rows where the comparison is NA.
categories_df <- unnest(
  olmo_categorization_df,
  cols = sentence_categories_list
) |>
  mutate(sent_cat_label = str_trim(sentence_categories_list)) |>
  filter(sent_cat_label != ",")
|
|
|
|
#cleaning
|
|
# Normalise known label variants to their canonical spelling, then derive
# `final_cat_label`: the normalised label when it belongs to the accepted
# vocabulary, otherwise the catch-all "Nonspecified Label".
categories_df <- categories_df |>
  mutate(
    # Rules are checked top to bottom; the first match wins and labels that
    # match no rule are left unchanged.
    sent_cat_label = case_when(
      # Any label mentioning URL collapses to "URL", unless it mentions
      # GERRIT_URL, which is kept as written.
      str_detect(sent_cat_label, "URL") &
        !str_detect(sent_cat_label, "GERRIT_URL") ~ "URL",
      sent_cat_label == "WORKAROUND" ~ "WORKAROUNDS",
      sent_cat_label == "CATEGORY: SOLUTION DISCUSSION" ~
        "SOLUTION DISCUSSION",
      sent_cat_label == "TYPE: ISSUE CONTENT MANAGEMENT" ~
        "ISSUE CONTENT MANAGEMENT",
      TRUE ~ sent_cat_label
    ),
    # Computed from the already-normalised `sent_cat_label` above.
    final_cat_label = if_else(
      sent_cat_label %in% c(information_typology, url_extensions),
      sent_cat_label,
      "Nonspecified Label"
    )
  )
|
|
|
|
|
|
# Show how often each final label occurs; an NA bucket is listed only when
# missing values are present.
table(categories_df[["final_cat_label"]], useNA = "ifany")

# Save the cleaned rows to CSV without a row-name column.
write.csv(
  x = categories_df,
  file = "~/dsl/inter_090725_sent_cats.csv",
  row.names = FALSE
)
|
|
|
|
library(forcats)

# Plot data: count rows per (comment_type, final_cat_label) pair, convert each
# count to a percentage of its comment_type total, and turn the label into a
# factor reordered by descending percentage.
# NOTE(review): fct_reorder() runs separately inside each comment_type group,
# but the resulting column carries a single level order, so the facets may not
# each end up sorted independently -- confirm this is the intended ordering.
plot_df <- categories_df |>
  group_by(comment_type, final_cat_label) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(comment_type) |>
  mutate(
    percent = n / sum(n) * 100,
    final_cat_label = fct_reorder(final_cat_label, percent, .desc = TRUE)
  ) |>
  ungroup()
|
|
|
|
# Bar chart of label percentages, one facet per comment_type, with each bar
# annotated by its percentage to one decimal place.
ggplot(
  plot_df,
  aes(x = final_cat_label, y = percent, fill = final_cat_label)
) +
  # Bar heights come straight from `percent`; no counting is done here.
  geom_col() +
  geom_text(
    aes(label = sprintf("%.1f%%", percent)),
    vjust = -0.2,
    size = 3
  ) +
  # Each facet gets its own x axis.
  facet_wrap(vars(comment_type), scales = "free_x") +
  labs(
    x = "Label",
    y = "%",
    title = "Distribution of OLMO Category Labels by Comment Type"
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so the rotated axis text is not overwritten.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
|
|
|