# mw-lifecycle-analysis/p2/quest/olmo_cat_EDA.R

library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)

# Labels accepted as valid sentence categories.
information_typology <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)
# Link placeholder labels that are also treated as valid.
url_extensions <- c("GERRIT_URL", "URL")

# Read the categorization CSV; its first row supplies the column names.
olmo_categorization_csv <- "~/p2/quest/090425_olmo_batched_categorized.csv"
olmo_categorization_df <- read.csv(
  file = olmo_categorization_csv,
  header = TRUE
)

# Build a list column holding every run of non-quote characters that sits
# between two single quotes in `sentence_categories`. Because the quotes are
# matched with lookarounds (not consumed), the text between two quoted labels
# can be captured as well; fragments that are blank after trimming are dropped
# here.
olmo_categorization_df <- olmo_categorization_df |>
  mutate(
    sentence_categories_list = map(
      str_extract_all(sentence_categories, "(?<=')[^']+(?=')"),
      \(fragments) fragments[str_trim(fragments) != ""]
    )
  )

# Expand the list column to one row per extracted fragment, keep a trimmed copy
# in `sent_cat_label`, and drop fragments that are just a separating comma.
categories_df <- olmo_categorization_df |>
  unnest(cols = sentence_categories_list) |>
  mutate(sent_cat_label = str_trim(sentence_categories_list)) |>
  filter(sent_cat_label != ",")

# Label cleaning ----
# Normalise known label variants, then derive `final_cat_label`: the cleaned
# label when it belongs to the accepted vocabulary, "Nonspecified Label"
# otherwise.
categories_df <- categories_df |>
  mutate(
    sent_cat_label = case_when(
      # Anything mentioning URL, except GERRIT_URL labels, collapses to "URL".
      str_detect(sent_cat_label, "URL") &
        !str_detect(sent_cat_label, "GERRIT_URL") ~ "URL",
      sent_cat_label == "WORKAROUND" ~ "WORKAROUNDS",
      sent_cat_label == "CATEGORY: SOLUTION DISCUSSION" ~
        "SOLUTION DISCUSSION",
      sent_cat_label == "TYPE: ISSUE CONTENT MANAGEMENT" ~
        "ISSUE CONTENT MANAGEMENT",
      TRUE ~ sent_cat_label
    ),
    final_cat_label = if_else(
      sent_cat_label %in% c(information_typology, url_extensions),
      sent_cat_label,
      "Nonspecified Label"
    )
  )


# Frequency of each final label (an NA count is shown only if NAs occur).
table(categories_df[["final_cat_label"]], useNA = "ifany")

# Save the cleaned, one-row-per-label data without row names.
write.csv(
  categories_df,
  file = "~/dsl/inter_090725_sent_cats.csv",
  row.names = FALSE
)

library(forcats)
# Share of each final label within every comment type ----
# `percent` is the label's count divided by the total count of its
# `comment_type`, times 100.
plot_df <- categories_df |>
  group_by(comment_type, final_cat_label) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(comment_type) |>
  mutate(percent = n / sum(n) * 100) |>
  ungroup()

# A factor column has a single level order, so calling fct_reorder() inside a
# grouped mutate() cannot give each facet its own ordering. Instead:
#   * `final_cat_label` becomes a factor ordered by its summed percent across
#     comment types (used for the fill colours / legend);
#   * `facet_order_key` is unique per (label, comment_type) row and ordered by
#     descending percent, so with free x scales each facet shows its own bars
#     from largest to smallest.
plot_df <- plot_df |>
  mutate(
    final_cat_label = fct_reorder(
      final_cat_label, percent,
      .fun = sum, .desc = TRUE
    ),
    facet_order_key = fct_reorder(
      paste(final_cat_label, comment_type, sep = "___"),
      percent,
      .desc = TRUE
    )
  )

ggplot(
  plot_df,
  aes(x = facet_order_key, y = percent, fill = final_cat_label)
) +
  geom_col() +
  geom_text(aes(label = sprintf("%.1f%%", percent)), vjust = -0.2, size = 3) +
  facet_wrap(~ comment_type, scales = "free_x") +
  # Show only the label part of the key on the axis.
  scale_x_discrete(labels = \(key) sub("___.*$", "", key)) +
  theme_minimal() +
  labs(
    x = "Label",
    y = "%",
    title = "Distribution of OLMO Category Labels by Comment Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))