preparing DSL modeling, looking at OLMO category data

2025-09-07 13:21:45 -07:00 · 2025-09-07 13:21:45 -07:00 · 77fc3ec541
commit 77fc3ec541
parent 99c702fe20
4 changed files with 734183 additions and 18 deletions
--- a/dsl/dsl_pp_power.R
+++ b/dsl/dsl_pp_power.R
@ -0,0 +1,23 @@
 library(tidyverse)
 library(stringr)
 library(tidyr)
 library(dplyr)
 library(purrr)
 # TODO
    # join the label data with the existing data from 0714 master
    # download and set up DSL library 
    # figure out how to use the sentence-level variables 
    # get the categorical variables encoded as integers, then wrapped as factors
    # figure out power at 200, 400, 500, 750, and 1000 
 # Join the sentence-level OLMO category rows with their author identifier
 # (AuthorPHID) taken from the master discussion data, matching on `id`.
 olmo_categorization_csv <- "~/dsl/inter_090725_sent_cats.csv"
 main_csv <- "~/p2/071425_master_discussion_data.csv"
 sl_olmo_categorization_df <- read.csv(olmo_categorization_csv, header = TRUE)
 main_df <- read.csv(main_csv, header = TRUE)
 # Left join: every sentence-level row is kept; only the join key and the
 # author column are pulled in from the master data.
 joined_df <- left_join(
   x = sl_olmo_categorization_df,
   y = select(main_df, id, AuthorPHID),
   by = "id"
 )
--- a/dsl/inter_090725_sent_cats.csv
+++ b/dsl/inter_090725_sent_cats.csv
--- a/mgaughan-rstudio-server_28911380.out
+++ b/mgaughan-rstudio-server_28911380.out
@ -1,18 +0,0 @@
 1. SSH tunnel from your workstation using the following command:
   ssh -N -L 8787:n3441:47269 mjilg@klone.hyak.uw.edu
   and point your web browser to http://localhost:8787
 2. log in to RStudio Server using the following credentials:
   user: mjilg
   password: 9Qgk9UkRdmKalTKyDmH4
 When done using RStudio Server, terminate the job by:
 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
 2. Issue the following command on the login node:
      scancel -f 28911380
 [2025-09-05T14:55:26.103] error: *** JOB 28911380 ON n3441 CANCELLED AT 2025-09-05T14:55:26 DUE TO TIME LIMIT ***
--- a/p2/quest/olmo_cat_EDA.R
+++ b/p2/quest/olmo_cat_EDA.R
@ -0,0 +1,81 @@
 library(tidyverse)
 library(stringr)
 library(tidyr)
 library(dplyr)
 library(purrr)
 # Labels accepted as-is when deriving the final category label; anything
 # outside these two vectors is bucketed as unspecified further down.
 information_typology <- c(
   "EXPECTED BEHAVIOR",
   "MOTIVATION",
   "OBSERVED BUG BEHAVIOR",
   "BUG REPRODUCTION",
   "INVESTIGATION AND EXPLORATION",
   "SOLUTION DISCUSSION",
   "CONTRIBUTION AND COMMITMENT",
   "TASK PROGRESS",
   "TESTING",
   "FUTURE PLAN",
   "POTENTIAL NEW ISSUES AND REQUESTS",
   "SOLUTION USAGE",
   "WORKAROUNDS",
   "ISSUE CONTENT MANAGEMENT",
   "ACTION ON ISSUE",
   "SOCIAL CONVERSATION"
 )
 # Extra URL-style labels that are also accepted.
 url_extensions <- c("GERRIT_URL", "URL")
 # Load the batched OLMO categorizations and turn the `sentence_categories`
 # string into a list column of labels.
 olmo_categorization_csv <- "~/p2/quest/090425_olmo_batched_categorized.csv"
 olmo_categorization_df <- read.csv(olmo_categorization_csv, header = TRUE) |>
   mutate(
     # The regex pulls every run of non-quote characters that sits between two
     # single quotes; matches that are empty after trimming are discarded.
     sentence_categories_list = sentence_categories |>
       str_extract_all("(?<=')[^']+(?=')") |>
       map(\(labels) labels[str_trim(labels) != ""])
   )
 # One row per extracted label, then label clean-up.
 categories_df <- olmo_categorization_df |>
   unnest(sentence_categories_list) |>
   mutate(sent_cat_label = str_trim(sentence_categories_list)) |>
   # Drop bare "," entries -- presumably separator fragments left over from
   # the quote-based extraction; confirm against the raw data.
   filter(sent_cat_label != ",") |>
   mutate(
     # cleaning: collapse known label variants onto their canonical spelling.
     # The branches are mutually exclusive, so their order does not matter.
     sent_cat_label = case_when(
       str_detect(sent_cat_label, "URL") &
         !str_detect(sent_cat_label, "GERRIT_URL") ~ "URL",
       sent_cat_label == "WORKAROUND" ~ "WORKAROUNDS",
       sent_cat_label == "CATEGORY: SOLUTION DISCUSSION" ~ "SOLUTION DISCUSSION",
       sent_cat_label == "TYPE: ISSUE CONTENT MANAGEMENT" ~ "ISSUE CONTENT MANAGEMENT",
       TRUE ~ sent_cat_label
     ),
     # Anything outside the typology / URL labels is bucketed together.
     final_cat_label = if_else(
       sent_cat_label %in% c(information_typology, url_extensions),
       sent_cat_label,
       "Nonspecified Label"
     )
   )
 # Frequency of each final label (NA shown only if present), printed as a
 # top-level expression.
 categories_df |>
   pull(final_cat_label) |>
   table(useNA = "ifany")
 # Persist the sentence-level labels as the intermediate CSV for the DSL work.
 write.csv(
   x = categories_df,
   file = "~/dsl/inter_090725_sent_cats.csv",
   row.names = FALSE
 )
 library(forcats)
 # Share of each final label within every comment type, as a percentage.
 plot_df <- categories_df |>
   count(comment_type, final_cat_label) |>
   group_by(comment_type) |>
   mutate(
     percent = n / sum(n) * 100,
     # NOTE(review): the reorder runs once per comment_type group, so the
     # resulting factor levels may not sort each facet independently -- verify
     # the axis order in the rendered plot.
     final_cat_label = fct_reorder(final_cat_label, percent, .desc = TRUE)
   ) |>
   ungroup()
 # Bar chart of label percentages, one facet per comment type, with the value
 # printed just above each bar.
 ggplot(
   data = plot_df,
   mapping = aes(x = final_cat_label, y = percent, fill = final_cat_label)
 ) +
   geom_col() +
   geom_text(aes(label = sprintf("%.1f%%", percent)), vjust = -0.2, size = 3) +
   facet_wrap(~ comment_type, scales = "free_x") +
   labs(
     x = "Label",
     y = "%",
     title = "Distribution of OLMO Category Labels by Comment Type"
   ) +
   theme_minimal() +
   # Must come after theme_minimal() so the rotated axis text is not reset.
   theme(axis.text.x = element_text(angle = 45, hjust = 1))