preparing DSL modeling, looking at OLMO category data

2025-09-07 13:21:45 -07:00 · 2025-09-07 13:21:45 -07:00 · 77fc3ec541
commit 77fc3ec541
parent 99c702fe20
4 changed files with 734183 additions and 18 deletions
--- a/dsl/dsl_pp_power.R
+++ b/dsl/dsl_pp_power.R
@ -0,0 +1,23 @@
+library(tidyverse)
+library(stringr)
+library(tidyr)
+library(dplyr)
+library(purrr)
+# TODO
+    # join the label data with the existing data from 0714 master
+    # download and set up DSL library 
+    # figure out how to use the sentence-level variables 
+    # get the categorical variables encoded as integers, then wrapped as factors
+    # figure out power at 200, 400, 500, 750, and 1000 
+# Attach AuthorPHID from the 0714 master discussion data to every row of the
+# sentence-category table, matching on the shared `id` column.
+olmo_categorization_csv <- "~/dsl/inter_090725_sent_cats.csv"
+main_csv <- "~/p2/071425_master_discussion_data.csv"
+
+# read.csv() treats the first row as column names by default.
+sl_olmo_categorization_df <- read.csv(olmo_categorization_csv)
+main_df <- read.csv(main_csv)
+
+# A left join keeps every sentence-category row whether or not the master
+# table has a matching id; only the key and AuthorPHID are taken from it.
+joined_df <- sl_olmo_categorization_df %>%
+  left_join(select(main_df, id, AuthorPHID), by = "id")
--- a/dsl/inter_090725_sent_cats.csv
+++ b/dsl/inter_090725_sent_cats.csv
--- a/mgaughan-rstudio-server_28911380.out
+++ b/mgaughan-rstudio-server_28911380.out
@ -1,18 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-   ssh -N -L 8787:n3441:47269 mjilg@klone.hyak.uw.edu
-
-   and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-   user: mjilg
-   password: 9Qgk9UkRdmKalTKyDmH4
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-      scancel -f 28911380
-[2025-09-05T14:55:26.103] error: *** JOB 28911380 ON n3441 CANCELLED AT 2025-09-05T14:55:26 DUE TO TIME LIMIT ***
--- a/p2/quest/olmo_cat_EDA.R
+++ b/p2/quest/olmo_cat_EDA.R
@ -0,0 +1,81 @@
+library(tidyverse)
+library(stringr)
+library(tidyr)
+library(dplyr)
+library(purrr)
+
+# Accepted sentence labels. A cleaned label found in either vector is kept
+# verbatim when `final_cat_label` is derived later in this script.
+information_typology <- c(
+  "EXPECTED BEHAVIOR",
+  "MOTIVATION",
+  "OBSERVED BUG BEHAVIOR",
+  "BUG REPRODUCTION",
+  "INVESTIGATION AND EXPLORATION",
+  "SOLUTION DISCUSSION",
+  "CONTRIBUTION AND COMMITMENT",
+  "TASK PROGRESS",
+  "TESTING",
+  "FUTURE PLAN",
+  "POTENTIAL NEW ISSUES AND REQUESTS",
+  "SOLUTION USAGE",
+  "WORKAROUNDS",
+  "ISSUE CONTENT MANAGEMENT",
+  "ACTION ON ISSUE",
+  "SOCIAL CONVERSATION"
+)
+# URL-type labels accepted alongside the typology above.
+url_extensions <- c("GERRIT_URL", "URL")
+
+# Load the batched OLMO output and parse each row's `sentence_categories`
+# string into a list-column holding the tokens found between single quotes.
+olmo_categorization_csv <- "~/p2/quest/090425_olmo_batched_categorized.csv"
+
+# The lookaround pattern matches any run of non-quote characters that sits
+# between two single quotes, so text between adjacent quoted labels is matched
+# too; tokens that are empty after trimming are discarded here.
+olmo_categorization_df <- read.csv(olmo_categorization_csv) |>
+  mutate(
+    sentence_categories_list = lapply(
+      str_extract_all(sentence_categories, "(?<=')[^']+(?=')"),
+      \(tokens) tokens[str_trim(tokens) != ""]
+    )
+  )
+
+# Explode the list-column to one row per token, store the trimmed token in
+# `sent_cat_label`, and drop tokens that are nothing but a comma (presumably
+# the separators the quote-based pattern picks up between labels).
+categories_df <- unnest(olmo_categorization_df, cols = sentence_categories_list)
+categories_df$sent_cat_label <- str_trim(categories_df$sentence_categories_list)
+categories_df <- filter(categories_df, sent_cat_label != ",")
+
+# Cleaning: map known label variants onto their canonical spelling, then
+# bucket every label outside the accepted vocabulary as "Nonspecified Label".
+categories_df <- categories_df |>
+  mutate(
+    sent_cat_label = case_when(
+      # Any label mentioning URL, except the Gerrit variant, becomes plain "URL".
+      str_detect(sent_cat_label, "URL") &
+        !str_detect(sent_cat_label, "GERRIT_URL") ~ "URL",
+      sent_cat_label == "WORKAROUND" ~ "WORKAROUNDS",
+      sent_cat_label == "CATEGORY: SOLUTION DISCUSSION" ~
+        "SOLUTION DISCUSSION",
+      sent_cat_label == "TYPE: ISSUE CONTENT MANAGEMENT" ~
+        "ISSUE CONTENT MANAGEMENT",
+      TRUE ~ sent_cat_label
+    ),
+    # mutate() evaluates in order, so this sees the cleaned label from above.
+    final_cat_label = if_else(
+      sent_cat_label %in% c(information_typology, url_extensions),
+      sent_cat_label,
+      "Nonspecified Label"
+    )
+  )
+
+
+# Frequency of each final label; an NA bucket is shown only if NAs occur.
+table(categories_df[["final_cat_label"]], useNA = "ifany")
+
+# Persist the sentence-level table; dsl/dsl_pp_power.R reads this same path.
+write.csv(
+  x = categories_df,
+  file = "~/dsl/inter_090725_sent_cats.csv",
+  row.names = FALSE
+)
+
+library(forcats)
+# Share of each final label within every comment type, in percent. The result
+# is ungrouped so later steps do not inherit the per-type grouping.
+plot_df <- categories_df |>
+  count(comment_type, final_cat_label, name = "n") |>
+  group_by(comment_type) |>
+  mutate(percent = n / sum(n) * 100) |>
+  ungroup()
+
+# Order the label factor once, on ungrouped data, by each label's summed
+# percent across comment types (largest first). Calling fct_reorder() inside
+# group_by(comment_type) builds a separate level set per group; when dplyr
+# combines them the final level order follows group order rather than
+# percent, so the intended sort is not what ends up on the axis.
+plot_df <- plot_df |>
+  mutate(
+    final_cat_label = fct_reorder(
+      final_cat_label, percent,
+      .fun = sum, .desc = TRUE
+    )
+  )
+
+# Faceted bar chart: one panel per comment type, one bar per label, each bar
+# annotated with its percent to one decimal place.
+ggplot(plot_df, aes(x = final_cat_label, y = percent, fill = final_cat_label)) +
+  geom_col() +
+  geom_text(aes(label = sprintf("%.1f%%", percent)), vjust = -0.2, size = 3) +
+  # free_x lets each panel show only the labels it actually contains.
+  facet_wrap(vars(comment_type), scales = "free_x") +
+  theme_minimal() +
+  labs(
+    x = "Label",
+    y = "%",
+    title = "Distribution of OLMO Category Labels by Comment Type"
+  ) +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+