1
0

preparing DSL modeling, looking at OLMO category data

This commit is contained in:
Matthew Gaughan 2025-09-07 13:21:45 -07:00
parent 99c702fe20
commit 77fc3ec541
4 changed files with 734183 additions and 18 deletions

23
dsl/dsl_pp_power.R Normal file
View File

@ -0,0 +1,23 @@
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)
# TODO
# join the label data with the existing data from 0714 master
# download and set up DSL library
# figure out how to use the sentence-level variables
# get the categorical variables encoded as integers, then wrapped as factors
# figure out power at 200, 400, 500, 750, and 1000
# join the sentence-level category labels with their author IDs (AuthorPHID) from the master data
# Locations of the sentence-level OLMO category labels and of the master
# discussion data.
olmo_categorization_csv <- "~/dsl/inter_090725_sent_cats.csv"
main_csv <- "~/p2/071425_master_discussion_data.csv"

# Load both tables; the first line of each CSV is read as column names.
sl_olmo_categorization_df <- read.csv(olmo_categorization_csv, header = TRUE)
main_df <- read.csv(main_csv, header = TRUE)

# Attach AuthorPHID to every labelled row by matching on `id`. This is a left
# join, so labelled rows without a match in main_df are kept (AuthorPHID = NA).
# NOTE(review): if `id` is not unique in main_df, matching rows are duplicated
# in the result; confirm uniqueness before modelling.
joined_df <- sl_olmo_categorization_df %>%
  left_join(select(main_df, id, AuthorPHID), by = "id")

734079
dsl/inter_090725_sent_cats.csv Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,18 +0,0 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3441:47269 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: 9Qgk9UkRdmKalTKyDmH4
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 28911380
[2025-09-05T14:55:26.103] error: *** JOB 28911380 ON n3441 CANCELLED AT 2025-09-05T14:55:26 DUE TO TIME LIMIT ***

81
p2/quest/olmo_cat_EDA.R Normal file
View File

@ -0,0 +1,81 @@
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(purrr)
# Category labels accepted as-is when assigning the final label; any label
# outside this vector and `url_extensions` is bucketed as "Nonspecified Label"
# further down in this script.
information_typology <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)

# URL-type labels that are also accepted as final labels.
url_extensions <- c("GERRIT_URL", "URL")
# Load the batched OLMO categorisation output.
olmo_categorization_csv <- "~/p2/quest/090425_olmo_batched_categorized.csv"
olmo_categorization_df <- read.csv(olmo_categorization_csv, header = TRUE)

# Build a list column holding, for each row, every run of non-quote characters
# that sits between two single quotes in `sentence_categories`, then discard
# the runs that are empty once surrounding whitespace is trimmed.
# NOTE(review): presumably `sentence_categories` is a stringified list such as
# "['A', 'B']"; in that case the ", " between items is also captured here and
# is only removed by the later `!= ","` filter. Confirm against the CSV.
olmo_categorization_df <- olmo_categorization_df |>
  mutate(
    sentence_categories_list = str_extract_all(
      sentence_categories,
      "(?<=')[^']+(?=')"
    ),
    sentence_categories_list = map(
      sentence_categories_list,
      \(labels) labels[str_trim(labels) != ""]
    )
  )
# One row per extracted label:
#   1. expand the list column so each label gets its own row,
#   2. trim whitespace and drop entries that are just a comma (presumably list
#      separators picked up by the extraction; confirm),
#   3. normalise known label variants,
#   4. keep labels found in `information_typology` or `url_extensions` as the
#      final label and bucket everything else as "Nonspecified Label".
categories_df <- olmo_categorization_df |>
  unnest(sentence_categories_list) |>
  mutate(sent_cat_label = str_trim(sentence_categories_list)) |>
  filter(sent_cat_label != ",") |>
  mutate(
    # The first matching rule wins. The rules cannot overlap: the URL rule
    # needs the substring "URL", which none of the exact-match strings contain.
    sent_cat_label = case_when(
      str_detect(sent_cat_label, "URL") &
        !str_detect(sent_cat_label, "GERRIT_URL") ~ "URL",
      sent_cat_label == "WORKAROUND" ~ "WORKAROUNDS",
      sent_cat_label == "CATEGORY: SOLUTION DISCUSSION" ~ "SOLUTION DISCUSSION",
      sent_cat_label == "TYPE: ISSUE CONTENT MANAGEMENT" ~
        "ISSUE CONTENT MANAGEMENT",
      TRUE ~ sent_cat_label
    ),
    final_cat_label = if_else(
      sent_cat_label %in% c(information_typology, url_extensions),
      sent_cat_label,
      "Nonspecified Label"
    )
  )

# Frequency of each final label; an NA bucket is shown only if NAs are present.
table(categories_df$final_cat_label, useNA = "ifany")

# Save the labelled rows (without row names) for the DSL modelling script.
write.csv(categories_df, "~/dsl/inter_090725_sent_cats.csv", row.names = FALSE)
library(forcats)
# Count rows per (comment_type, final_cat_label), convert the counts to a
# percentage of each comment_type's total, and reorder the label factor by
# descending percentage.
# NOTE(review): fct_reorder() runs inside the comment_type grouping, so level
# order is computed per group and then combined into a single factor; confirm
# that the resulting bar order in each facet is the one intended.
plot_df <- categories_df |>
  group_by(comment_type, final_cat_label) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(comment_type) |>
  mutate(
    percent = n / sum(n) * 100,
    final_cat_label = fct_reorder(final_cat_label, percent, .desc = TRUE)
  ) |>
  ungroup()
# Bar chart of label share (%) per category, one facet per comment type, with
# the percentage printed just above each bar and x-axis labels tilted 45
# degrees. Each facet gets its own x scale.
ggplot(
  plot_df,
  aes(x = final_cat_label, y = percent, fill = final_cat_label)
) +
  geom_col() +
  geom_text(
    aes(label = sprintf("%.1f%%", percent)),
    vjust = -0.2,
    size = 3
  ) +
  facet_wrap(~ comment_type, scales = "free_x") +
  labs(
    x = "Label",
    y = "%",
    title = "Distribution of OLMO Category Labels by Comment Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))