preparing DSL modeling, looking at OLMO category data
This commit is contained in:
parent
99c702fe20
commit
77fc3ec541
23
dsl/dsl_pp_power.R
Normal file
23
dsl/dsl_pp_power.R
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
library(stringr)
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(purrr)
|
||||||
|
# TODO
|
||||||
|
# join the label data with the existing data from 0714 master
|
||||||
|
# download and set up DSL library
|
||||||
|
# figure out how to use the sentence-level variables
|
||||||
|
# get the categorical variables encoded as integers, then wrapped as factors
|
||||||
|
# figure out power at 200, 400, 500, 750, and 1000
|
||||||
|
# joining sentence-level category rows with their AuthorPHID from the main data
|
||||||
|
# Input locations: the sentence-level OLMO category labels and the master
# discussion data.
olmo_categorization_csv <- "~/dsl/inter_090725_sent_cats.csv"
main_csv <- "~/p2/071425_master_discussion_data.csv"

# Load both CSVs, treating the first row of each file as column names.
sl_olmo_categorization_df <- read.csv(
  file = olmo_categorization_csv,
  header = TRUE
)
main_df <- read.csv(file = main_csv, header = TRUE)
|
||||||
|
|
||||||
|
# Attach the AuthorPHID column from the master data to the sentence-level
# category rows, matching on `id`. As a left join, every category row is kept;
# rows whose `id` has no match in the master data get NA for AuthorPHID.
joined_df <- sl_olmo_categorization_df %>%
  left_join(
    select(main_df, id, AuthorPHID),
    by = "id"
  )
|
734079
dsl/inter_090725_sent_cats.csv
Normal file
734079
dsl/inter_090725_sent_cats.csv
Normal file
File diff suppressed because one or more lines are too long
@ -1,18 +0,0 @@
|
|||||||
1. SSH tunnel from your workstation using the following command:
|
|
||||||
|
|
||||||
ssh -N -L 8787:n3441:47269 mjilg@klone.hyak.uw.edu
|
|
||||||
|
|
||||||
and point your web browser to http://localhost:8787
|
|
||||||
|
|
||||||
2. log in to RStudio Server using the following credentials:
|
|
||||||
|
|
||||||
user: mjilg
|
|
||||||
password: 9Qgk9UkRdmKalTKyDmH4
|
|
||||||
|
|
||||||
When done using RStudio Server, terminate the job by:
|
|
||||||
|
|
||||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
|
||||||
2. Issue the following command on the login node:
|
|
||||||
|
|
||||||
scancel -f 28911380
|
|
||||||
[2025-09-05T14:55:26.103] error: *** JOB 28911380 ON n3441 CANCELLED AT 2025-09-05T14:55:26 DUE TO TIME LIMIT ***
|
|
81
p2/quest/olmo_cat_EDA.R
Normal file
81
p2/quest/olmo_cat_EDA.R
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
library(stringr)
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(purrr)
|
||||||
|
|
||||||
|
# Labels accepted as valid sentence categories. Further down, any cleaned label
# that is in neither this vector nor `url_extensions` is bucketed as
# "Nonspecified Label".
information_typology <- c(
  "EXPECTED BEHAVIOR",
  "MOTIVATION",
  "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION",
  "INVESTIGATION AND EXPLORATION",
  "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT",
  "TASK PROGRESS",
  "TESTING",
  "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS",
  "SOLUTION USAGE",
  "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT",
  "ACTION ON ISSUE",
  "SOCIAL CONVERSATION"
)

# URL placeholder labels that are accepted alongside the typology labels.
url_extensions <- c("GERRIT_URL", "URL")
|
||||||
|
|
||||||
|
# Location of the OLMO categorization CSV; its first row holds column names.
olmo_categorization_csv <- "~/p2/quest/090425_olmo_batched_categorized.csv"
olmo_categorization_df <- read.csv(
  file = olmo_categorization_csv,
  header = TRUE
)
|
||||||
|
|
||||||
|
# Build a list column holding, for each row, every run of non-quote characters
# that sits between two single quotes in `sentence_categories`, then discard
# the pieces that are empty once surrounding whitespace is trimmed.
olmo_categorization_df <- olmo_categorization_df |>
  mutate(
    sentence_categories_list = sentence_categories |>
      str_extract_all("(?<=')[^']+(?=')") |>
      map(\(pieces) pieces[str_trim(pieces) != ""])
  )
|
||||||
|
|
||||||
|
# One row per extracted label: expand the list column, trim each label, and
# drop bare "," entries.
# NOTE(review): the "," entries are presumably the separators between adjacent
# quoted labels, which the extraction regex also captures; confirm against the
# raw `sentence_categories` format.
categories_df <- olmo_categorization_df |>
  unnest(cols = sentence_categories_list) |>
  mutate(sent_cat_label = str_trim(sentence_categories_list)) |>
  filter(sent_cat_label != ",")
|
||||||
|
|
||||||
|
# cleaning
# Step 1 normalises known label variants. The four rules cannot overlap: the
# first needs "URL" in the label and the other three are exact matches on
# strings without "URL", so a single case_when gives the same result as
# applying them one after another.
# Step 2 keeps labels found in the typology or URL sets and buckets everything
# else as "Nonspecified Label".
categories_df <- categories_df |>
  mutate(
    sent_cat_label = case_when(
      # any label mentioning URL, except the Gerrit-specific ones, becomes "URL"
      str_detect(sent_cat_label, "URL") &
        !str_detect(sent_cat_label, "GERRIT_URL") ~ "URL",
      sent_cat_label == "WORKAROUND" ~ "WORKAROUNDS",
      sent_cat_label == "CATEGORY: SOLUTION DISCUSSION" ~
        "SOLUTION DISCUSSION",
      sent_cat_label == "TYPE: ISSUE CONTENT MANAGEMENT" ~
        "ISSUE CONTENT MANAGEMENT",
      TRUE ~ sent_cat_label
    )
  ) |>
  mutate(
    final_cat_label = if_else(
      sent_cat_label %in% c(information_typology, url_extensions),
      sent_cat_label,
      "Nonspecified Label"
    )
  )
|
||||||
|
|
||||||
|
|
||||||
|
# Frequency of each final label; NA gets its own cell only if any are present.
table(categories_df[["final_cat_label"]], useNA = "ifany")

# Save the cleaned sentence-level labels; dsl/dsl_pp_power.R reads this file.
write.csv(
  x = categories_df,
  file = "~/dsl/inter_090725_sent_cats.csv",
  row.names = FALSE
)
|
||||||
|
|
||||||
|
library(forcats)
|
||||||
|
# Plot data: for every comment type, the count of each final label and its
# share (in percent) of that comment type's labels.
# NOTE(review): fct_reorder runs inside a grouped mutate, so each comment type
# produces its own level order before the groups are combined; the resulting
# factor may not be sorted by percent within every facet. Confirm the plot
# ordering is what is intended.
plot_df <- categories_df |>
  group_by(comment_type, final_cat_label) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(comment_type) |>
  mutate(percent = n / sum(n) * 100) |>
  mutate(
    final_cat_label = fct_reorder(final_cat_label, percent, .desc = TRUE)
  ) |>
  ungroup()
|
||||||
|
|
||||||
|
# Bar chart of label shares, one facet per comment type. Each facet has its
# own x axis (free_x), and each bar is annotated with its percentage to one
# decimal place.
ggplot(
  plot_df,
  aes(x = final_cat_label, y = percent, fill = final_cat_label)
) +
  geom_col() +
  geom_text(
    aes(label = sprintf("%.1f%%", percent)),
    vjust = -0.2,
    size = 3
  ) +
  facet_wrap(vars(comment_type), scales = "free_x") +
  labs(
    x = "Label",
    y = "%",
    title = "Distribution of OLMO Category Labels by Comment Type"
  ) +
  theme_minimal() +
  # tilt the long category names so they stay readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user