preparing DSL modeling, looking at OLMO category data
This commit is contained in:
parent
99c702fe20
commit
77fc3ec541
23
dsl/dsl_pp_power.R
Normal file
23
dsl/dsl_pp_power.R
Normal file
@ -0,0 +1,23 @@
|
||||
library(tidyverse)
|
||||
library(stringr)
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(purrr)
|
||||
# TODO
|
||||
# join the label data with the existing data from 0714 master
|
||||
# download and set up DSL library
|
||||
# figure out how to use the sentence-level variables
|
||||
# get the categorical variables encoded as integers, then wrapped as factors
|
||||
# figure out power at 200, 400, 500, 750, and 1000
|
||||
# Join the sentence-level OLMO category labels with their author IDs
# (AuthorPHID) from the 0714 master discussion data, matching on `id`.
olmo_categorization_csv <- "~/dsl/inter_090725_sent_cats.csv"
sl_olmo_categorization_df <- read.csv(olmo_categorization_csv, header = TRUE)

main_csv <- "~/p2/071425_master_discussion_data.csv"
main_df <- read.csv(main_csv, header = TRUE)

# The label data has many rows per `id`; the lookup side must have exactly
# one. A duplicated `id` in main_df would silently multiply sentence rows in
# the join and inflate the sample used downstream, so fail loudly instead.
if (anyDuplicated(main_df$id) > 0L) {
  stop(
    "main_df contains duplicate `id` values; ",
    "the join on `id` would duplicate sentence rows.",
    call. = FALSE
  )
}

joined_df <- left_join(
  sl_olmo_categorization_df,
  main_df %>% select(id, AuthorPHID),
  by = "id"
)
|
734079
dsl/inter_090725_sent_cats.csv
Normal file
734079
dsl/inter_090725_sent_cats.csv
Normal file
File diff suppressed because one or more lines are too long
@ -1,18 +0,0 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3441:47269 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: <REDACTED - credentials must not be committed; rotate this password>
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 28911380
|
||||
[2025-09-05T14:55:26.103] error: *** JOB 28911380 ON n3441 CANCELLED AT 2025-09-05T14:55:26 DUE TO TIME LIMIT ***
|
81
p2/quest/olmo_cat_EDA.R
Normal file
81
p2/quest/olmo_cat_EDA.R
Normal file
@ -0,0 +1,81 @@
|
||||
library(tidyverse)
|
||||
library(stringr)
|
||||
library(tidyr)
|
||||
library(dplyr)
|
||||
library(purrr)
|
||||
|
||||
# Labels of the information typology that are accepted as valid sentence
# categories when the final label is assigned further down in this script.
information_typology <- c(
  "EXPECTED BEHAVIOR", "MOTIVATION", "OBSERVED BUG BEHAVIOR",
  "BUG REPRODUCTION", "INVESTIGATION AND EXPLORATION", "SOLUTION DISCUSSION",
  "CONTRIBUTION AND COMMITMENT", "TASK PROGRESS", "TESTING", "FUTURE PLAN",
  "POTENTIAL NEW ISSUES AND REQUESTS", "SOLUTION USAGE", "WORKAROUNDS",
  "ISSUE CONTENT MANAGEMENT", "ACTION ON ISSUE", "SOCIAL CONVERSATION"
)

# Additional accepted labels that mark URLs rather than typology categories.
url_extensions <- c("GERRIT_URL", "URL")
|
||||
|
||||
# Load the batched OLMO categorization output.
olmo_categorization_csv <- "~/p2/quest/090425_olmo_batched_categorized.csv"
olmo_categorization_df <- read.csv(olmo_categorization_csv, header = TRUE)

# Parse `sentence_categories` into a list-column holding one character vector
# of labels per row. The column presumably stores a Python-style list string
# such as "['A', 'B']" -- TODO confirm against the CSV.
#
# The pattern captures only the text *inside* each pair of single quotes.
# A lookaround pattern ("(?<=')[^']+(?=')") does not consume the quotes, so it
# also matches the separator between two quoted items (e.g. ", ") and returns
# it as if it were a label.
olmo_categorization_df <- olmo_categorization_df |>
  mutate(
    sentence_categories_list = map(
      str_match_all(sentence_categories, "'([^']+)'"),
      \(quoted) {
        # Column 2 of the match matrix is the capture group (quote contents).
        labels <- quoted[, 2]
        # Drop entries that are empty once surrounding whitespace is removed.
        labels[str_trim(labels) != ""]
      }
    )
  )
|
||||
|
||||
# Expand the list-column to one row per extracted label, store the
# whitespace-trimmed label in `sent_cat_label`, and discard rows whose label
# is just a comma (list separators picked up by the extraction step).
categories_df <- olmo_categorization_df |>
  unnest(cols = sentence_categories_list) |>
  mutate(
    sent_cat_label = str_trim(sentence_categories_list)
  ) |>
  filter(!(sent_cat_label == ","))
|
||||
|
||||
# Cleaning: normalise known label variants, then derive the final label.
categories_df <- categories_df |>
  mutate(
    # The first matching rule wins; the rules target disjoint label spellings.
    sent_cat_label = case_when(
      # Anything mentioning URL (but not GERRIT_URL) collapses to plain "URL".
      str_detect(sent_cat_label, "URL") &
        !str_detect(sent_cat_label, "GERRIT_URL") ~ "URL",
      sent_cat_label == "WORKAROUND" ~ "WORKAROUNDS",
      sent_cat_label == "CATEGORY: SOLUTION DISCUSSION" ~
        "SOLUTION DISCUSSION",
      sent_cat_label == "TYPE: ISSUE CONTENT MANAGEMENT" ~
        "ISSUE CONTENT MANAGEMENT",
      TRUE ~ sent_cat_label
    ),
    # Labels outside the typology and the URL markers are pooled together.
    final_cat_label = if_else(
      sent_cat_label %in% c(information_typology, url_extensions),
      sent_cat_label,
      "Nonspecified Label"
    )
  )
|
||||
|
||||
|
||||
# Frequency of each final label, including NA if any are present.
table(categories_df[["final_cat_label"]], useNA = "ifany")

# Persist the cleaned sentence-level labels (read back by the DSL script).
write.csv(
  x = categories_df,
  file = "~/dsl/inter_090725_sent_cats.csv",
  row.names = FALSE
)
|
||||
|
||||
library(forcats)
|
||||
# Share (in percent) of each final label within each comment type.
plot_df <- categories_df |>
  group_by(comment_type, final_cat_label) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(comment_type) |>
  mutate(percent = n / sum(n) * 100) |>
  ungroup()

# Order the bars from largest to smallest *within each facet*.
# Calling fct_reorder() inside a grouped mutate() does not achieve this: the
# per-group factors are combined into one factor whose levels are the union
# across groups, so only a single global order survives. Instead, build a key
# that is unique per (label, comment type) pair and order that key by percent;
# the "___<comment_type>" suffix is stripped again for the axis labels.
plot_df <- plot_df |>
  mutate(
    label_in_facet = fct_reorder(
      paste(final_cat_label, comment_type, sep = "___"),
      percent,
      .desc = TRUE
    )
  )

ggplot(
  plot_df,
  aes(x = label_in_facet, y = percent, fill = final_cat_label)
) +
  geom_col() +
  geom_text(aes(label = sprintf("%.1f%%", percent)), vjust = -0.2, size = 3) +
  facet_wrap(~ comment_type, scales = "free_x") +
  # Show only the label part of the per-facet ordering key.
  scale_x_discrete(labels = \(key) sub("___.*$", "", key)) +
  theme_minimal() +
  xlab("Label") +
  ylab("%") +
  ggtitle("Distribution of OLMO Category Labels by Comment Type") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user