# Exploratory analysis of PCA scores (columns PC1 / PC2) for task descriptions
# and sub-comments: joins the PCA tables onto the unified task data, cleans the
# comment text for manual inspection, runs per-source ANOVAs of PC2 on phase,
# and draws a few diagnostic plots.

library(tidyverse)
library(dplyr)
library(stringr)
library(scales)
library(ggplot2)

# Load data ----

neurobiber_description_pca_csv <- "~/p2/quest/101325_description_PCA_df.csv"
neurobiber_description_pca_df <-
  read.csv(neurobiber_description_pca_csv, header = TRUE) |>
  mutate(comment_text = text)

neurobiber_subcomment_pca_csv <- "~/p2/quest/101325_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <-
  read.csv(neurobiber_subcomment_pca_csv, header = TRUE) |>
  mutate(comment_text = text)

pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE) |>
  mutate(comment_text = text)

main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

main_df <- main_df |>
  select(
    TaskPHID, AuthorPHID, date_created, comment_text,
    isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority
  )

# Join the unified data onto each PCA table ----

# Columns that identify one comment in both main_df and the PCA tables.
join_keys <- c("TaskPHID", "AuthorPHID", "date_created", "comment_text")

# right_join keeps every PCA row, matched or not. Columns present on both sides
# but not in join_keys (e.g. priority) come out suffixed .x / .y.
# Join main_df to neurobiber_description_pca_df
description_joined <- main_df |>
  right_join(neurobiber_description_pca_df, by = join_keys) |>
  filter(comment_text != "nan") # TODO: look at this more in depth

# Join main_df to neurobiber_subcomment_pca_df
subcomment_joined <- main_df |>
  right_join(neurobiber_subcomment_pca_df, by = join_keys) |>
  filter(comment_text != "nan") # TODO: look at this more in depth

total_joined <- main_df |>
  right_join(pca_df, by = join_keys) |>
  filter(comment_text != "nan") # TODO: look at this more in depth

# Text cleaning ----

#' Replace code, quotes, URLs and @mentions in one comment with placeholders
#'
#' @param message A single comment as a length-one character vector.
#' @return A length-one character vector in which fenced and inline code spans
#'   are replaced by `CODE`, lines starting with `>` by `QUOTE`, Gerrit change
#'   URLs by `GERRIT_URL`, any other http(s) URL by `URL`, and `@name`
#'   mentions by `SCREEN_NAME`. A missing input yields `NA_character_`.
preprocess_comment <- function(message) {
  # Without this guard paste() below would turn NA into the string "NA".
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }
  comment_text <- message

  # 1. replace code with CODE
  # Block code: ```...``` -- must run before the inline rule, otherwise the
  # inline pattern eats the innermost backticks and leaves "``CODE``" behind.
  comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
  # Inline code: `...`
  comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. replace quotes with QUOTE
  lines <- unlist(strsplit(comment_text, "\n"))
  lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
  comment_text <- paste(lines, collapse = "\n")

  # 3. replace Gerrit URLs with GERRIT_URL (before the generic URL rule, which
  # would otherwise match them first)
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
  # replace URL with URL
  url_pattern <- "https?://[^\\s]+"
  comment_text <- str_replace_all(comment_text, url_pattern, "URL")

  # 4. replace @screenname with SCREEN_NAME
  # "\\1" puts back the whitespace (or line start) captured before the "@", so
  # the placeholder is not glued onto the preceding word.
  str_replace_all(comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME")
}

# Add comment_type column to each df
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"

# clean the messages
# vapply() guarantees a character vector (also for zero rows); USE.NAMES = FALSE
# stops each element from being named with its full original text.
neurobiber_description_pca_df$cleaned_comment <- vapply(
  neurobiber_description_pca_df$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)
neurobiber_subcomment_pca_df$cleaned_comment <- vapply(
  neurobiber_subcomment_pca_df$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)

# Flag sub-comments whose (author, task) pair also appears among the task
# descriptions.
subcomment_joined <- subcomment_joined |>
  mutate(
    pair_in_description = paste(AuthorPHID, TaskPHID) %in%
      paste(
        neurobiber_description_pca_df$AuthorPHID,
        neurobiber_description_pca_df$TaskPHID
      )
  )

# ANOVA ----

# One-way ANOVA of PC2 on phase, fitted separately for every value of source.
description_anova_results <- neurobiber_description_pca_df |>
  group_by(source) |>
  group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
description_anova_results

discussion_anova_results <- neurobiber_subcomment_pca_df |>
  group_by(source) |>
  group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
discussion_anova_results

# Representative comments ----

# look at the representative comments for PC1 and PC2
# Despite the names, these hold ranks 300 to 310 (11 rows) from each end of the
# PC2 ordering, not the five most extreme comments.
top5 <- neurobiber_description_pca_df |>
  arrange(desc(PC2)) |>
  slice(300:310) |>
  pull(cleaned_comment)
bottom5 <- neurobiber_description_pca_df |>
  arrange(PC2) |>
  slice(300:310) |>
  pull(cleaned_comment)

cat("Top 300:310 comment_text by PC2 score:\n")
print(top5)
cat("\nBottom 300:310 comment_text by PC2 score:\n")
print(bottom5)

# Plots ----

affiliationColors <- setNames(c("#5da2d8", "#c7756a"), c("False", "True"))

# NOTE(review): the filter compares against the string "TRUE"; that matches a
# logical column (TRUE is coerced to "TRUE") but not a character "True" --
# confirm how read.csv parsed isGerritBot. Rows with NA are dropped as well.
# NOTE(review): the join assumes one description row per TaskPHID; duplicates
# would multiply sub-comment rows -- TODO confirm.
subcomment_joined_no_gerrit <- subcomment_joined |>
  filter(isGerritBot != "TRUE") |>
  left_join(
    neurobiber_description_pca_df |> select(TaskPHID, priority),
    by = "TaskPHID"
  )

#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
# geom_point(shape = 21, alpha=0.4, size=2) +
# geom_bin_2d() +

# 1. Draw a random set of authors. No seed is set, so the sample differs between
# runs. slice_sample() returns all authors when fewer than n exist, where the
# superseded sample_n() would stop with an error.
sampled_authors <- subcomment_joined_no_gerrit |>
  distinct(AuthorPHID) |>
  slice_sample(n = 100) |>
  pull(AuthorPHID)
# 2. Filter original data to just those authors
sub_sample <- subcomment_joined_no_gerrit |>
  filter(AuthorPHID %in% sampled_authors)

# The "Set1" palette used below has 9 colours, so the sample of 8 fits in it.
description_sampled_authors <- description_joined |>
  distinct(AuthorPHID) |>
  slice_sample(n = 8) |>
  pull(AuthorPHID)
# 2. Filter original data to just those authors
description_sub_sample <- description_joined |>
  filter(AuthorPHID %in% description_sampled_authors)

ggplot(description_sub_sample, aes(x = PC2, y = PC1, fill = AuthorPHID)) +
  facet_grid(source ~ phase, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  xlim(-30, 30) +
  ylim(-30, 30) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  guides(fill = "none") +
  labs(
    title = "PCs for Task Comments (Faceted by source and phase)",
    x = "PC2",
    y = "PC1"
  )

priority_order <- c(
  "Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage"
)

subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit |>
  mutate(priority = factor(priority, levels = priority_order))

# priority.y is the priority column contributed by the description PCA table.
description_joined <- description_joined |>
  mutate(priority = factor(priority.y, levels = priority_order))

# NOTE(review): x is the continuous PC1 score. The previous labels ("Task
# priority", "isAuthorWMF?") suggest a priority-by-affiliation plot may have
# been intended; the labels below describe what is actually drawn.
ggplot(
  total_joined,
  aes(
    x = PC1,
    y = PC2,
    fill = comment_type
  )
) +
  ylim(-20, 20) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of PC2 by comment type",
    x = "PC1",
    y = "PC2",
    fill = "Comment type"
  )