# Exploratory analysis of NeuroBiber PCA scores for task descriptions and
# task subcomments: text cleaning, per-source ANOVAs of PC2 by phase,
# inspection of representative comments, and a faceted scatter plot.

library(tidyverse)
library(stringr)
library(dplyr)
library(scales)
library(ggplot2)

# Load data ----

neurobiber_description_pca_csv <- "~/p2/quest/092325_description_PCA_df.csv"
neurobiber_description_pca_df <- read.csv(
  neurobiber_description_pca_csv,
  header = TRUE
)

neurobiber_subcomment_pca_csv <- "~/p2/quest/092325_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <- read.csv(
  neurobiber_subcomment_pca_csv,
  header = TRUE
)

# Text cleaning ----

#' Replace noisy spans of a single comment with placeholder tokens
#'
#' Substitutions are applied in this order: fenced code blocks and inline
#' code -> `CODE`; lines starting with `>` (after trimming) -> `QUOTE`;
#' Gerrit change URLs -> `GERRIT_URL`; any remaining http(s) URL -> `URL`;
#' `@name` mentions -> `SCREEN_NAME`.
#'
#' @param message A single character string (one comment).
#' @return A single character string with the substitutions applied, or
#'   `NA_character_` when `message` is `NA`.
preprocess_comment <- function(message) {
  # Without this guard, paste() below would turn NA into the string "NA".
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }

  comment_text <- message

  # 1. replace code with CODE
  # Fenced blocks (```...```) must go first: the inline pattern would
  # otherwise match inside the fence and leave stray backticks behind.
  comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
  # Inline code: `...`
  comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. replace quotes with QUOTE (whole line, if it starts with ">")
  lines <- unlist(strsplit(comment_text, "\n"))
  lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
  comment_text <- paste(lines, collapse = "\n")

  # 3. replace Gerrit URLs with GERRIT_URL, then any other URL with URL.
  # The Gerrit pattern has to run before the generic one.
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- str_replace_all(
    comment_text,
    gerrit_url_pattern,
    "GERRIT_URL"
  )
  url_pattern <- "https?://[^\\s]+"
  comment_text <- str_replace_all(comment_text, url_pattern, "URL")

  # 4. replace @screenname with SCREEN_NAME
  # "\\1" keeps the whitespace (or start-of-string) captured before the
  # "@", so the placeholder is not glued onto the preceding word.
  str_replace_all(comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME")
}

# Add comment_type column to each df
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"

# Clean the messages. vapply() guarantees a plain character vector and
# USE.NAMES = FALSE avoids attaching the full raw text as element names.
neurobiber_description_pca_df$cleaned_comment <- vapply(
  as.character(neurobiber_description_pca_df$text),
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)
neurobiber_subcomment_pca_df$cleaned_comment <- vapply(
  as.character(neurobiber_subcomment_pca_df$text),
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)

# ANOVA: PC2 by phase, within each source ----

#' One-way ANOVA of PC2 on phase, fitted separately for each `source`
#'
#' @param df A data frame with columns `source`, `phase` and `PC2`.
#' @return A list of `summary(aov(...))` objects, one per source, named by
#'   the source value.
run_phase_anova_by_source <- function(df) {
  grouped <- group_by(df, source)
  results <- group_map(
    grouped,
    ~ summary(aov(PC2 ~ phase, data = .x)),
    .keep = TRUE
  )
  names(results) <- as.character(group_keys(grouped)$source)
  results
}

description_anova_results <- run_phase_anova_by_source(
  neurobiber_description_pca_df
)
description_anova_results

discussion_anova_results <- run_phase_anova_by_source(
  neurobiber_subcomment_pca_df
)
discussion_anova_results

# Representative comments for one principal component ----

# Component and rank window to inspect; the printed headers are derived
# from these so they cannot drift from what is actually shown.
inspect_pc <- "PC6"
inspect_rows <- 300:310
inspect_label <- paste(range(inspect_rows), collapse = ":")

# NOTE: despite the names, top5/bottom5 hold the comments ranked
# `inspect_rows` from the high and low end of the component respectively.
top5 <- neurobiber_subcomment_pca_df %>%
  arrange(desc(.data[[inspect_pc]])) %>%
  slice(inspect_rows) %>%
  pull(cleaned_comment)

bottom5 <- neurobiber_subcomment_pca_df %>%
  arrange(.data[[inspect_pc]]) %>%
  slice(inspect_rows) %>%
  pull(cleaned_comment)

cat("Top ", inspect_label, " comment_text by ", inspect_pc, " score:\n",
    sep = "")
print(top5)
cat("\nBottom ", inspect_label, " comment_text by ", inspect_pc,
    " score:\n", sep = "")
print(bottom5)

# Aggregate description PCs ----

# NOTE(review): the mean_PC* columns hold *medians*; the names are kept
# because the pivot below selects on the "mean_PC" prefix.
aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df %>%
  group_by(
    AuthorWMFAffil, week_index, source, priority, closed_relevance, phase
  ) %>%
  summarise(
    mean_PC1 = median(PC1),
    mean_PC2 = median(PC2),
    mean_PC3 = median(PC3),
    mean_PC4 = median(PC4),
    mean_PC5 = median(PC5),
    .groups = "drop"
  )

# Colours keyed by affiliation flag (not used by the plot below).
affiliationColors <- setNames(
  c("#5da2d8", "#c7756a"),
  c("False", "True")
)

long_df <- aggregated_neurobiber_description_pca_df %>%
  tidyr::pivot_longer(
    cols = starts_with("mean_PC"),
    names_to = "PC",
    values_to = "PC_value"
  )

# NOTE(review): this step previously operated on `unified_df`, which is not
# defined anywhere in this script; `long_df` is presumably the intended
# target -- confirm.
# NOTE(review): read.csv() may parse "True"/"False" as logical, in which
# case these string levels would not match -- verify the column type.
long_df$AuthorWMFAffil <- factor(
  long_df$AuthorWMFAffil,
  levels = c("False", "True")
)
long_df <- long_df[order(long_df$AuthorWMFAffil), ]

# Plot ----

ggplot(
  neurobiber_description_pca_df,
  aes(x = PC1, y = PC3, fill = closed_relevance)
) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(source ~ phase) +
  scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
  theme_minimal() +
  labs(
    title = "PCs for Task Descriptions (Faceted by Source and Phase)",
    x = "PC1",
    y = "PC3",
    fill = "closed_relevance"
  )