# mw-lifecycle-analysis/p2/quest/neurobiber_PCA_analysis.R

library(tidyverse)

# Locations of the two CSV inputs: one table for task descriptions and one
# for task subcomments.
neurobiber_description_pca_csv <- "~/p2/quest/092325_description_PCA_df.csv"
neurobiber_subcomment_pca_csv <- "~/p2/quest/092325_subcomment_PCA_df.csv"

# Load both tables; the first row of each file supplies the column names.
neurobiber_description_pca_df <- read.csv(
  file = neurobiber_description_pca_csv,
  header = TRUE
)
neurobiber_subcomment_pca_df <- read.csv(
  file = neurobiber_subcomment_pca_csv,
  header = TRUE
)


#' Replace noisy spans of a single comment with placeholder tokens.
#'
#' Fenced and inline code become `CODE`, lines starting with `>` become
#' `QUOTE`, Gerrit change links become `GERRIT_URL`, any other http(s) link
#' becomes `URL`, and `@name` mentions become `SCREEN_NAME`.
#'
#' @param message A length-one character vector holding the raw comment text.
#' @return A length-one character vector with the placeholders substituted,
#'   or `NA_character_` when `message` is a single `NA`.
preprocess_comment <- function(message) {
  # A missing comment stays missing; without this guard the paste() below
  # would turn it into the literal string "NA".
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }
  comment_text <- message

  # 1. replace code with CODE
  # Block code: ```...``` -- handled first, because the inline pattern would
  # otherwise match inside the fence and leave "``CODE``" behind.
  comment_text <- stringr::str_replace_all(
    comment_text, "```[\\s\\S]+?```", "CODE"
  )
  # Inline code: `...`
  comment_text <- stringr::str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. replace quotes with QUOTE (any line whose first non-blank char is ">")
  lines <- unlist(strsplit(comment_text, "\n"))
  is_quote <- stringr::str_detect(stringr::str_trim(lines), "^>")
  lines <- ifelse(is_quote, "QUOTE", lines)
  comment_text <- paste(lines, collapse = "\n")

  # 3. replace Gerrit URLs with GERRIT_URL; this must run before the generic
  # URL rule so the more specific token wins.
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- stringr::str_replace_all(
    comment_text, gerrit_url_pattern, "GERRIT_URL"
  )
  # replace any remaining URL with URL
  url_pattern <- "https?://[^\\s]+"
  comment_text <- stringr::str_replace_all(comment_text, url_pattern, "URL")

  # 4. replace @screenname with SCREEN_NAME. The leading whitespace (or
  # start of string) is captured and written back so the placeholder does
  # not fuse with the preceding word.
  stringr::str_replace_all(
    comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME"
  )
}

# Add comment_type column to each df
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"

# Clean the messages. vapply() guarantees exactly one string per row (and a
# character(0) result for an empty table), and USE.NAMES = FALSE stops the
# raw text from being attached to the new column as element names.
neurobiber_description_pca_df$cleaned_comment <- vapply(
  neurobiber_description_pca_df$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)
neurobiber_subcomment_pca_df$cleaned_comment <- vapply(
  neurobiber_subcomment_pca_df$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)


# Look at how PC2 relates to phase: a one-way ANOVA (PC2 ~ phase) fitted
# separately within each source. Each result list is named by its source
# value so the printed tables can be told apart.
library(dplyr)
description_by_source <- neurobiber_description_pca_df %>%
  group_by(source)
description_anova_results <- description_by_source %>%
  group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE) %>%
  setNames(as.character(group_keys(description_by_source)$source))
description_anova_results

discussion_by_source <- neurobiber_subcomment_pca_df %>%
  group_by(source)
discussion_anova_results <- discussion_by_source %>%
  group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE) %>%
  setNames(as.character(group_keys(discussion_by_source)$source))
discussion_anova_results

# Look at representative cleaned comments for one principal component: the
# rows ranked 300-310 from the high end and from the low end of its scores.
# The component is named once here so the sort column and the printed
# labels cannot drift apart.
pc_column <- "PC6"

# NOTE: the names say "5", but slice(300:310) keeps up to 11 rows.
top5 <- neurobiber_subcomment_pca_df %>%
  arrange(desc(.data[[pc_column]])) %>%
  slice(300:310) %>%
  pull(cleaned_comment)

bottom5 <- neurobiber_subcomment_pca_df %>%
  arrange(.data[[pc_column]]) %>%
  slice(300:310) %>%
  pull(cleaned_comment)

cat(sprintf("Top 300:310 cleaned_comment by %s score:\n", pc_column))
print(top5)

cat(sprintf("\nBottom 300:310 cleaned_comment by %s score:\n", pc_column))
print(bottom5)


# Per-group medians of PC1-PC5 for the description table. The output columns
# keep their mean_PC* names because the later pivot selects them with
# starts_with("mean_PC"), but note that each value is a median, not a mean.
# .groups = "drop" returns an ungrouped table and silences the regrouping
# message that summarise() otherwise prints.
aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df %>%
  group_by(
    AuthorWMFAffil, week_index, source, priority, closed_relevance, phase
  ) %>%
  summarise(
    across(all_of(paste0("PC", 1:5)), median, .names = "mean_{.col}"),
    .groups = "drop"
  )
library(scales)
library(ggplot2)


# Colour lookup keyed by the "False" / "True" labels.
affiliationColors <- c(
  "False" = "#5da2d8",
  "True" = "#c7756a"
)


# Reshape the aggregated table to long form: every mean_PC* column becomes a
# (PC, PC_value) pair of rows.
long_df <- tidyr::pivot_longer(
  aggregated_neurobiber_description_pca_df,
  cols = starts_with("mean_PC"),
  names_to = "PC",
  values_to = "PC_value"
)

# Order unified_df by affiliation ("False" rows first, then "True").
# NOTE(review): unified_df is not created anywhere in this part of the
# script -- presumably it comes from an earlier session or another file;
# confirm. The guard keeps the script from aborting before the plot below
# when it is absent.
if (exists("unified_df")) {
  unified_df$AuthorWMFAffil <- factor(
    unified_df$AuthorWMFAffil,
    levels = c("False", "True")
  )
  unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
}
# Scatter of PC1 against PC3 for the task-description table, one panel per
# source (rows) and phase (columns), with points filled by closed_relevance.
# The title and legend label name the data and variable actually plotted.
ggplot(
  neurobiber_description_pca_df,
  aes(x = PC1, y = PC3, fill = closed_relevance)
) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(source ~ phase) +
  scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
  theme_minimal() +
  labs(
    title = "PCs for Task Descriptions (Faceted by Source and Phase)",
    x = "PC1",
    y = "PC3",
    fill = "closed_relevance"
  )