# mw-lifecycle-analysis/p2/quest/neurobiber_PCA_analysis.R

library(tidyverse)
library(dplyr)
# ---- Load inputs ----
# PCA score tables for task descriptions and for subcomments. In each one the
# `text` column is copied into `comment_text`, the column name used as a join
# key further down.
neurobiber_description_pca_csv <- "~/p2/quest/101325_description_PCA_df.csv"
neurobiber_description_pca_df <- mutate(
  read.csv(neurobiber_description_pca_csv, header = TRUE),
  comment_text = text
)

neurobiber_subcomment_pca_csv <- "~/p2/quest/101325_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <- mutate(
  read.csv(neurobiber_subcomment_pca_csv, header = TRUE),
  comment_text = text
)

# Main table, trimmed immediately to the four join keys plus the metadata
# columns carried into the joined frames.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE) |>
  select(
    TaskPHID, AuthorPHID, date_created, comment_text,
    isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority
  )

# ---- Attach main_df metadata to the PCA tables ----
# right_join() keeps every row of the PCA table (the right-hand side).
# Rows whose comment_text is the literal string "nan" are then removed.
description_joined <- right_join(
  main_df,
  neurobiber_description_pca_df,
  by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")
) |>
  filter(comment_text != "nan") #TODO: look at this more in depth

# Same join and filter for the subcomment PCA table.
subcomment_joined <- right_join(
  main_df,
  neurobiber_subcomment_pca_df,
  by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")
) |>
  filter(comment_text != "nan") #TODO: look at this more in depth

#' Mask code, quotes, URLs and mentions in a single comment
#'
#' Replacements are applied in this order:
#'   1. fenced code blocks (```...```), then inline code (`...`) -> "CODE"
#'   2. any line that starts with ">" once trimmed -> "QUOTE"
#'   3. Gerrit change URLs -> "GERRIT_URL", then other http(s) URLs -> "URL"
#'   4. @mentions at the start of the text or after whitespace -> "SCREEN_NAME"
#'
#' @param message A single comment string. The text is split on "\n" and
#'   re-joined, so a longer vector is collapsed into one string.
#' @return A length-one character vector; `NA_character_` when `message` is a
#'   single missing value.
preprocess_comment <- function(message) {
  # Without this guard the split/paste step below turns a missing comment
  # into the literal text "NA".
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }
  comment_text <- message

  # 1. replace code with CODE
  # Fenced blocks must be handled before inline code: the inline pattern
  # otherwise matches the innermost backtick pair of a fence and leaves
  # "``CODE``" behind, which the fenced pattern can no longer match.
  comment_text <- stringr::str_replace_all(
    comment_text, "```[\\s\\S]+?```", "CODE"
  )
  comment_text <- stringr::str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. replace quotes with QUOTE (whole-line replacement)
  lines <- unlist(strsplit(comment_text, "\n"))
  is_quote <- stringr::str_detect(stringr::str_trim(lines), "^>")
  lines <- ifelse(is_quote, "QUOTE", lines)
  comment_text <- paste(lines, collapse = "\n")

  # 3. replace Gerrit URLs with GERRIT_URL, before the generic URL pattern
  #    gets a chance to swallow them
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- stringr::str_replace_all(
    comment_text, gerrit_url_pattern, "GERRIT_URL"
  )
  url_pattern <- "https?://[^\\s]+"
  comment_text <- stringr::str_replace_all(comment_text, url_pattern, "URL")

  # 4. replace @screenname with SCREEN_NAME
  # "\\1" puts back the whitespace (or empty start-of-text) captured in front
  # of the mention, so "hi @bob" becomes "hi SCREEN_NAME" rather than
  # "hiSCREEN_NAME".
  stringr::str_replace_all(
    comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME"
  )
}

# Label each PCA table with the kind of comment it holds.
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"

# Run preprocess_comment() over every row's `text`; each call yields one
# cleaned string, stored in `cleaned_comment`.
neurobiber_description_pca_df$cleaned_comment <- vapply(
  neurobiber_description_pca_df$text,
  preprocess_comment,
  character(1)
)
neurobiber_subcomment_pca_df$cleaned_comment <- vapply(
  neurobiber_subcomment_pca_df$text,
  preprocess_comment,
  character(1)
)

# pair_in_description is TRUE for subcomment rows whose (author, task) pair
# also occurs in the task-description table.
subcomment_joined <- mutate(
  subcomment_joined,
  pair_in_description = is.element(
    paste(AuthorPHID, TaskPHID),
    paste(
      neurobiber_description_pca_df$AuthorPHID,
      neurobiber_description_pca_df$TaskPHID
    )
  )
)

# One-way ANOVA of PC2 on `phase`, fitted separately for each `source` group.
# Each result is a list holding one summary(aov(...)) per group.

# Task descriptions
description_anova_results <- group_map(
  group_by(neurobiber_description_pca_df, source),
  function(group_rows, group_key) {
    summary(aov(PC2 ~ phase, data = group_rows))
  },
  .keep = TRUE
)
description_anova_results

# Subcomments
discussion_anova_results <- group_map(
  group_by(neurobiber_subcomment_pca_df, source),
  function(group_rows, group_key) {
    summary(aov(PC2 ~ phase, data = group_rows))
  },
  .keep = TRUE
)
discussion_anova_results

# Pull example cleaned comments from a fixed rank window (rows 300 to 310)
# of the task descriptions, ordered by PC2 from each end.
# NOTE(review): the names say "5" but the window 300:310 spans eleven rows.
top5 <- local({
  by_pc2_descending <- arrange(neurobiber_description_pca_df, desc(PC2))
  pull(slice(by_pc2_descending, 300:310), cleaned_comment)
})

bottom5 <- local({
  by_pc2_ascending <- arrange(neurobiber_description_pca_df, PC2)
  pull(slice(by_pc2_ascending, 300:310), cleaned_comment)
})

# Print both windows, high-PC2 side first.
cat("Top 300:310 comment_text by PC2 score:\n")
print(top5)

cat("\nBottom 300:310 comment_text by PC2 score:\n")
print(bottom5)


library(scales)
library(ggplot2)


# Colour lookup keyed by the strings "False" / "True".
affiliationColors <- c("False" = '#5da2d8', "True" = '#c7756a')

# Subcomments with Gerrit-bot rows removed, plus the `priority` recorded in
# the task-description table for the same TaskPHID.
# NOTE(review): if the description table has more than one row per TaskPHID
# this join repeats subcomment rows — confirm TaskPHID is unique there.
# NOTE(review): if subcomment_joined already has a plain `priority` column,
# the join suffixes both copies (.x/.y) — verify the column names.
subcomment_joined_no_gerrit <- left_join(
  filter(subcomment_joined, isGerritBot != "TRUE"),
  select(neurobiber_description_pca_df, TaskPHID, priority),
  by = "TaskPHID"
)


#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
# geom_point(shape = 21, alpha=0.4, size=2) +
# geom_bin_2d() +

# ---- Author subsamples used for plotting ----
# slice_sample() replaces the superseded sample_n(): when fewer distinct
# authors exist than requested it returns all of them instead of aborting.
# NOTE(review): no seed is set, so each run draws different authors; call
# set.seed() beforehand if the figures need to be reproducible.

# 1. Draw up to 100 distinct subcomment authors
sampled_authors <- subcomment_joined_no_gerrit |>
  distinct(AuthorPHID) |>
  slice_sample(n = 100) |>
  pull(AuthorPHID)

# 2. Filter original data to just those authors
sub_sample <- subcomment_joined_no_gerrit |>
  filter(AuthorPHID %in% sampled_authors)

# 1. Draw up to 8 distinct task-description authors
description_sampled_authors <- description_joined |>
  distinct(AuthorPHID) |>
  slice_sample(n = 8) |>
  pull(AuthorPHID)

# 2. Filter original data to just those authors
description_sub_sample <- description_joined |>
  filter(AuthorPHID %in% description_sampled_authors)

# Scatter of PC1 against PC2 for the sampled description authors: one panel
# per source (rows) and phase (columns), points filled by author, with the
# fill legend hidden. Both axes are limited to [-30, 30].
ggplot(
  data = description_sub_sample,
  mapping = aes(x = PC2, y = PC1, fill = AuthorPHID)
) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(source ~ phase, scales = "fixed") +
  lims(x = c(-30, 30), y = c(-30, 30)) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  guides(fill = "none") +
  labs(
    title = "PCs for Task Comments (Faceted by source and phase)",
    x = "PC2",
    y = "PC1"
  )

# Display order for task priority; values outside this list become NA when
# converted to a factor.
priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")

subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>%
  mutate(priority = factor(priority, levels = priority_order))

# `priority.y` is the suffixed copy produced by the join that built
# description_joined; presumably the PCA table's own priority — verify.
description_joined <- description_joined %>%
  mutate(priority = factor(priority.y, levels = priority_order))

# Boxplot of PC2 by task priority, one panel per source, split by author
# affiliation. The fill was previously mapped to AuthorPHID, which drew one
# box per author on the unsampled data and contradicted the "isAuthorWMF?"
# legend title. factor() keeps the fill discrete however the flag was read.
ggplot(description_joined, aes(
  x = priority,    # already an ordered-level factor from the mutate above
  y = PC2,
  fill = factor(isAuthorWMF)
)) +
  ylim(-20, 20) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") +   # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of PC2 for Task Descriptions",
    x = "Task priority",
    y = "PC2",
    fill = "isAuthorWMF?"
  )