# mw-lifecycle-analysis/p2/quest/neurobiber_PCA_analysis.R

library(tidyverse)
library(dplyr)

# Read the unified comment-level CSV and report how many distinct ids it
# holds (the count is auto-printed when the script runs at top level).
main_csv <- "~/analysis_data/121625_unified.csv"
main_df <- read.csv(file = main_csv, header = TRUE)
main_df$id |>
  unique() |>
  length()

#' Mask code, quotes, URLs and mentions in a single comment.
#'
#' Replacements are applied in this order:
#'   1. fenced code blocks (```...```), then inline code (`...`) -> "CODE"
#'   2. lines whose first non-blank character is ">" -> "QUOTE"
#'   3. Gerrit change URLs -> "GERRIT_URL", then any remaining http(s) URL
#'      -> "URL"
#'   4. "@name" at the start of the text or after whitespace -> "SCREEN_NAME"
#'      (the whitespace in front of the mention is kept)
#'
#' @param message A single comment (length-one character vector).
#' @return The masked comment as a length-one character vector, or
#'   `NA_character_` when `message` is a single `NA`.
preprocess_comment <- function(message) {
  # A missing comment stays missing; pushing it through paste() below would
  # turn it into the literal string "NA".
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }
  comment_text <- message

  # 1. Mask code. Fenced blocks go first: the inline pattern would otherwise
  #    match the innermost pair of backticks of a fence and leave "``CODE``"
  #    behind, after which the fence pattern can no longer match.
  comment_text <- stringr::str_replace_all(
    comment_text, "```[\\s\\S]+?```", "CODE"
  )
  comment_text <- stringr::str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. Replace whole quoted lines (first non-blank character is ">").
  lines <- unlist(strsplit(comment_text, "\n"))
  lines <- ifelse(
    stringr::str_detect(stringr::str_trim(lines), "^>"),
    "QUOTE",
    lines
  )
  comment_text <- paste(lines, collapse = "\n")

  # 3. Gerrit change links get their own token; this must run before the
  #    generic URL rule, which would otherwise swallow them.
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- stringr::str_replace_all(
    comment_text, gerrit_url_pattern, "GERRIT_URL"
  )
  url_pattern <- "https?://[^\\s]+"
  comment_text <- stringr::str_replace_all(comment_text, url_pattern, "URL")

  # 4. Mask @mentions. The pattern captures the preceding whitespace (or the
  #    start of the text), so "\\1" puts it back; without it the token is
  #    glued to the previous word ("hi @bob" -> "hiSCREEN_NAME").
  stringr::str_replace_all(
    comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME"
  )
}
# Clean every comment. vapply() pins the result to one string per row (also
# for an empty data frame), and USE.NAMES = FALSE keeps the raw comment text
# from being attached as element names, which sapply() does for character
# input.
main_df$cleaned_comment <- vapply(
  as.character(main_df$comment_text),
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)

# Look at representative comments for one principal component: a window of
# rows from the high-scoring end and the same window from the low-scoring end.
# The component and the rank window are set once here so the printed headers
# always describe what was actually pulled.
rep_pc <- "PC3"
rep_rows <- 250:260

# `top5` / `bottom5` keep their historical names; each holds one entry per
# row in `rep_rows` that exists in the data.
top5 <- main_df %>%
  arrange(desc(.data[[rep_pc]])) %>%
  slice(rep_rows) %>%
  pull(cleaned_comment)

bottom5 <- main_df %>%
  arrange(.data[[rep_pc]]) %>%
  slice(rep_rows) %>%
  pull(cleaned_comment)

cat(sprintf(
  "Top %d:%d cleaned_comment by %s score:\n",
  min(rep_rows), max(rep_rows), rep_pc
))
print(top5)

cat(sprintf(
  "\nBottom %d:%d cleaned_comment by %s score:\n",
  min(rep_rows), max(rep_rows), rep_pc
))
print(bottom5)


# PC1 against PC4 for every comment: one panel per case (`source`), points
# filled by comment type. Both axes are limited to [-50, 50]. The figure is
# then written to disk as a PNG.
comments_style <- ggplot(
  data = main_df,
  mapping = aes(x = PC1, y = PC4, fill = comment_type)
) +
  geom_point(shape = 21, size = 2, alpha = 0.3) +
  facet_grid(
    . ~ source,
    scales = "fixed",
    labeller = as_labeller(
      c(
        "c1" = "VisualEditor",
        "c2" = "HTTPS-login",
        "c3" = "HTTP-deprecation"
      )
    )
  ) +
  lims(x = c(-50, 50), y = c(-50, 50)) +
  scale_fill_viridis_d(
    option = "magma",
    name = "Comment type",
    labels = c("Task Description", "Reply")
  ) +
  labs(
    x = "Lengthy Discussion v. Brief Updates (PC1)",
    y = "Technical Jargon v. Non-technical Observations (PC4)"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

# 12 x 8 inches at 800 dpi.
ggsave(
  "121625_comments_style.png",
  comments_style,
  width = 12,
  height = 8,
  dpi = 800
)

# PC3 against PC4 for the rows with ADAC == 1: one panel per case (`source`),
# points filled by the isAuthorWMF flag. Both axes are limited to [-50, 50].
# The figure is then written to disk as a PNG.
adac_style <- ggplot(
  data = dplyr::filter(main_df, ADAC == 1),
  mapping = aes(x = PC3, y = PC4, fill = as.factor(isAuthorWMF))
) +
  geom_point(shape = 21, size = 2, alpha = 0.3) +
  facet_grid(
    . ~ source,
    labeller = as_labeller(
      c(
        "c1" = "VisualEditor",
        "c2" = "HTTPS-login",
        "c3" = "HTTP-deprecation",
        "task_description" = "Task Description",
        "task_subcomment" = "Follow-up Reply"
      )
    )
  ) +
  lims(x = c(-50, 50), y = c(-50, 50)) +
  scale_fill_viridis_d() +
  labs(
    x = "Expressive, first-person v. Dry, third-person (PC3)",
    y = "Technical Jargon v. Non-technical Observations (PC4)"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

# Unused title kept for reference:
# "PCs for Pre-Resolution Comments Written by Task Author (by Author
#  Affiliation, Case, and Comment Type)"

# 12 x 8 inches at 800 dpi.
ggsave(
  "121625_adac_affil_style.png",
  adac_style,
  width = 12,
  height = 8,
  dpi = 800
)


# Replies only (comment_type == "task_subcomment"): PC3 against PC4, one row
# of panels per ADAC value and one column per case (`source`). The plot is not
# assigned, so it is drawn when the script runs at top level.
main_df |>
  filter(comment_type == "task_subcomment") |>
  ggplot(
    aes(
      # PC3 on x and PC4 on y, so the mapping agrees with the axis titles set
      # in labs() below.
      x = PC3,
      y = PC4,
      fill = as.factor(ADAC)
    )
  ) +
  facet_grid(
    ADAC ~ source,
    # Relabel only the `source` strips. A bare lookup table would also be
    # applied to the ADAC strips, whose values are not in the table, and turn
    # those strip labels into NA. ADAC strips keep their raw values.
    labeller = labeller(
      source = c(
        "c1" = "VisualEditor (c1)",
        "c2" = "HTTPS-as-default (c2)",
        "c3" = "HTTP-deprecation (c3)"
      )
    )
  ) +
  geom_point(shape = 21, alpha = 0.13, size = 2) +
  scale_fill_viridis_d(
    option = "inferno",
    name = "By Task Author Before Resolution",
    labels = c("No", "Yes")
  ) +
  theme_minimal() +
  theme(legend.position = "top") +
  labs(
    # NOTE(review): the title mentions author affiliation and comment type,
    # but this plot shows replies only, split by ADAC and case; confirm the
    # wording.
    title = "PCs for Replies (by Author Affiliation, Case, and Comment Type)",
    x = "Casual v. Formal Updates (PC3)",
    y = "Technical-matter v. Procedural Commentary (PC4)"
  )

# Add a per-comment word count: the number of runs of non-whitespace
# characters in comment_text, with missing comments treated as empty text.
main_df <- main_df |>
  mutate(
    comment_wordcount = as.character(comment_text) |>
      tidyr::replace_na("") |>
      stringr::str_count("\\S+") |>
      as.integer()
  )