1
0
mw-lifecycle-analysis/p2/quest/neurobiber_PCA_analysis.R
2025-10-01 20:58:55 -07:00

110 lines
3.9 KiB
R

library(tidyverse)
# Locations of the two PCA exports: one for task descriptions and one for
# subcomments.
neurobiber_description_pca_csv <- "~/p2/quest/100125_description_PCA_df.csv"
neurobiber_subcomment_pca_csv <- "~/p2/quest/100125_subcomment_PCA_df.csv"

# Read both exports; the first row of each file supplies the column names.
neurobiber_description_pca_df <- read.csv(
  file = neurobiber_description_pca_csv,
  header = TRUE
)
neurobiber_subcomment_pca_df <- read.csv(
  file = neurobiber_subcomment_pca_csv,
  header = TRUE
)
# Replace noisy spans of a single comment with placeholder tokens.
#
# Substitutions, in order:
#   1. fenced code (```...```) and inline code (`...`) -> CODE
#   2. any line whose first non-blank character is ">" -> QUOTE
#   3. https://gerrit.wikimedia.org/r/<digits>         -> GERRIT_URL
#      any other http(s) URL                           -> URL
#   4. @name at the start of the text or after whitespace -> SCREEN_NAME
#
# Args:
#   message: the comment text, normally a single string. A longer vector is
#     collapsed into one newline-joined string by step 2.
#
# Returns:
#   A length-one character vector with the substitutions applied, or
#   NA_character_ when `message` is a single missing value.
preprocess_comment <- function(message) {
  # Keep missing text missing; paste() below would otherwise turn NA into the
  # literal string "NA".
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }

  comment_text <- message

  # 1. Replace code with CODE.
  # Fenced blocks go first: the inline pattern would otherwise consume the
  # innermost backtick pair of a fence and leave "``CODE``" behind.
  comment_text <- stringr::str_replace_all(
    comment_text, "```[\\s\\S]+?```", "CODE"
  )
  comment_text <- stringr::str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. Replace quoted lines with QUOTE.
  text_lines <- unlist(strsplit(comment_text, "\n"))
  is_quote <- stringr::str_detect(stringr::str_trim(text_lines), "^>")
  text_lines[is_quote] <- "QUOTE"
  comment_text <- paste(text_lines, collapse = "\n")

  # 3. Replace Gerrit change URLs first so the generic URL pattern does not
  # claim them.
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- stringr::str_replace_all(
    comment_text, gerrit_url_pattern, "GERRIT_URL"
  )
  url_pattern <- "https?://[^\\s]+"
  comment_text <- stringr::str_replace_all(comment_text, url_pattern, "URL")

  # 4. Replace @screenname with SCREEN_NAME. "\\1" puts back the whitespace
  # (or nothing, at the start of the text) captured before the "@", so the
  # token does not fuse with the preceding word.
  stringr::str_replace_all(
    comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME"
  )
}
# Add comment_type column to each df
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"

# Clean the messages.
# vapply() guarantees one string per row. sapply() over a character column
# would also name every result by its full raw input text, and those names
# would then show up whenever cleaned_comment is printed.
neurobiber_description_pca_df$cleaned_comment <- vapply(
  neurobiber_description_pca_df$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)
neurobiber_subcomment_pca_df$cleaned_comment <- vapply(
  neurobiber_subcomment_pca_df$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)

# Flag subcomments whose (AuthorPHID, TaskPHID) pair also occurs in the
# task-description table.
neurobiber_subcomment_pca_df <- neurobiber_subcomment_pca_df %>%
  mutate(
    pair_in_description = paste(AuthorPHID, TaskPHID) %in%
      paste(
        neurobiber_description_pca_df$AuthorPHID,
        neurobiber_description_pca_df$TaskPHID
      )
  )
# One-way ANOVA of PC2 on phase, fitted separately for every value of
# `source`. Each result is a list with one summary per source group.
library(dplyr)

# Task descriptions
description_anova_results <- group_map(
  group_by(neurobiber_description_pca_df, source),
  function(source_rows, source_key) {
    summary(aov(PC2 ~ phase, data = source_rows))
  },
  .keep = TRUE
)
description_anova_results

# Subcomments
discussion_anova_results <- group_map(
  group_by(neurobiber_subcomment_pca_df, source),
  function(source_rows, source_key) {
    summary(aov(PC2 ~ phase, data = source_rows))
  },
  .keep = TRUE
)
discussion_anova_results
# Look at representative subcomments along PC2.
# Despite their names, top5 and bottom5 each hold the comments ranked 300 to
# 310 (11 rows), counted from the high end and the low end of PC2 respectively.
top5 <- neurobiber_subcomment_pca_df %>%
  arrange(desc(PC2)) %>%
  slice(300:310) %>%
  pull(cleaned_comment)
bottom5 <- neurobiber_subcomment_pca_df %>%
  arrange(PC2) %>%
  slice(300:310) %>%
  pull(cleaned_comment)

cat("Top 300:310 comment_text by PC2 score:\n")
print(top5)
# Both listings are ordered by PC2, so this label says PC2 as well.
cat("\nBottom 300:310 comment_text by PC2 score:\n")
print(bottom5)
library(scales)
library(ggplot2)
# Two-colour palette keyed by the strings "False" and "True".
# NOTE(review): presumably intended for an AuthorWMFAffil fill scale; it is
# not used in the code visible here, so confirm before removing.
affiliationColors <- c(False = "#5da2d8", True = "#c7756a")
# Attach each task's priority (taken from the description table) to its
# subcomments, then keep only subcomments on "Lowest" or "Unbreak Now!" tasks.
# priority.y is the description-side column. The .y suffix only appears when
# the subcomment table has a priority column of its own.
# NOTE(review): confirm that it does.
neurobiber_subcomment_pca_df_x <- neurobiber_subcomment_pca_df |>
  left_join(
    select(neurobiber_description_pca_df, TaskPHID, priority),
    by = "TaskPHID"
  ) |>
  filter(priority.y %in% c("Lowest", "Unbreak Now!"))

# Restrict the description table to the same two priority levels. This
# overwrites the table, so it must come after the join above.
neurobiber_description_pca_df <- filter(
  neurobiber_description_pca_df,
  priority %in% c("Lowest", "Unbreak Now!")
)
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
# Scatter of PC3 against PC8 for task descriptions, one panel per
# source x phase combination, with points filled by task priority.
ggplot(neurobiber_description_pca_df, aes(x = PC3, y = PC8, fill = priority)) +
  geom_point(shape = 21, alpha = 0.4, size = 2) +
  facet_grid(source ~ phase, scales = "fixed") +
  # xlim()/ylim() drop observations outside [-10, 10] instead of zooming.
  xlim(-10, 10) +
  ylim(-10, 10) +
  scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
  theme_minimal() +
  labs(
    title = "PCs for Task Descriptions (Faceted by Source and Phase)",
    x = "PC3",
    y = "PC8",
    # The fill aesthetic is mapped to priority, so the legend is titled to match.
    fill = "Task priority"
  )