library(tidyverse)
library(dplyr)

# Load data ----

main_csv <- "~/analysis_data/121625_unified.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Number of distinct ids in the data set.
length(unique(main_df$id))

# Comment cleaning ----

#' Replace code, quotes, URLs and @-mentions in one comment with placeholders
#'
#' The substitutions are applied in this order:
#'   1. fenced code blocks and inline code spans -> "CODE"
#'   2. lines whose first non-blank character is ">" -> "QUOTE"
#'   3. Gerrit change URLs -> "GERRIT_URL", then any other URL -> "URL"
#'   4. "@name" at the start of the text or after whitespace -> "SCREEN_NAME"
#'
#' @param message A single comment string (length-one character vector).
#' @return The cleaned comment as a length-one character vector;
#'   `NA_character_` when `message` is `NA`.
preprocess_comment <- function(message) {
  # Without this guard an NA comment would come back as the string "NA",
  # because paste(..., collapse = ) stringifies missing values.
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }

  comment_text <- message

  # 1. Code -> CODE. Fenced blocks must be handled before inline spans:
  #    the inline pattern would otherwise match inside the ``` fences and
  #    leave stray backticks that the block pattern can no longer match.
  comment_text <- stringr::str_replace_all(
    comment_text, "```[\\s\\S]+?```", "CODE"
  )
  comment_text <- stringr::str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. Quoted lines -> QUOTE (a line is quoted when it starts with ">"
  #    once surrounding whitespace is trimmed).
  lines <- unlist(strsplit(comment_text, "\n"))
  is_quote <- stringr::str_detect(stringr::str_trim(lines), "^>")
  lines[is_quote] <- "QUOTE"
  comment_text <- paste(lines, collapse = "\n")

  # 3. URLs. Gerrit change links get their own placeholder first so the
  #    generic URL pattern does not swallow them.
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- stringr::str_replace_all(
    comment_text, gerrit_url_pattern, "GERRIT_URL"
  )
  url_pattern <- "https?://[^\\s]+"
  comment_text <- stringr::str_replace_all(comment_text, url_pattern, "URL")

  # 4. @screenname -> SCREEN_NAME. "\\1" puts back the whitespace (or the
  #    empty start-of-string match) captured in front of the "@", so the
  #    placeholder is not glued onto the preceding word.
  stringr::str_replace_all(
    comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME"
  )
}

# vapply() guarantees a plain, unnamed character vector (sapply() would
# name every element with the full original comment text).
main_df$cleaned_comment <- vapply(
  as.character(main_df$comment_text),
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)

# Inspect representative comments along PC3 ----

# Rank positions to inspect at each end of the PC3 ordering.
# NOTE: `top5` / `bottom5` hold one comment per inspected rank (11 here),
# not five; the names are kept for any later code that refers to them.
inspect_rows <- 250:260

top5 <- main_df |>
  arrange(desc(PC3)) |>
  slice(inspect_rows) |>
  pull(cleaned_comment)

bottom5 <- main_df |>
  arrange(PC3) |>
  slice(inspect_rows) |>
  pull(cleaned_comment)

cat(sprintf(
  "Top %d:%d cleaned_comment by PC3 score:\n",
  min(inspect_rows), max(inspect_rows)
))
print(top5)
cat(sprintf(
  "\nBottom %d:%d cleaned_comment by PC3 score:\n",
  min(inspect_rows), max(inspect_rows)
))
print(bottom5)

# Plot: PC1 vs PC4 for all comments, by comment type and case ----

comments_style <- main_df |>
  ggplot(
    aes(
      x = PC1,
      y = PC4,
      fill = comment_type
    )
  ) +
  facet_grid(
    ~source,
    scales = "fixed",
    labeller = as_labeller(c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  # Points outside [-50, 50] on either axis are dropped by these limits.
  xlim(-50, 50) +
  ylim(-50, 50) +
  scale_fill_viridis_d(
    option = "magma",
    name = "Comment type",
    labels = c("Task Description", "Reply")
  ) +
  theme_minimal() +
  theme(legend.position = "top") +
  labs(
    x = "Lengthy \nDiscussion v. Brief Updates (PC1)",
    y = "Technical Jargon v. Non-technical Observations (PC4)"
  )

ggsave(
  filename = "121625_comments_style.png",
  plot = comments_style,
  width = 12,   # inches
  height = 8,   # inches
  dpi = 800     # high resolution
)

# Plot: PC3 vs PC4 for ADAC == 1 comments, by author affiliation ----

adac_style <- main_df |>
  filter(ADAC == 1) |>
  ggplot(
    aes(
      x = PC3,
      y = PC4,
      fill = as.factor(isAuthorWMF)
    )
  ) +
  facet_grid(
    ~source,
    labeller = as_labeller(c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation",
      "task_description" = "Task Description",
      "task_subcomment" = "Follow-up Reply"
    ))
  ) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  xlim(-50, 50) +
  ylim(-50, 50) +
  scale_fill_viridis_d() +
  theme_minimal() +
  theme(legend.position = "top") +
  labs(
    x = "Expressive, first-person v. Dry, third-person (PC3)",
    y = "Technical Jargon v. Non-technical Observations (PC4)"
  )
#"PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)"

ggsave(
  filename = "121625_adac_affil_style.png",
  plot = adac_style,
  width = 12,   # inches
  height = 8,   # inches
  dpi = 800     # high resolution
)

# Plot: PC4 vs PC3 for replies, faceted by ADAC (rows) and case (columns) ----

main_df |>
  filter(comment_type == "task_subcomment") |>
  ggplot(
    aes(
      x = PC4,
      y = PC3,
      fill = as.factor(ADAC)
    )
  ) +
  facet_grid(
    ADAC ~ source,
    # Relabel only the `source` strips. A bare as_labeller() lookup would
    # also be applied to the ADAC row strips, whose values are not in the
    # lookup and would therefore be shown as NA.
    labeller = labeller(source = c(
      "c1" = "VisualEditor (c1)",
      "c2" = "HTTPS-as-default (c2)",
      "c3" = "HTTP-deprecation (c3)"
    ))
  ) +
  geom_point(shape = 21, alpha = 0.13, size = 2) +
  scale_fill_viridis_d(
    option = "inferno",
    name = "By Task Author Before Resolution",
    labels = c("No", "Yes")
  ) +
  theme_minimal() +
  theme(legend.position = "top") +
  labs(
    title = "PCs for Replies (by Author Affiliation, Case, and Comment Type)",
    # Axis titles follow the aesthetic mapping above: PC4 on x, PC3 on y.
    x = "Technical-matter v. Procedural Commentary (PC4)",
    y = "Casual v. Formal Updates (PC3)"
  )

# Word count per comment ----

# Counts runs of non-whitespace characters; NA comments count as 0 words.
main_df <- main_df |>
  mutate(
    comment_wordcount = as.integer(
      stringr::str_count(
        tidyr::replace_na(as.character(comment_text), ""),
        "\\S+"
      )
    )
  )