156 lines
4.3 KiB
R
156 lines
4.3 KiB
R
library(tidyverse)
|
|
library(dplyr)
|
|
|
|
# Location of the unified comment dataset (CSV with a header row).
main_csv <- "~/analysis_data/121625_unified.csv"

# Load the dataset into a base data.frame.
main_df <- read.csv(file = main_csv, header = TRUE)

# Sanity check: number of distinct values in the id column.
main_df$id |>
  unique() |>
  length()
|
|
|
|
#' Normalise one comment by masking code, quotes, URLs and mentions
#'
#' Replacements are applied in this order:
#'   1. fenced code blocks (```...```), then inline code (`...`) -> "CODE"
#'   2. lines whose first non-blank character is ">"             -> "QUOTE"
#'   3. Gerrit change URLs (https://gerrit.wikimedia.org/r/<n>)  -> "GERRIT_URL"
#'   4. any remaining http(s) URL                                -> "URL"
#'   5. @name at start of string or after whitespace             -> "SCREEN_NAME"
#'
#' @param message A single comment (length-1 character vector).
#' @return A length-1 character vector; `NA_character_` when `message` is NA.
preprocess_comment <- function(message) {
  text <- as.character(message)

  # A missing comment stays missing. Without this guard the split/paste
  # round-trip in step 2 turns NA into the literal string "NA".
  if (length(text) == 1L && is.na(text)) {
    return(NA_character_)
  }

  # 1. Code -> CODE. Fenced blocks must be handled before inline code:
  #    the inline pattern otherwise matches the innermost pair of backticks
  #    of a fence and leaves "``CODE``" behind.
  text <- stringr::str_replace_all(text, "```[\\s\\S]+?```", "CODE")
  text <- stringr::str_replace_all(text, "`[^`]+`", "CODE")

  # 2. Quoted lines -> QUOTE (a line is a quote when it starts with ">"
  #    once surrounding whitespace is trimmed).
  lines <- unlist(strsplit(text, "\n"))
  is_quote <- stringr::str_detect(stringr::str_trim(lines), "^>")
  lines <- ifelse(is_quote, "QUOTE", lines)
  text <- paste(lines, collapse = "\n")

  # 3. Gerrit change URLs -> GERRIT_URL. Must run before the generic URL
  #    rule, which would otherwise swallow them.
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  text <- stringr::str_replace_all(text, gerrit_url_pattern, "GERRIT_URL")

  # 4. Any remaining URL -> URL.
  url_pattern <- "https?://[^\\s]+"
  text <- stringr::str_replace_all(text, url_pattern, "URL")

  # 5. @screenname -> SCREEN_NAME. The captured prefix (start of string or
  #    one whitespace character) is written back via \\1 so that the word
  #    before the mention is not glued onto the placeholder.
  stringr::str_replace_all(text, "(^|\\s)@\\w+", "\\1SCREEN_NAME")
}
|
|
# Clean every comment. vapply() guarantees a character vector of the same
# length as the input (sapply()'s return type depends on its input), and
# USE.NAMES = FALSE stops each element from being named with the full raw
# comment text, which would otherwise be carried into the column and
# printed alongside every value.
main_df$cleaned_comment <- vapply(
  main_df$comment_text,
  preprocess_comment,
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)
|
|
|
|
# Inspect a window of cleaned comments ranked by one principal component.
# The component and the rank window are set once here and reused for both
# the ranking and the printed headers, so the output always describes what
# was actually selected.
inspect_pc <- "PC3"
inspect_rows <- 250:260

# Comments at the chosen ranks when ordered from the highest score down.
# (Object names are kept for compatibility; each holds length(inspect_rows)
# comments, not five.)
top5 <- main_df %>%
  arrange(desc(.data[[inspect_pc]])) %>%
  slice(!!inspect_rows) %>%
  pull(cleaned_comment)

# Comments at the chosen ranks when ordered from the lowest score up.
bottom5 <- main_df %>%
  arrange(.data[[inspect_pc]]) %>%
  slice(!!inspect_rows) %>%
  pull(cleaned_comment)

cat(sprintf(
  "cleaned_comment at ranks %d:%d by descending %s score:\n",
  min(inspect_rows), max(inspect_rows), inspect_pc
))
print(top5)

cat(sprintf(
  "\ncleaned_comment at ranks %d:%d by ascending %s score:\n",
  min(inspect_rows), max(inspect_rows), inspect_pc
))
print(bottom5)
|
|
|
|
|
|
# Scatter of PC1 (x) against PC4 (y) for all comments, filled by comment
# type, with one facet column per value of `source`.
comments_style <- ggplot(
  data = main_df,
  mapping = aes(x = PC1, y = PC4, fill = comment_type)
) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(
    cols = vars(source),
    scales = "fixed",
    labeller = as_labeller(c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  # Both axes are limited to [-50, 50]; observations outside the limits
  # are removed by ggplot2.
  lims(x = c(-50, 50), y = c(-50, 50)) +
  scale_fill_viridis_d(
    option = "magma",
    name = "Comment type",
    labels = c("Task Description", "Reply")
  ) +
  theme_minimal() +
  theme(legend.position = "top") +
  labs(
    x = "Lengthy Discussion v. Brief Updates (PC1)",
    y = "Technical Jargon v. Non-technical Observations (PC4)"
  )

# Write the figure to disk: 12 x 8 inches at 800 dpi.
ggsave(
  filename = "121625_comments_style.png",
  plot = comments_style,
  width = 12,
  height = 8,
  units = "in",
  dpi = 800
)
|
|
|
|
# Scatter of PC3 (x) against PC4 (y) for rows with ADAC == 1, filled by
# isAuthorWMF, with one facet column per value of `source`.
adac_style <- main_df |>
  filter(ADAC == 1) |>
  ggplot(
    aes(
      x = PC3,
      y = PC4,
      fill = as.factor(isAuthorWMF)
    )
  ) +
  facet_grid(
    ~source,
    # Only `source` is faceted, so the lookup needs just the three cases.
    labeller = as_labeller(c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  # Both axes are limited to [-50, 50]; observations outside the limits
  # are removed by ggplot2.
  xlim(-50, 50) +
  ylim(-50, 50) +
  # Explicit legend title; without it the legend is headed by the raw
  # expression "as.factor(isAuthorWMF)".
  # NOTE(review): wording taken from the draft title below - confirm.
  scale_fill_viridis_d(name = "Author affiliation (isAuthorWMF)") +
  theme_minimal() +
  theme(legend.position = "top") +
  labs(
    x = "Expressive, first-person v. Dry, third-person (PC3)",
    y = "Technical Jargon v. Non-technical Observations (PC4)"
  )

# Draft title, currently unused:
# "PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)"

# Write the figure to disk: 12 x 8 inches at 800 dpi.
ggsave(
  filename = "121625_adac_affil_style.png",
  plot = adac_style,
  width = 12,  # inches
  height = 8,  # inches
  dpi = 800    # high resolution
)
|
|
|
|
|
|
|
|
# Scatter of PC3 (x) against PC4 (y) for reply comments only
# (comment_type == "task_subcomment"), filled by ADAC and faceted with
# ADAC as rows and `source` as columns. The plot is not assigned, so it is
# printed when the script is run at top level.
main_df |>
  filter(comment_type == "task_subcomment") |>
  ggplot(
    aes(
      # PC3 on x and PC4 on y, so the mapping agrees with the axis titles
      # in labs() below and with the adac_style plot.
      x = PC3,
      y = PC4,
      fill = as.factor(ADAC)
    )
  ) +
  facet_grid(
    ADAC ~ source,
    # Per-variable labellers: the case lookup applies to `source` only.
    # A single as_labeller() lookup would also be applied to the ADAC
    # rows, whose values are not in the table and would render as NA.
    labeller = labeller(
      source = c(
        "c1" = "VisualEditor (c1)",
        "c2" = "HTTPS-as-default (c2)",
        "c3" = "HTTP-deprecation (c3)"
      ),
      ADAC = label_both
    )
  ) +
  geom_point(shape = 21, alpha = 0.13, size = 2) +
  scale_fill_viridis_d(
    option = "inferno",
    name = "By Task Author Before Resolution",
    labels = c("No", "Yes")
  ) +
  theme_minimal() +
  theme(legend.position = "top") +
  labs(
    # NOTE(review): the title mentions author affiliation and comment type,
    # but this plot shows replies only, split by ADAC and case - confirm
    # the intended wording.
    title = "PCs for Replies (by Author Affiliation, Case, and Comment Type)",
    x = "Casual v. Formal Updates (PC3)",
    y = "Technical-matter v. Procedural Commentary (PC4)"
  )
|
|
|
|
# Add comment_wordcount: the number of whitespace-delimited tokens (runs of
# non-whitespace characters) in comment_text. Missing comments are treated
# as empty strings and therefore count as 0.
main_df <- mutate(
  main_df,
  comment_wordcount = comment_text |>
    as.character() |>
    tidyr::replace_na("") |>
    stringr::str_count("\\S+") |>
    as.integer()
)
|
|
|