242 lines
7.6 KiB
R
242 lines
7.6 KiB
R
library(tidyverse)
|
|
library(dplyr)
|
|
#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
|
|
#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
|
|
|
#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
|
|
#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
|
|
|
#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
|
|
#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
|
|
|
|
# Load the unified comment table and add a per-comment word count.
main_csv <- "~/analysis_data/102725_unified.csv"

# comment_wordcount = number of whitespace-delimited tokens in comment_text;
# missing comments are treated as empty strings and therefore count as 0.
main_df <- read.csv(file = main_csv, header = TRUE) %>%
  mutate(
    comment_wordcount = comment_text %>%
      as.character() %>%
      replace_na("") %>%
      str_count("\\S+") %>%
      as.integer()
  )
|
|
|
|
|
|
|
|
|
|
# Task descriptions: one subset of the unified table.
description_df <- filter(main_df, comment_type == "task_description")

# Replies: non-GerritBot subcomments, each paired with the description row(s)
# of the same task. Both sides come from main_df and share every column except
# the join key, so after the join reply columns carry the `.x` suffix and
# description columns carry `.y`.
# Note: rows whose isGerritBot is NA are dropped by the `!=` comparison.
replies_df <- main_df %>%
  filter(
    comment_type == "task_subcomment",
    isGerritBot != TRUE
  ) %>%
  left_join(description_df, by = "TaskPHID")
|
|
|
|
|
|
# Scatter of PC4 against PC3 for reply comments, faceted by source and phase.
# replies_df is the result of joining replies to descriptions on TaskPHID, so
# every non-key column is suffixed; `.x` selects the reply's own values.
# NOTE(review): assumes main_df carries `phase` and `PC3`/`PC4` columns (the
# original referenced them unsuffixed) -- confirm against the CSV header.
ggplot(replies_df, aes(x = PC4.x, y = PC3.x, fill = comment_type.x)) +
  facet_grid(source.x ~ phase.x, scales = "fixed") +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  # Points outside these limits are removed (with a warning), not clipped.
  xlim(-30, 30) +
  ylim(-30, 30) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "PCs for Task Comments (Faceted by source and phase)",
    x = "PC4",
    y = "PC3",
    fill = "comment_type"
  )
|
|
|
|
|
|
# Boxplot of the replies' PC1 score, grouped by the description's
# author_closer value and filled by resolution outcome, one panel per source.
# NOTE(review): the original fill aesthetic was the truncated name `reso`;
# `resolution_outcome.x` (the reply row's own value) is assumed -- switch to
# `.y` if the description-side value was intended.
replies_df |>
  ggplot(aes(
    x = as.factor(author_closer.y), # x-axis grouping
    y = PC1.x,
    fill = resolution_outcome.x
  )) +
  # ylim() drops observations outside the range before the boxplot statistics
  # are computed.
  ylim(-30, 30) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source.x, scales = "fixed") +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of PC1",
    x = "author_closer",
    y = "PC1",
    fill = "resolution_outcome"
  )
|
|
|
|
# Boxplot of PC4 for task descriptions, grouped by author_closer and filled by
# resolution outcome, one panel per source. Axis and legend labels name the
# variables that are actually mapped.
description_df |>
  ggplot(aes(
    x = as.factor(author_closer), # x-axis grouping
    y = PC4,
    fill = resolution_outcome
  )) +
  facet_grid(~source, scales = "fixed") +
  # ylim() drops observations outside the range before the boxplot statistics
  # are computed.
  ylim(-40, 40) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of PC4",
    x = "author_closer",
    y = "PC4",
    fill = "resolution_outcome"
  )
|
|
|
|
|
|
# Trim main_df to the identifier and metadata columns used by the joins below.
# Every other column (including the PC scores used in the plots above) is
# dropped from main_df from this point on.
main_df <- select(
  main_df,
  TaskPHID,
  AuthorPHID,
  date_created,
  comment_text,
  isAuthorWMF,
  isGerritBot,
  resolution_outcome,
  task_title,
  priority
)
|
|
# Attach main_df metadata to each PCA table. right_join() keeps every PCA row,
# matched on task, author, creation date and comment text.
# NOTE(review): the three PCA data frames used here are only created by the
# commented-out read.csv() calls near the top of this script; they must already
# exist in the session for this section to run.

# Join main_df to neurobiber_description_pca_df
description_joined <- right_join(
  main_df,
  neurobiber_description_pca_df,
  by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")
) %>%
  filter(comment_text != "nan") #TODO: look at this more in depth

# Join main_df to neurobiber_subcomment_pca_df
subcomment_joined <- right_join(
  main_df,
  neurobiber_subcomment_pca_df,
  by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")
) %>%
  filter(comment_text != "nan") #TODO: look at this more in depth

# Join main_df to the combined PCA table
total_joined <- right_join(
  main_df,
  pca_df,
  by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")
) %>%
  filter(comment_text != "nan") #TODO: look at this more in depth
|
|
|
|
|
|
#' Mask code, quotes, URLs and mentions in a single comment
#'
#' Replacement order matters: fenced code blocks are masked before inline code
#' so the inline pattern cannot consume the inner backticks of a fence, and
#' Gerrit change URLs are masked before generic URLs so they keep their own
#' placeholder.
#'
#' @param message A length-one character string holding one comment.
#' @return A length-one character string with code replaced by `CODE`, quoted
#'   lines (lines starting with `>`) by `QUOTE`, Gerrit change URLs by
#'   `GERRIT_URL`, other URLs by `URL`, and `@name` mentions by `SCREEN_NAME`.
#'   A missing input yields `NA_character_`.
preprocess_comment <- function(message) {
  # strsplit() + paste() below would turn a missing comment into the literal
  # string "NA", so pass missing values straight through.
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }

  comment_text <- message

  # 1. replace code with CODE
  # Block code: ```...``` (may span lines; must run before the inline pattern)
  comment_text <- stringr::str_replace_all(
    comment_text, "```[\\s\\S]+?```", "CODE"
  )
  # Inline code: `...`
  comment_text <- stringr::str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. replace quotes with QUOTE (whole line, when it starts with ">")
  lines <- unlist(strsplit(comment_text, "\n"))
  is_quote <- stringr::str_detect(stringr::str_trim(lines), "^>")
  lines <- ifelse(is_quote, "QUOTE", lines)
  comment_text <- paste(lines, collapse = "\n")

  # 3. replace Gerrit URLs with GERRIT_URL
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- stringr::str_replace_all(
    comment_text, gerrit_url_pattern, "GERRIT_URL"
  )

  # replace any remaining URL with URL
  url_pattern <- "https?://[^\\s]+"
  comment_text <- stringr::str_replace_all(comment_text, url_pattern, "URL")

  # 4. replace @screenname with SCREEN_NAME; "\\1" restores the whitespace (or
  # start of string) captured in front of the mention.
  stringr::str_replace_all(comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME")
}
|
|
|
|
# Add comment_type column to each df
# NOTE(review): main_df labels replies "task_subcomment", whereas this uses
# "subcomment" -- confirm the two are not meant to be compared directly.
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"

# Clean the messages. vapply() guarantees one string per comment, and
# USE.NAMES = FALSE avoids storing every raw comment a second time as the
# element name of its cleaned version.
neurobiber_description_pca_df$cleaned_comment <- vapply(
  neurobiber_description_pca_df$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)
neurobiber_subcomment_pca_df$cleaned_comment <- vapply(
  neurobiber_subcomment_pca_df$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)
total_joined$cleaned_comment <- vapply(
  total_joined$text,
  preprocess_comment,
  character(1),
  USE.NAMES = FALSE
)
|
|
|
|
# Flag each subcomment whose (author, task) pair also appears among the task
# descriptions. Pairs are compared as "AuthorPHID TaskPHID" strings.
subcomment_joined <- mutate(
  subcomment_joined,
  pair_in_description = is.element(
    paste(AuthorPHID, TaskPHID),
    paste(
      neurobiber_description_pca_df$AuthorPHID,
      neurobiber_description_pca_df$TaskPHID
    )
  )
)
|
|
|
|
# One-way ANOVA of PC2 on phase, fitted separately within each source.
# .keep = TRUE leaves the grouping column (source) in the per-group data.
description_anova_results <- group_map(
  group_by(neurobiber_description_pca_df, source),
  \(group_rows, group_key) summary(aov(PC2 ~ phase, data = group_rows)),
  .keep = TRUE
)
description_anova_results

# Same model for the subcomment (discussion) table.
discussion_anova_results <- group_map(
  group_by(neurobiber_subcomment_pca_df, source),
  \(group_rows, group_key) summary(aov(PC2 ~ phase, data = group_rows)),
  .keep = TRUE
)
discussion_anova_results
|
|
|
|
# Look at representative comments for PC4: the cleaned text of the rows ranked
# 300-310 from the top and from the bottom of the PC4 ordering. The printed
# headers name the component that is actually used for ranking.
top5 <- total_joined %>%
  arrange(desc(PC4)) %>%
  slice(300:310) %>%
  pull(cleaned_comment)

bottom5 <- total_joined %>%
  arrange(PC4) %>%
  slice(300:310) %>%
  pull(cleaned_comment)

cat("Top 300:310 comment_text by PC4 score:\n")
print(top5)

cat("\nBottom 300:310 comment_text by PC4 score:\n")
print(bottom5)
|
|
|
|
|
|
library(scales)
|
|
library(ggplot2)
|
|
|
|
|
|
# Fill colours keyed by the affiliation flag values "False" / "True".
affiliationColors <- c(False = "#5da2d8", True = "#c7756a")
|
|
|
|
# Subcomments without GerritBot rows, with the task priority looked up from
# the description table by TaskPHID.
# Note: rows whose isGerritBot is NA are dropped by the `!=` comparison.
subcomment_joined_no_gerrit <- left_join(
  filter(subcomment_joined, isGerritBot != "TRUE"),
  select(neurobiber_description_pca_df, TaskPHID, priority),
  by = "TaskPHID"
)
|
|
|
|
|
|
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
|
|
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
|
|
# geom_point(shape = 21, alpha=0.4, size=2) +
|
|
# geom_bin_2d() +
|
|
|
|
# 1. Draw up to 100 distinct subcomment authors at random.
# slice_sample() replaces the superseded sample_n() and returns all authors,
# instead of erroring, when fewer than 100 are available.
# Note: no seed is set here, so the sample differs between runs.
sampled_authors <- subcomment_joined_no_gerrit %>%
  distinct(AuthorPHID) %>%
  slice_sample(n = 100) %>%
  pull(AuthorPHID)

# 2. Filter original data to just those authors
sub_sample <- subcomment_joined_no_gerrit %>%
  filter(AuthorPHID %in% sampled_authors)
|
|
|
|
# 1. Draw up to 8 distinct description authors at random.
# slice_sample() replaces the superseded sample_n() and returns all authors,
# instead of erroring, when fewer than 8 are available.
# Note: no seed is set here, so the sample differs between runs.
description_sampled_authors <- description_joined %>%
  distinct(AuthorPHID) %>%
  slice_sample(n = 8) %>%
  pull(AuthorPHID)

# 2. Filter original data to just those authors
description_sub_sample <- description_joined %>%
  filter(AuthorPHID %in% description_sampled_authors)
|
|
|
|
# PC4 vs PC3 for every comment in total_joined, coloured by comment type,
# with one panel per source (rows) and phase (columns).
total_joined %>%
  ggplot(mapping = aes(x = PC4, y = PC3, fill = comment_type)) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(rows = source ~ phase, scales = "fixed") +
  # Points outside these limits are removed (with a warning), not clipped.
  lims(x = c(-30, 30), y = c(-30, 30)) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "PCs for Task Comments (Faceted by source and phase)",
    x = "PC4",
    y = "PC3"
  )
|
|
|
|
# Display order for the priority factor levels.
priority_order <- c(
  "Unbreak Now!",
  "High",
  "Medium",
  "Low",
  "Lowest",
  "Needs Triage"
)

# Recode priority as a factor with the level order above. Values not listed in
# priority_order become NA.
subcomment_joined_no_gerrit <- mutate(
  subcomment_joined_no_gerrit,
  priority = factor(priority, levels = priority_order)
)

# description_joined takes its priority from the `.y` (PCA-table) column.
description_joined <- mutate(
  description_joined,
  priority = factor(priority.y, levels = priority_order)
)
|
|
|
|
# Boxplot of PC3 by comment type, filled by author affiliation, one panel per
# source. Title and y label name the component that is actually plotted.
# NOTE(review): the original labels said PC4 while mapping PC3; if PC4 was the
# intended variable, change the `y` aesthetic rather than the labels.
ggplot(total_joined, aes(
  x = as.factor(comment_type), # x-axis grouping
  y = PC3,
  fill = isAuthorWMF
)) +
  # ylim() drops observations outside the range before the boxplot statistics
  # are computed.
  ylim(-30, 30) +
  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(
    title = "Boxplot of PC3",
    x = "Comment_type",
    y = "PC3",
    fill = "isAuthorWMF?"
  )
|