
updating new analysis with re-labeled data; Gerrit is out and BzImport is its own thing
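A minimal sketch of the relabeling this implies, assuming the isAuthorWMF column now carries a third "BzImport" level (the filter in dsl/rq2_plot.R below suggests it does); the isBzImport flag here is hypothetical:

# hypothetical recode: keep the Bugzilla import bot out of the TRUE/FALSE affiliation split
df$isAuthorWMF <- ifelse(df$isBzImport, "BzImport", as.character(df$isAuthorWMF))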

Matthew Gaughan 2025-12-16 17:55:51 -08:00
parent df1dcf1224
commit 1584e2cd5f
21 changed files with 381134 additions and 322 deletions


@@ -82,19 +82,20 @@ main_df <- main_df |>
)
#getting PC values (need to do after revised pass)
pca_csv <- "~/analysis_data/121625_constituent_dfs/121525_total_pca_df.csv"
pca_csv <- "~/analysis_data/121625_constituent_dfs/121625_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE)
length(unique(pca_df$id))
pca_df <- pca_df |>
select(starts_with("PC"),
id)
#first_join <- main_df|>
# left_join(
# pca_df,
# by = "id"
# )
first_join <- main_df|>
left_join(
pca_df,
by = "id"
)
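# sanity check: the left join on id should be one-to-one and not duplicate comments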
length(unique(first_join$id))
olmo_csv <- "~/analysis_data/121625_constituent_dfs/all_120525_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)
@@ -103,7 +104,7 @@ olmo_df <- olmo_df |>
olmo_sentence_labels = sentence_categories)|>
select(id, olmo_cleaned_sentences, olmo_sentence_labels)
second_join <- main_df|>
second_join <- first_join |>
left_join(
olmo_df,
by = "id"
@@ -163,4 +164,4 @@ unified_df <- unified_df |>
gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+")
)
write.csv(unified_df, "forPCA_121625_unified.csv", row.names = FALSE)
write.csv(unified_df, "121625_unified.csv", row.names = FALSE)

File diff suppressed because one or more lines are too long

Binary file not shown (new image, 560 KiB).

Binary file not shown (new image, 5.1 MiB).

Binary file not shown (new image, 11 MiB).

Binary file not shown (new image, 484 KiB).

dsl/121625_DSL_frame.csv (new file, 3130 lines)

File diff suppressed because it is too large.

dsl/121625_final_dsl.R (new file, 100 lines)

@@ -0,0 +1,100 @@
library(tidyverse)
library(dsl)
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
dplyr::mutate(ttr_days = TTR_hours / 24) |>
dplyr::mutate(task_resolution = dsl_score)
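# DSL setup below: human_*_prop_adac are the hand-labeled (gold) sentence
# proportions, olmo_*_prop_adac are the machine predictions standing in for
# them on unlabeled tasks, and sampling_prob records which tasks were
# selected for hand-labeling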
dev_model <- dsl(
model = "logit",
formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
+ median_PC4_adac + median_PC3_adac + n_comments_before
+ median_gerrit_reviewers + week_index + as.factor(isAuthorWMF) * as.factor(source),
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
sample_prob = "sampling_prob",
cluster="source",
cross_fit = 3,
sample_split = 20,
data=dsl_df
)
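# summary() reports SEs that account for using the olmo_* predictions in
# place of missing human labels (the point of the dsl estimator)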
summary(dev_model)
#saveRDS(dev_model, "121625_logit_dsl.RDS")
dev_model <- readRDS("dsl/121625_logit_dsl.RDS")
library(broom)
library(dplyr)
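# broom-style tidier for dsl fits: pulls estimates, std. errors, p-values
# (and optionally CIs) out of dsl:::summary.dsl into a tibble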
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
res <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...))
terms <- row.names(res)
cols <- c("estimate" = "Estimate", "std.error" = "Std. Error", "p.value" = "p value")
if (conf.int) {
cols <- c(cols, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
}
out <- as.list(res)[cols]
names(out) <- names(cols)
out <- as_tibble(as.data.frame(out))
out <- dplyr::bind_cols(term = terms, out)
if (exponentiate)
out <- broom:::exponentiate(out)
return(out)
}
coef_df <- tidy.dsl(dev_model)
coef_df <- coef_df |>
mutate(
term = recode(term,
"week_index" = "Weeks from deployment",
"(Intercept)" = "Intercept",
"n_comments_before" = "# of comments prior to resolution",
"median_PC4_adac" = "Median Author PC4 Pre-resolution",
"median_PC3_adac" = "Median Author PC3 Pre-resolution",
"median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
"human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
"human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'",
"human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'",
"as.factor(source)c3" = "HTTP-deprecation (factor)",
"as.factor(source)c2" = "HTTPS-login (factor)",
"as.factor(isAuthorWMF)TRUE" = "WMF-affiliated Author (factor)",
"as.factor(isAuthorWMF)FALSE" = "Nonaffiliated Author (factor)",
"as.factor(isAuthorWMF)FALSE:as.factor(source)c2" = "Nonaffiliated Author:HTTPS-login",
"as.factor(isAuthorWMF)FALSE:as.factor(source)c3" = "Nonaffiliated Author:HTTP-deprecation",
"as.factor(isAuthorWMF)TRUE:as.factor(source)c2" = "WMF-affiliated Author:HTTPS-login",
"as.factor(isAuthorWMF)TRUE:as.factor(source)c3" = "WMF-affiliated Author:HTTP-deprecation",
),
term = factor(term, levels = rev(c(
"Intercept",
"% of sentences discussing 'Existent Problems'",
"% of sentences discussing 'Solutions'",
"% of sentences discussing 'Record Keeping'",
"Median Author PC4 Pre-resolution",
"Median Author PC3 Pre-resolution",
"# of comments prior to resolution",
"Median # of Code Reviewers (Gerrit)",
"Weeks from deployment",
"HTTPS-login (factor)",
"HTTP-deprecation (factor)",
"Nonaffiliated Author (factor)",
"WMF-affiliated Author (factor)",
"Nonaffiliated Author:HTTPS-login",
"WMF-affiliated Author:HTTPS-login",
"Nonaffiliated Author:HTTP-deprecation",
"WMF-affiliated Author:HTTP-deprecation"
)))
)
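# coefficient plot: points are estimates, bars are 95% Wald intervals
# (estimate +/- 1.96 * SE)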
dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
geom_point(size = 1) +
geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96*std.error), width = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Log-odds Coefficient Estimate",
y = "Variable") +
theme_minimal()
dsl_coefs
ggsave(
filename = "121625_dsl_coefs.png",
plot = dsl_coefs,
width = 6, # inches
height = 6, # inches
dpi = 800 # high resolution
)
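# note: estimates are log-odds; exp(estimate) gives an odds ratio, e.g. a
# coefficient of 0.5 implies exp(0.5) ~ 1.65x the odds of resolution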

dsl/121625_logit_dsl.RDS (new binary file, not shown)


Can't render this file because it is too large.


Can't render this file because it is too large.


Can't render this file because it is too large.


@@ -1,7 +1,7 @@
library(tidyverse)
library(dsl)
dsl_csv <-"~/dsl/120725_DSL_frame.csv"
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
@@ -69,7 +69,7 @@ summary(felm_model)
dev_model <- dsl(
model = "logit",
formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
+ median_PC4_adac + median_PC3_adac + n_comments_before
+ median_PC4_adac + median_PC3_adac + median_PC1_adac + n_comments_before
+ median_gerrit_reviewers + median_gerrit_loc_delta
+ week_index + as.factor(isAuthorWMF) * as.factor(source),
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
@@ -80,9 +80,9 @@ dev_model <- dsl(
sample_split = 20,
data=dsl_df
)
#summary(dev_model)
summary(dev_model)
#saveRDS(dev_model, "120725_logit_dsl.RDS")
dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
#dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
library(broom)
library(dplyr)
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
@@ -109,6 +109,7 @@ coef_df <- coef_df |>
"n_comments_before" = "# of comments prior to resolution",
"median_PC4_adac" = "Median Author PC4 Pre-resolution",
"median_PC3_adac" = "Median Author PC3 Pre-resolution",
"median_PC1_adac" = "Median Author PC1 Pre-resolution",
"median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
"median_gerrit_loc_delta" = "Median LoC Changed (Gerrit)",
"human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
@@ -127,6 +128,7 @@ coef_df <- coef_df |>
"% of sentences discussing 'Record Keeping'",
"Median Author PC4 Pre-resolution",
"Median Author PC3 Pre-resolution",
"Median Author PC1 Pre-resolution",
"# of comments prior to resolution",
"Median # of Code Reviewers (Gerrit)",
"Median LoC Changed (Gerrit)",


@@ -1,6 +1,6 @@
library(tidyverse)
unified_csv <-"~/analysis_data/120725_unified.csv"
unified_csv <-"~/analysis_data/121625_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
# 1. aggregate to the task level
@@ -9,7 +9,7 @@ unified_df <- read.csv(unified_csv, header = TRUE)
# 1c.
valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN', 'FUTURE PLANS',
'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
'SOCIAL CONVERSATION')
@@ -204,15 +204,15 @@ task_level_variables <- unified_df |>
group_by(TaskPHID) |>
summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
median_PC3 = median(PC3),
median_PC3_adac = median(PC3[ADAC==1]),
median_PC3_no_adac = median(PC3[ADAC==0]),
median_PC1 = median(PC1),
median_PC1_adac = median(PC1[ADAC==1]),
median_PC1_no_adac = median(PC1[ADAC==0]),
median_PC4 = median(PC4),
median_PC4_adac = median(PC4[ADAC==1]),
median_PC4_no_adac = median(PC4[ADAC==0]),
median_PC3 = median(PC3, na.rm = TRUE),
median_PC3_adac = median(PC3[ADAC==1], na.rm = TRUE),
median_PC3_no_adac = median(PC3[ADAC==0], na.rm = TRUE),
median_PC1 = median(PC1, na.rm = TRUE),
median_PC1_adac = median(PC1[ADAC==1], na.rm = TRUE),
median_PC1_no_adac = median(PC1[ADAC==0], na.rm = TRUE),
median_PC4 = median(PC4, na.rm = TRUE),
median_PC4_adac = median(PC4[ADAC==1], na.rm = TRUE),
median_PC4_no_adac = median(PC4[ADAC==0], na.rm = TRUE),
n_comments = sum(!is.na(id)),
n_comments_before = sum(before_close)
)
@@ -221,7 +221,7 @@ descriptions <- unified_df |>
filter(comment_type == "task_description")|>
select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
source, phase, week_index, author_closer, resolution_outcome, priority,
gerrit_repo, task_status)
gerrit_repo, status)
task_level_variables <- task_level_variables |>
left_join(
@@ -242,7 +242,7 @@ task_level_variables <- task_level_variables |>
)
# 2. assign sampling prob for different tasks
# need to ID those selected in the first round of sampling that were removed for the second round of sampling
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))
# refer to DSL specification sheet
@@ -258,37 +258,10 @@ task_level_variables <- task_level_variables |>
) |>
select(-isFirstSample) |>
mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0)) |>
mutate(TTR = (date_closed - date_created)/3600)
mutate(TTR_hours = (date_closed - date_created)/3600)
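# date_closed and date_created are epoch seconds, so /3600 gives hours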
# 3. check validity of different aggregate variables
mean(task_level_variables$sampling_prob)
table(task_level_variables$resolution_outcome)
# look at bivariate plots
ggplot(task_level_variables, aes(
x = as.factor(source),
y = week_index,
fill = resolution_outcome
)) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of week_index against Resolution Outcome",
x = "Case",
y = "Week Index",
fill = "Resolution Outcome"
)
ggplot(task_level_variables,
aes(
x=as.factor(source),
y=olmo_RK_prop,
fill=as.factor(source)
)) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
scale_fill_viridis_d() +
theme_minimal()
# 4. save
write.csv(task_level_variables, "120725_DSL_frame.csv", row.names = FALSE)
write.csv(task_level_variables, "121625_DSL_frame.csv", row.names = FALSE)


@@ -1,7 +1,7 @@
library(tidyverse)
#library(dsl)
library(dplyr)
dsl_csv <-"~/dsl/120725_DSL_frame.csv"
dsl_csv <-"~/dsl/126725_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>

dsl/rq2_plot.R (new file, 53 lines)

@@ -0,0 +1,53 @@
library(tidyverse)
#library(dsl)
library(dplyr)
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
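# BzImport (the Bugzilla import bot) is its own affiliation level, so drop it
# from the WMF vs. non-WMF comparison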
dsl_df <- dsl_df |>
filter(isAuthorWMF != "BzImport")
dsl_df_long <- dsl_df %>%
pivot_longer(
cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
names_to = "tag",
values_to = "proportion"
) %>%
mutate(tag = gsub("olmo_|_prop_adac", "", tag),
tag = case_when(
tag == "EP" ~ "Existent Problem",
tag == "RK" ~ "Record Keeping",
tag =="TSOL" ~ "Solutions"
))
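# long format: one row per (task, tag) pair so the boxplots can group by tag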
olmo_comparison <- ggplot(
dsl_df_long,
aes(
x = tag,
y = proportion,
fill = isAuthorWMF,
)
) +
facet_grid(source ~ .,
scales = "free_y",
labeller = labeller(source = c("c1" = "VisualEditor",
"c2" = "HTTPS-login",
"c3" = "HTTP-deprecation"))) +
geom_boxplot() +
theme_minimal() +
scale_fill_viridis_d() +
labs(
x = "Tag",
y = "% of sentences tagged",
color = "Is Author WMF?",
fill = "Is Author WMF?"
) +
theme(legend.position = "top")
olmo_comparison
ggsave(
filename = "121625_machine_label_comparison.png",
plot = olmo_comparison,
width = 12, # inches
height = 6, # inches
dpi = 800 # high resolution
)


@@ -1,17 +0,0 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:46483 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: NeI7LSiR2rI9GCHZLNWB
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 31856137


@@ -1,30 +1,65 @@
library(tidyverse)
library(dplyr)
#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
main_csv <- "~/analysis_data/120725_unified.csv"
main_csv <- "~/analysis_data/121625_unified.csv"
main_df <- read.csv(main_csv , header = TRUE)
length(unique(main_df$id))
main_df |>
preprocess_comment <- function(message) {
library(stringr)
comment_text <- message
# 1. replace code with CODE
# Inline code: `...`
comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
# Block code: ```...```
comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
# 2. replace quotes with QUOTE
lines <- unlist(strsplit(comment_text, "\n"))
lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
comment_text <- paste(lines, collapse = "\n")
# 3. replace Gerrit URLs with GERRIT_URL
gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
# replace URL with URL
url_pattern <- "https?://[^\\s]+"
comment_text <- str_replace_all(comment_text, url_pattern, "URL")
# 4. replace @screenname with SCREEN_NAME
cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "SCREEN_NAME")
return(cleaned_message)
}
main_df$cleaned_comment <- sapply(main_df$comment_text, preprocess_comment)
# look at the representative comments at the high and low ends of PC3
top5 <- main_df %>%
arrange(desc(PC3)) %>%
slice(250:260) %>%
pull(cleaned_comment)
bottom5 <- main_df %>%
arrange(PC3) %>%
slice(250:260) %>%
pull(cleaned_comment)
cat("Top 300:310 comment_text by PC2 score:\n")
print(top5)
cat("\nBottom 300:310 comment_text by PC2 score:\n")
print(bottom5)
comments_style <- main_df |>
ggplot(
aes(
x = PC4,
y = PC3,
x = PC1,
y = PC4,
fill = comment_type
)
) +
facet_grid(~source, scales="fixed",
labeller = as_labeller(c(
"c1" = "VisualEditor (c1)",
"c2" = "HTTPS-as-default (c2)",
"c3" = "HTTP-deprecation (c3)"
"c1" = "VisualEditor",
"c2" = "HTTPS-login",
"c3" = "HTTP-deprecation"
))) +
geom_point(shape = 21, alpha=0.3, size=2) +
xlim(-50, 50) +
@@ -36,40 +71,54 @@ main_df |>
theme_minimal() +
theme(legend.position = "top") +
labs(
title = "PCs for Task Comments by comment type and case",
x = "Casual v. Formal Updates (PC3)",
y = "Technical-matter v. Procedural Commentary (PC4)",
x = "Lengthy Discussion v. Brief Updates (PC1)",
y = "Technical Jargon v. Non-technical Observations (PC4)",
)
ggsave(
filename = "121625_comments_style.png",
plot = comments_style,
width = 12, # inches
height = 8, # inches
dpi = 800 # high resolution
)
main_df |>
adac_style <- main_df |>
filter(ADAC == 1) |>
ggplot(
aes(
x = PC4,
y = PC3,
fill = as.factor(ADAC)
x = PC3,
y = PC4,
fill = as.factor(isAuthorWMF)
)
) +
facet_grid(comment_type~source,
facet_grid(~source,
labeller = as_labeller(c(
"c1" = "VisualEditor (c1)",
"c2" = "HTTPS-as-default (c2)",
"c3" = "HTTP-deprecation (c3)",
"c1" = "VisualEditor",
"c2" = "HTTPS-login",
"c3" = "HTTP-deprecation",
"task_description" = "Task Description",
"task_subcomment" = "Follow-up Reply"
))) +
geom_point(shape = 21, alpha=0.3, size=2) +
scale_fill_viridis_d(
name = "Comment Author Affiliation",
labels = c("Nonaffiliated", "WMF-affiliated"))+
xlim(-50, 50) +
ylim(-50, 50) +
scale_fill_viridis_d()+
theme_minimal() +
theme(legend.position = "top") +
labs(
title = "PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)",
x = "Casual v. Formal Updates (PC3)",
y = "Technical-matter v. Procedural Commentary (PC4)",
x = "Expressive, first-person v. Dry, third-person (PC3)",
y = "Technical Jargon v. Non-technical Observations (PC4)",
)
#"PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)"
ggsave(
filename = "121625_adac_affil_style.png",
plot = adac_style,
width = 12, # inches
height = 8, # inches
dpi = 800 # high resolution
)
main_df |>
filter(comment_type=="task_subcomment") |>
@@ -104,221 +153,3 @@ main_df <- main_df |>
comment_wordcount = as.integer(stringr::str_count(tidyr::replace_na(as.character(comment_text), ""), "\\S+"))
)
description_df <- main_df |>
filter(comment_type == "task_description")
replies_df <- main_df |>
filter(comment_type == "task_subcomment") |>
filter(isGerritBot != TRUE)
library(ggplot2)
ggplot(replies_df, aes(x = PC3, y = PC4, fill = isAuthorWMF)) +
facet_grid(ADAC~source, scales="fixed") +
geom_point(shape = 21, alpha=0.15, size=3) +
xlim(-50, 50) +
ylim(-50, 50) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "PCs for Task Comments (Faceted by source (column))",
x = "PC3",
y = "PC4",
)
replies_df |>
ggplot(aes(
x = as.factor(author_closer.y), # x-axis grouping
y = PC1.x,
fill = resolution_outcome
)) +
ylim(-30, 30) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source.x, scales = "fixed") +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of PC1",
x = "Author closed task?",
y = "PC1",
fill = "Resolution Outcome"
)
description_df |>
ggplot(aes(
x = as.factor(author_closer), # x-axis grouping
y = PC4,
fill = resolution_outcome
)) +
facet_grid( ~ source, scales = "fixed") +
ylim(-40, 40) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of PC4",
x = "Comment_type",
y = "PC4",
fill = "isAuthorWMF?"
)
main_df <- main_df |>
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
# Join main_df to neurobiber_description_pca_df
description_joined <- main_df |>
right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
filter(comment_text != "nan") #TODO: look at this more in depth
# Join main_df to neurobiber_subcomment_pca_df
subcomment_joined <- main_df |>
right_join(neurobiber_subcomment_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
filter(comment_text != "nan") #TODO: look at this more in depth
total_joined <- main_df |>
right_join(pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
filter(comment_text != "nan") #TODO: look at this more in depth
preprocess_comment <- function(message) {
library(stringr)
comment_text <- message
# 1. replace code with CODE
# Inline code: `...`
comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
# Block code: ```...```
comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
# 2. replace quotes with QUOTE
lines <- unlist(strsplit(comment_text, "\n"))
lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
comment_text <- paste(lines, collapse = "\n")
# 3. replace Gerrit URLs with GERRIT_URL
gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
# replace URL with URL
url_pattern <- "https?://[^\\s]+"
comment_text <- str_replace_all(comment_text, url_pattern, "URL")
# 4. replace @screenname with SCREEN_NAME
cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "SCREEN_NAME")
return(cleaned_message)
}
# Add comment_type column to each df
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"
#clean the messages
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
total_joined$cleaned_comment <- sapply(total_joined$text, preprocess_comment)
subcomment_joined <- subcomment_joined %>%
mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
paste(neurobiber_description_pca_df$AuthorPHID,
neurobiber_description_pca_df$TaskPHID)))
# look at correlation between PC1, PC2, and different outcome variables
description_anova_results <- neurobiber_description_pca_df %>%
group_by(source) %>%
group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
description_anova_results
discussion_anova_results <- neurobiber_subcomment_pca_df %>%
group_by(source) %>%
group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
discussion_anova_results
# look at the representative comments at the high and low ends of PC4
top5 <- total_joined %>%
arrange(desc(PC4)) %>%
slice(300:310) %>%
pull(cleaned_comment)
bottom5 <- total_joined %>%
arrange(PC4) %>%
slice(300:310) %>%
pull(cleaned_comment)
cat("Top 300:310 comment_text by PC2 score:\n")
print(top5)
cat("\nBottom 300:310 comment_text by PC2 score:\n")
print(bottom5)
library(scales)
library(ggplot2)
affiliationColors <-
setNames( c('#5da2d8', '#c7756a')
,c("False", "True"))
subcomment_joined_no_gerrit <- subcomment_joined |>
filter(isGerritBot != "TRUE") |>
left_join(neurobiber_description_pca_df |> select(TaskPHID, priority), by = "TaskPHID")
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
# geom_point(shape = 21, alpha=0.4, size=2) +
# geom_bin_2d() +
sampled_authors <- subcomment_joined_no_gerrit %>%
distinct(AuthorPHID) %>%
sample_n(100) %>%
pull(AuthorPHID)
# 2. Filter original data to just those authors
sub_sample <- subcomment_joined_no_gerrit %>%
filter(AuthorPHID %in% sampled_authors)
description_sampled_authors <- description_joined %>%
distinct(AuthorPHID) %>%
sample_n(8) %>%
pull(AuthorPHID)
# 2. Filter original data to just those authors
description_sub_sample <- description_joined %>%
filter(AuthorPHID %in% description_sampled_authors)
ggplot(total_joined, aes(x = PC4, y = PC3, fill = comment_type)) +
facet_grid(source~phase, scales="fixed") +
geom_point(shape = 21, alpha=0.3, size=2) +
xlim(-30, 30) +
ylim(-30, 30) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "PCs for Task Comments (Faceted by source and phase)",
x = "PC4",
y = "PC3",
)
priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")
subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>%
mutate(priority = factor(priority, levels = priority_order))
description_joined <- description_joined %>%
mutate(priority = factor(priority.y, levels = priority_order))
ggplot(total_joined, aes(
x = as.factor(comment_type), # x-axis grouping
y = PC3,
fill = isAuthorWMF
)) +
ylim(-30, 30) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of PC3",
x = "Comment_type",
y = "PC3",
fill = "isAuthorWMF?"
)