
updating new analysis with re-labeled data; Gerrit is out and bzimport is its own thing

Matthew Gaughan 2025-12-16 17:55:51 -08:00
parent df1dcf1224
commit 1584e2cd5f
21 changed files with 381134 additions and 322 deletions


@@ -82,19 +82,20 @@ main_df <- main_df |>
   )
 #getting PC values (need todo after revised pass)
-pca_csv <- "~/analysis_data/121625_constituent_dfs/121525_total_pca_df.csv"
+pca_csv <- "~/analysis_data/121625_constituent_dfs/121625_total_pca_df.csv"
 pca_df <- read.csv(pca_csv, header = TRUE)
 length(unique(pca_df$id))
 pca_df <- pca_df |>
   select(starts_with("PC"),
          id)
-#first_join <- main_df|>
-#  left_join(
-#    pca_df,
-#    by = "id"
-#  )
+first_join <- main_df|>
+  left_join(
+    pca_df,
+    by = "id"
+  )
+length(unique(first_join$id))
 olmo_csv <- "~/analysis_data/121625_constituent_dfs/all_120525_olmo_batched_categorized.csv"
 olmo_df <- read.csv(olmo_csv, header = TRUE)
@@ -103,7 +104,7 @@ olmo_df <- olmo_df |>
     olmo_sentence_labels = sentence_categories)|>
   select(id, olmo_cleaned_sentences, olmo_sentence_labels)
-second_join <- main_df|>
+second_join <- first_join |>
   left_join(
     olmo_df,
     by = "id"
@@ -163,4 +164,4 @@ unified_df <- unified_df |>
     gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+")
   )
-write.csv(unified_df, "forPCA_121625_unified.csv", row.names = FALSE)
+write.csv(unified_df, "121625_unified.csv", row.names = FALSE)
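
Not part of the commit: left_join() silently fans out rows whenever the key is not unique on the right-hand side, so a minimal sanity check after the two joins above (assuming "id" is the comment-level key, as the length(unique(...)) calls suggest) could be:

stopifnot(nrow(first_join) == nrow(main_df))   # PCA join added no duplicate rows
stopifnot(nrow(second_join) == nrow(main_df))  # olmo-label join added no duplicate rows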

File diff suppressed because one or more lines are too long

Binary file not shown (new image, 560 KiB).

Binary file not shown (new image, 5.1 MiB).

Binary file not shown (new image, 11 MiB).

Binary file not shown (new image, 484 KiB).

dsl/121625_DSL_frame.csv (new file, 3130 lines)

File diff suppressed because it is too large.

dsl/121625_final_dsl.R (new file, 100 lines)

@@ -0,0 +1,100 @@
library(tidyverse)
library(dsl)
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
dplyr::mutate(ttr_days = TTR_hours / 24) |>
dplyr::mutate(task_resolution = dsl_score)
dev_model <- dsl(
model = "logit",
formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
+ median_PC4_adac + median_PC3_adac + n_comments_before
+ median_gerrit_reviewers + week_index + as.factor(isAuthorWMF) * as.factor(source),
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
sample_prob = "sampling_prob",
cluster="source",
cross_fit = 3,
sample_split = 20,
data=dsl_df
)
summary(dev_model)
#saveRDS(dev_model, "121625_logit_dsl.RDS")
dev_model <- readRDS("dsl/121625_logit_dsl.RDS")
library(broom)
library(dplyr)
# broom-style tidy() method for dsl model objects: lifts the coefficient table
# out of dsl:::summary.dsl() and returns it as a tibble
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
  res <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...))
terms <- row.names(res)
cols <- c("estimate" = "Estimate", "std.error" = "Std. Error", "p.value" = "p value")
if (conf.int) {
cols <- c(cols, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
}
out <- as.list(res)[cols]
names(out) <- names(cols)
out <- as_tibble(as.data.frame(out))
out <- dplyr::bind_cols(term = terms, out)
if (exponentiate)
out <- broom:::exponentiate(out)
return(out)
}
coef_df <- tidy.dsl(dev_model)
coef_df <- coef_df |>
mutate(
term = recode(term,
"week_index" = "Weeks from deployment",
"(Intercept)" = "Intercept",
"n_comments_before" = "# of comments prior to resolution",
"median_PC4_adac" = "Median Author PC4 Pre-resolution",
"median_PC3_adac" = "Median Author PC3 Pre-resolution",
"median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
"human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
"human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'",
"human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'",
"as.factor(source)c3" = "HTTP-deprecation (factor)",
"as.factor(source)c2" = "HTTPS-login (factor)",
"as.factor(isAuthorWMF)TRUE" = "WMF-affiliated Author (factor)",
"as.factor(isAuthorWMF)FALSE" = "Nonaffiliated Author (factor)",
"as.factor(isAuthorWMF)FALSE:as.factor(source)c2" = "Nonaffiliated Author:HTTPS-login",
"as.factor(isAuthorWMF)FALSE:as.factor(source)c3" = "Nonaffiliated Author:HTTP-deprecation",
"as.factor(isAuthorWMF)TRUE:as.factor(source)c2" = "WMF-affiliated Author:HTTPS-login",
"as.factor(isAuthorWMF)TRUE:as.factor(source)c3" = "WMF-affiliated Author:HTTP-deprecation",
),
term = factor(term, levels = rev(c(
"Intercept",
"% of sentences discussing 'Existent Problems'",
"% of sentences discussing 'Solutions'",
"% of sentences discussing 'Record Keeping'",
"Median Author PC4 Pre-resolution",
"Median Author PC3 Pre-resolution",
"# of comments prior to resolution",
"Median # of Code Reviewers (Gerrit)",
"Weeks from deployment",
"HTTPS-login (factor)",
"HTTP-deprecation (factor)",
"Nonaffiliated Author (factor)",
"WMF-affiliated Author (factor)",
"Nonaffiliated Author:HTTPS-login",
"WMF-affiliated Author:HTTPS-login",
"Nonaffiliated Author:HTTP-deprecation",
"WMF-affiliated Author:HTTP-deprecation"
)))
)
dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
geom_point(size = 1) +
geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96*std.error), width = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Log-odds Coefficient Estimate",
y = "Variable") +
theme_minimal()
dsl_coefs
ggsave(
filename = "121625_dsl_coefs.png",
plot = dsl_coefs,
width = 6, # inches
height = 6, # inches
dpi = 800 # high resolution
)
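
Not part of the commit: because the model above is a logit, the plotted estimates are log-odds. A minimal sketch (assuming the tidy.dsl() helper defined in this file) of reading them as odds ratios with approximate 95% Wald intervals:

coef_or <- tidy.dsl(dev_model) |>
  dplyr::mutate(
    odds_ratio = exp(estimate),                     # log-odds -> odds ratio
    or_low     = exp(estimate - 1.96 * std.error),  # Wald lower bound
    or_high    = exp(estimate + 1.96 * std.error)   # Wald upper bound
  )
coef_or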

dsl/121625_logit_dsl.RDS (new binary file)

Binary file not shown.

Can't render this file because it is too large.

Can't render this file because it is too large.

Can't render this file because it is too large.

@@ -1,7 +1,7 @@
 library(tidyverse)
 library(dsl)
-dsl_csv <-"~/dsl/120725_DSL_frame.csv"
+dsl_csv <-"~/dsl/121625_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE)
 dsl_df <- dsl_df |>
@@ -69,7 +69,7 @@ summary(felm_model)
 dev_model <- dsl(
   model = "logit",
   formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
-  + median_PC4_adac + median_PC3_adac + n_comments_before
+  + median_PC4_adac + median_PC3_adac + median_PC1_adac + n_comments_before
   + median_gerrit_reviewers + median_gerrit_loc_delta
   + week_index + as.factor(isAuthorWMF) * as.factor(source),
   predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
@@ -80,9 +80,9 @@ dev_model <- dsl(
   sample_split = 20,
   data=dsl_df
 )
-#summary(dev_model)
+summary(dev_model)
 #saveRDS(dev_model, "120725_logit_dsl.RDS")
-dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
+#dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
 library(broom)
 library(dplyr)
 tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
@@ -109,6 +109,7 @@ coef_df <- coef_df |>
   "n_comments_before" = "# of comments prior to resolution",
   "median_PC4_adac" = "Median Author PC4 Pre-resolution",
   "median_PC3_adac" = "Median Author PC3 Pre-resolution",
+  "median_PC1_adac" = "Median Author PC1 Pre-resolution",
   "median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
   "median_gerrit_loc_delta" = "Median LoC Changed (Gerrit)",
   "human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
@@ -127,6 +128,7 @@ coef_df <- coef_df |>
   "% of sentences discussing 'Record Keeping'",
   "Median Author PC4 Pre-resolution",
   "Median Author PC3 Pre-resolution",
+  "Median Author PC1 Pre-resolution",
   "# of comments prior to resolution",
   "Median # of Code Reviewers (Gerrit)",
   "Median LoC Changed (Gerrit)",

@@ -1,6 +1,6 @@
 library(tidyverse)
-unified_csv <-"~/analysis_data/120725_unified.csv"
+unified_csv <-"~/analysis_data/121625_unified.csv"
 unified_df <- read.csv(unified_csv, header = TRUE)
 # 1. aggregate to the task level
@@ -9,7 +9,7 @@ unified_df <- read.csv(unified_csv, header = TRUE)
 # 1c.
 valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
                       'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
-                      'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
+                      'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN', 'FUTURE PLANS',
                       'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
                       'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
                       'SOCIAL CONVERSATION')
@@ -204,15 +204,15 @@ task_level_variables <- unified_df |>
   group_by(TaskPHID) |>
   summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
             median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
-            median_PC3 = median(PC3),
-            median_PC3_adac = median(PC3[ADAC==1]),
-            median_PC3_no_adac = median(PC3[ADAC==0]),
-            median_PC1 = median(PC1),
-            median_PC1_adac = median(PC1[ADAC==1]),
-            median_PC1_no_adac = median(PC1[ADAC==0]),
-            median_PC4 = median(PC4),
-            median_PC4_adac = median(PC4[ADAC==1]),
-            median_PC4_no_adac = median(PC4[ADAC==0]),
+            median_PC3 = median(PC3, na.rm = TRUE),
+            median_PC3_adac = median(PC3[ADAC==1], na.rm = TRUE),
+            median_PC3_no_adac = median(PC3[ADAC==0], na.rm = TRUE),
+            median_PC1 = median(PC1, na.rm = TRUE),
+            median_PC1_adac = median(PC1[ADAC==1], na.rm = TRUE),
+            median_PC1_no_adac = median(PC1[ADAC==0], na.rm = TRUE),
+            median_PC4 = median(PC4, na.rm = TRUE),
+            median_PC4_adac = median(PC4[ADAC==1], na.rm = TRUE),
+            median_PC4_no_adac = median(PC4[ADAC==0], na.rm = TRUE),
             n_comments = sum(!is.na(id)),
             n_comments_before = sum(before_close)
   )
@@ -221,7 +221,7 @@ descriptions <- unified_df |>
   filter(comment_type == "task_description")|>
   select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
          source, phase, week_index, author_closer, resolution_outcome, priority,
-         gerrit_repo, task_status)
+         gerrit_repo, status)
 task_level_variables <- task_level_variables |>
   left_join(
@@ -242,7 +242,7 @@ task_level_variables <- task_level_variables |>
   )
 # 2. assign sampling prob for different tasks
 # need to ID those selected in the first round of sampling that were removed for the second round of sampling
-large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
+large_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102025_human_labels.csv"
 large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
 first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))
 # refer to DSL specification sheet
@@ -258,37 +258,10 @@ task_level_variables <- task_level_variables |>
   ) |>
   select(-isFirstSample) |>
   mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0)) |>
-  mutate(TTR = (date_closed - date_created)/3600)
+  mutate(TTR_hours = (date_closed - date_created)/3600)
 # 3. check validity of different aggregate variables
 mean(task_level_variables$sampling_prob)
 table(task_level_variables$resolution_outcome)
-# look at bivariate plots
-ggplot(task_level_variables, aes(
-  x = as.factor(source),
-  y = week_index,
-  fill = resolution_outcome
-)) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Boxplot of week_index against Resolution Outcome",
-    x = "Case",
-    y = "Week Index",
-    fill = "Resolution Outcome"
-  )
-ggplot(task_level_variables,
-       aes(
-         x=as.factor(source),
-         y=olmo_RK_prop,
-         fill=as.factor(source)
-       )) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  scale_fill_viridis_d() +
-  theme_minimal()
 # 4. save
-write.csv(task_level_variables, "120725_DSL_frame.csv", row.names = FALSE)
+write.csv(task_level_variables, "121625_DSL_frame.csv", row.names = FALSE)
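
Not part of the commit: the "# 2. assign sampling prob" step above follows a two-stage pattern, flagging tasks drawn in the first sampling round and giving each round its own inclusion probability. A sketch with hypothetical probabilities p_first and p_second (the real values come from the DSL specification sheet):

p_first  <- 0.10  # hypothetical inclusion probability, first sampling round
p_second <- 0.05  # hypothetical inclusion probability, second sampling round
task_level_variables <- task_level_variables |>
  mutate(isFirstSample = TaskPHID %in% first_sample_tasks,
         sampling_prob = ifelse(isFirstSample, p_first, p_second))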

@@ -1,7 +1,7 @@
 library(tidyverse)
 #library(dsl)
 library(dplyr)
-dsl_csv <-"~/dsl/120725_DSL_frame.csv"
+dsl_csv <-"~/dsl/121625_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE)
 dsl_df <- dsl_df |>

dsl/rq2_plot.R (new file, 53 lines)

@@ -0,0 +1,53 @@
library(tidyverse)
#library(dsl)
library(dplyr)
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
filter(isAuthorWMF != "BzImport")
dsl_df_long <- dsl_df %>%
pivot_longer(
cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
names_to = "tag",
values_to = "proportion"
) %>%
mutate(tag = gsub("olmo_|_prop_adac", "", tag),
tag = case_when(
tag == "EP" ~ "Existent Problem",
tag == "RK" ~ "Record Keeping",
tag =="TSOL" ~ "Solutions"
))
olmo_comparison <- ggplot(
dsl_df_long,
aes(
x = tag,
y = proportion,
fill = isAuthorWMF,
)
) +
facet_grid(source ~ .,
scales = "free_y",
labeller = labeller(source = c("c1" = "VisualEditor",
"c2" = "HTTPS-login",
"c3" = "HTTP-deprecation"))) +
geom_boxplot() +
theme_minimal() +
scale_fill_viridis_d() +
labs(
x = "Tag",
y = "% of sentences tagged",
color = "Is Author WMF?",
fill = "Is Author WMF?"
) +
theme(legend.position = "top")
olmo_comparison
ggsave(
filename = "121625_machine_label_comparison.png",
plot = olmo_comparison,
width = 12, # inches
height = 6, # inches
dpi = 800 # high resolution
)
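
Not part of the commit: a companion summary table for the boxplots above, assuming the dsl_df_long frame built in this file:

dsl_df_long |>
  group_by(source, isAuthorWMF, tag) |>
  summarise(median_prop = median(proportion, na.rm = TRUE),  # center of each box
            n = dplyr::n(), .groups = "drop") |>
  arrange(source, tag)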

@@ -1,17 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-ssh -N -L 8787:n3439:46483 mjilg@klone.hyak.uw.edu
-and point your web browser to http://localhost:8787
-2. log in to RStudio Server using the following credentials:
-user: mjilg
-password: NeI7LSiR2rI9GCHZLNWB
-When done using RStudio Server, terminate the job by:
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-scancel -f 31856137

@@ -1,30 +1,65 @@
 library(tidyverse)
 library(dplyr)
-#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
-#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
-#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
-#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
-#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
-#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
-main_csv <- "~/analysis_data/120725_unified.csv"
+main_csv <- "~/analysis_data/121625_unified.csv"
 main_df <- read.csv(main_csv , header = TRUE)
 length(unique(main_df$id))
-main_df |>
+preprocess_comment <- function(message) {
+  library(stringr)
+  comment_text <- message
+  # 1. replace code with CODE
+  # Block code first, so ``` fences are not eaten by the inline pattern: ```...```
+  comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
+  # Inline code: `...`
+  comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
+  # 2. replace quotes with QUOTE
+  lines <- unlist(strsplit(comment_text, "\n"))
+  lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
+  comment_text <- paste(lines, collapse = "\n")
+  # 3. replace Gerrit URLs with GERRIT_URL
+  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
+  comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
+  # replace other URLs with URL
+  url_pattern <- "https?://[^\\s]+"
+  comment_text <- str_replace_all(comment_text, url_pattern, "URL")
+  # 4. replace @screenname with SCREEN_NAME ("\\1" keeps the leading whitespace)
+  cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME")
+  return(cleaned_message)
+}
+main_df$cleaned_comment <- sapply(main_df$comment_text, preprocess_comment)
+# look at the representative comments for PC3
+top5 <- main_df %>%
+  arrange(desc(PC3)) %>%
+  slice(250:260) %>%
+  pull(cleaned_comment)
+bottom5 <- main_df %>%
+  arrange(PC3) %>%
+  slice(250:260) %>%
+  pull(cleaned_comment)
+cat("Top 250:260 comment_text by PC3 score:\n")
+print(top5)
+cat("\nBottom 250:260 comment_text by PC3 score:\n")
+print(bottom5)
+comments_style <- main_df |>
   ggplot(
     aes(
-      x = PC4,
-      y = PC3,
+      x = PC1,
+      y = PC4,
       fill = comment_type
     )
   ) +
   facet_grid(~source, scales="fixed",
              labeller = as_labeller(c(
-               "c1" = "VisualEditor (c1)",
-               "c2" = "HTTPS-as-default (c2)",
-               "c3" = "HTTP-deprecation (c3)"
+               "c1" = "VisualEditor",
+               "c2" = "HTTPS-login",
+               "c3" = "HTTP-deprecation"
              ))) +
   geom_point(shape = 21, alpha=0.3, size=2) +
   xlim(-50, 50) +
@@ -36,40 +71,54 @@ main_df |>
   theme_minimal() +
   theme(legend.position = "top") +
   labs(
-    title = "PCs for Task Comments by comment type and case",
-    x = "Casual v. Formal Updates (PC3)",
-    y = "Technical-matter v. Procedural Commentary (PC4)",
+    x = "Lengthy Discussion v. Brief Updates (PC1)",
+    y = "Technical Jargon v. Non-technical Observations (PC4)",
   )
+ggsave(
+  filename = "121625_comments_style.png",
+  plot = comments_style,
+  width = 12,  # inches
+  height = 8,  # inches
+  dpi = 800    # high resolution
+)
-main_df |>
+adac_style <- main_df |>
   filter(ADAC == 1) |>
   ggplot(
     aes(
-      x = PC4,
-      y = PC3,
-      fill = as.factor(ADAC)
+      x = PC3,
+      y = PC4,
+      fill = as.factor(isAuthorWMF)
     )
   ) +
-  facet_grid(comment_type~source,
+  facet_grid(~source,
              labeller = as_labeller(c(
-               "c1" = "VisualEditor (c1)",
-               "c2" = "HTTPS-as-default (c2)",
-               "c3" = "HTTP-deprecation (c3)",
+               "c1" = "VisualEditor",
+               "c2" = "HTTPS-login",
+               "c3" = "HTTP-deprecation",
                "task_description" = "Task Description",
                "task_subcomment" = "Follow-up Reply"
              ))) +
   geom_point(shape = 21, alpha=0.3, size=2) +
-  scale_fill_viridis_d(
-    name = "Comment Author Affiliation",
-    labels = c("Nonaffiliated", "WMF-affiliated"))+
+  xlim(-50, 50) +
+  ylim(-50, 50) +
+  scale_fill_viridis_d()+
   theme_minimal() +
   theme(legend.position = "top") +
   labs(
-    title = "PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)",
-    x = "Casual v. Formal Updates (PC3)",
-    y = "Technical-matter v. Procedural Commentary (PC4)",
+    x = "Expressive, first-person v. Dry, third-person (PC3)",
+    y = "Technical Jargon v. Non-technical Observations (PC4)",
   )
 #"PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)"
+ggsave(
+  filename = "121625_adac_affil_style.png",
+  plot = adac_style,
+  width = 12,  # inches
+  height = 8,  # inches
+  dpi = 800    # high resolution
+)
 main_df |>
   filter(comment_type=="task_subcomment") |>
@@ -104,221 +153,3 @@ main_df <- main_df |>
     comment_wordcount = as.integer(stringr::str_count(tidyr::replace_na(as.character(comment_text), ""), "\\S+"))
   )
-description_df <- main_df |>
-  filter(comment_type == "task_description")
-replies_df <- main_df |>
-  filter(comment_type == "task_subcomment") |>
-  filter(isGerritBot != TRUE)
-library(ggplot2)
-ggplot(replies_df, aes(x = PC3, y = PC4, fill = isAuthorWMF)) +
-  facet_grid(ADAC~source, scales="fixed") +
-  geom_point(shape = 21, alpha=0.15, size=3) +
-  xlim(-50, 50) +
-  ylim(-50, 50) +
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "PCs for Task Comments (Faceted by source (column))",
-    x = "PC3",
-    y = "PC4",
-  )
-replies_df |>
-  ggplot(aes(
-    x = as.factor(author_closer.y), # x-axis grouping
-    y = PC1.x,
-    fill = reso
-  )) +
-  ylim(-30, 30) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  facet_grid(. ~ source.x, scales = "fixed") +
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Boxplot of PC4",
-    x = "Comment_type",
-    y = "PC4",
-    fill = "isAuthorWMF?"
-  )
-description_df |>
-  ggplot(aes(
-    x = as.factor(author_closer), # x-axis grouping
-    y = PC4,
-    fill = resolution_outcome
-  )) +
-  facet_grid( ~ source, scales = "fixed") +
-  ylim(-40, 40) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Boxplot of PC4",
-    x = "Comment_type",
-    y = "PC4",
-    fill = "isAuthorWMF?"
-  )
-main_df <- main_df |>
-  select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
-# Join main_df to neurobiber_description_pca_df
-description_joined <- main_df |>
-  right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
-  filter(comment_text != "nan") #TODO: look at this more in depth
-# Join main_df to neurobiber_subcomment_pca_df
-subcomment_joined <- main_df |>
-  right_join(neurobiber_subcomment_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
-  filter(comment_text != "nan") #TODO: look at this more in depth
-total_joined <- main_df |>
-  right_join(pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
-  filter(comment_text != "nan") #TODO: look at this more in depth
-preprocess_comment <- function(message) {
-  library(stringr)
-  comment_text <- message
-  # 1. replace code with CODE
-  # Inline code: `...`
-  comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
-  # Block code: ```...```
-  comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
-  # 2. replace quotes with QUOTE
-  lines <- unlist(strsplit(comment_text, "\n"))
-  lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
-  comment_text <- paste(lines, collapse = "\n")
-  # 3. replace Gerrit URLs with GERRIT_URL
-  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
-  comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
-  # replace URL with URL
-  url_pattern <- "https?://[^\\s]+"
-  comment_text <- str_replace_all(comment_text, url_pattern, "URL")
-  # 4. replace @screenname with SCREEN_NAME
-  cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "SCREEN_NAME")
-  return(cleaned_message)
-}
-# Add comment_type column to each df
-neurobiber_description_pca_df$comment_type <- "task_description"
-neurobiber_subcomment_pca_df$comment_type <- "subcomment"
-#clean the messages
-neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
-neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
-total_joined$cleaned_comment <- sapply(total_joined$text, preprocess_comment)
-subcomment_joined <- subcomment_joined %>%
-  mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
-                                  paste(neurobiber_description_pca_df$AuthorPHID,
-                                        neurobiber_description_pca_df$TaskPHID)))
-# look at correlation between PC1, PC2, and different outcome variables
-description_anova_results <- neurobiber_description_pca_df %>%
-  group_by(source) %>%
-  group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
-description_anova_results
-discussion_anova_results <- neurobiber_subcomment_pca_df %>%
-  group_by(source) %>%
-  group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
-discussion_anova_results
-# look at the representative comments for PC1 and PC2
-top5 <- total_joined %>%
-  arrange(desc(PC4)) %>%
-  slice(300:310) %>%
-  pull(cleaned_comment)
-bottom5 <- total_joined %>%
-  arrange(PC4) %>%
-  slice(300:310) %>%
-  pull(cleaned_comment)
-cat("Top 300:310 comment_text by PC2 score:\n")
-print(top5)
-cat("\nBottom 300:310 comment_text by PC2 score:\n")
-print(bottom5)
-library(scales)
-library(ggplot2)
-affiliationColors <-
-  setNames( c('#5da2d8', '#c7756a')
-            ,c("False", "True"))
-subcomment_joined_no_gerrit <- subcomment_joined |>
-  filter(isGerritBot != "TRUE") |>
-  left_join(neurobiber_description_pca_df |> select(TaskPHID, priority), by = "TaskPHID")
-#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
-#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
-# geom_point(shape = 21, alpha=0.4, size=2) +
-# geom_bin_2d() +
-sampled_authors <- subcomment_joined_no_gerrit %>%
-  distinct(AuthorPHID) %>%
-  sample_n(100) %>%
-  pull(AuthorPHID)
-# 2. Filter original data to just those authors
-sub_sample <- subcomment_joined_no_gerrit %>%
-  filter(AuthorPHID %in% sampled_authors)
-description_sampled_authors <- description_joined %>%
-  distinct(AuthorPHID) %>%
-  sample_n(8) %>%
-  pull(AuthorPHID)
-# 2. Filter original data to just those authors
-description_sub_sample <- description_joined %>%
-  filter(AuthorPHID %in% description_sampled_authors)
-ggplot(total_joined, aes(x = PC4, y = PC3, fill = comment_type)) +
-  facet_grid(source~phase, scales="fixed") +
-  geom_point(shape = 21, alpha=0.3, size=2) +
-  xlim(-30, 30) +
-  ylim(-30, 30) +
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "PCs for Task Comments (Faceted by source and phase)",
-    x = "PC4",
-    y = "PC3",
-  )
-priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")
-subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>%
-  mutate(priority = factor(priority, levels = priority_order))
-description_joined <- description_joined %>%
-  mutate(priority = factor(priority.y, levels = priority_order))
-ggplot(total_joined, aes(
-  x = as.factor(comment_type), # x-axis grouping
-  y = PC3,
-  fill = isAuthorWMF
-)) +
-  ylim(-30, 30) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Boxplot of PC4",
-    x = "Comment_type",
-    y = "PC4",
-    fill = "isAuthorWMF?"
-  )
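
Not part of the commit: a quick usage example of the preprocess_comment() helper added in the new version of this file, showing each placeholder substitution:

preprocess_comment("See `config` at https://gerrit.wikimedia.org/r/12345\n> old reply\nthanks @mjilg")
# expected: "See CODE at GERRIT_URL\nQUOTE\nthanks SCREEN_NAME"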