updating new analysis with re-labeled data, gerrit is out and bzimport is its own thing
This commit is contained in:
parent
df1dcf1224
commit
1584e2cd5f
|
Can't render this file because it is too large.
|
@ -82,19 +82,20 @@ main_df <- main_df |>
|
||||
)
|
||||
|
||||
#getting PC values (need todo after revised pass)
|
||||
pca_csv <- "~/analysis_data/121625_constituent_dfs/121525_total_pca_df.csv"
|
||||
pca_csv <- "~/analysis_data/121625_constituent_dfs/121625_total_pca_df.csv"
|
||||
pca_df <- read.csv(pca_csv, header = TRUE)
|
||||
length(unique(pca_df$id))
|
||||
pca_df <- pca_df |>
|
||||
select(starts_with("PC"),
|
||||
id)
|
||||
|
||||
#first_join <- main_df|>
|
||||
# left_join(
|
||||
# pca_df,
|
||||
# by = "id"
|
||||
# )
|
||||
first_join <- main_df|>
|
||||
left_join(
|
||||
pca_df,
|
||||
by = "id"
|
||||
)
|
||||
|
||||
length(unique(first_join$id))
|
||||
olmo_csv <- "~/analysis_data/121625_constituent_dfs/all_120525_olmo_batched_categorized.csv"
|
||||
olmo_df <- read.csv(olmo_csv, header = TRUE)
|
||||
|
||||
@ -103,7 +104,7 @@ olmo_df <- olmo_df |>
|
||||
olmo_sentence_labels = sentence_categories)|>
|
||||
select(id, olmo_cleaned_sentences, olmo_sentence_labels)
|
||||
|
||||
second_join <- main_df|>
|
||||
second_join <- first_join |>
|
||||
left_join(
|
||||
olmo_df,
|
||||
by = "id"
|
||||
@ -163,4 +164,4 @@ unified_df <- unified_df |>
|
||||
gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+")
|
||||
)
|
||||
|
||||
write.csv(unified_df, "forPCA_121625_unified.csv", row.names = FALSE)
|
||||
write.csv(unified_df, "121625_unified.csv", row.names = FALSE)
|
||||
377739
analysis_data/121625_unified.csv
Normal file
377739
analysis_data/121625_unified.csv
Normal file
File diff suppressed because one or more lines are too long
|
Can't render this file because it is too large.
|
BIN
doc_plots/121625_dsl_coefs.png
Normal file
BIN
doc_plots/121625_dsl_coefs.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 560 KiB |
BIN
doc_plots/rq2_plots/121625_adac_affil_style.png
Normal file
BIN
doc_plots/rq2_plots/121625_adac_affil_style.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 5.1 MiB |
BIN
doc_plots/rq2_plots/121625_comments_style.png
Normal file
BIN
doc_plots/rq2_plots/121625_comments_style.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 11 MiB |
BIN
doc_plots/rq2_plots/121625_machine_label_comparison.png
Normal file
BIN
doc_plots/rq2_plots/121625_machine_label_comparison.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 484 KiB |
3130
dsl/121625_DSL_frame.csv
Normal file
3130
dsl/121625_DSL_frame.csv
Normal file
File diff suppressed because it is too large
Load Diff
100
dsl/121625_final_dsl.R
Normal file
100
dsl/121625_final_dsl.R
Normal file
@ -0,0 +1,100 @@
|
||||
library(tidyverse)
|
||||
library(dsl)
|
||||
|
||||
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
|
||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||
|
||||
dsl_df <- dsl_df |>
|
||||
dplyr::mutate(ttr_days = TTR_hours / 24) |>
|
||||
dplyr::mutate(task_resolution = dsl_score)
|
||||
|
||||
dev_model <- dsl(
|
||||
model = "logit",
|
||||
formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
|
||||
+ median_PC4_adac + median_PC3_adac + n_comments_before
|
||||
+ median_gerrit_reviewers + week_index + as.factor(isAuthorWMF) * as.factor(source),
|
||||
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
|
||||
prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
|
||||
sample_prob = "sampling_prob",
|
||||
cluster="source",
|
||||
cross_fit = 3,
|
||||
sample_split = 20,
|
||||
data=dsl_df
|
||||
)
|
||||
summary(dev_model)
|
||||
#saveRDS(dev_model, "121625_logit_dsl.RDS")
|
||||
dev_model <- readRDS("dsl/121625_logit_dsl.RDS")
|
||||
library(broom)
|
||||
library(dplyr)
|
||||
# broom-style tidier for a fitted `dsl` model.
#
# Builds a tibble with one row per model term from the table returned by
# dsl's summary method.
#
# @param x A fitted object accepted by `dsl:::summary.dsl()`.
# @param conf.int If TRUE, also return `conf.low` / `conf.high`.
# @param conf.level Level forwarded to the summary method as `ci`.
# @param exponentiate If TRUE, pass the result through broom's internal
#   `exponentiate()` helper before returning.
# @param ... Forwarded to `dsl:::summary.dsl()`.
# @return A tibble with `term`, `estimate`, `std.error`, `p.value` and,
#   when requested, `conf.low` and `conf.high`.
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
  model_summary <- suppressMessages(
    dsl:::summary.dsl(object = x, ci = conf.level, ...)
  )

  # tidy column name = column name in the summary table
  column_map <- c("estimate" = "Estimate", "std.error" = "Std. Error", "p.value" = "p value")
  if (conf.int) {
    column_map <- c(column_map, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
  }

  selected <- as.list(model_summary)[column_map]
  names(selected) <- names(column_map)

  tidied <- dplyr::bind_cols(
    term = row.names(model_summary),
    as_tibble(as.data.frame(selected))
  )

  if (exponentiate) {
    tidied <- broom:::exponentiate(tidied)
  }
  tidied
}
|
||||
coef_df <- tidy.dsl(dev_model)
|
||||
coef_df <- coef_df |>
|
||||
mutate(
|
||||
term = recode(term,
|
||||
"week_index" = "Weeks from deployment",
|
||||
"(Intercept)" = "Intercept",
|
||||
"n_comments_before" = "# of comments prior to resolution",
|
||||
"median_PC4_adac" = "Median Author PC4 Pre-resolution",
|
||||
"median_PC3_adac" = "Median Author PC3 Pre-resolution",
|
||||
"median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
|
||||
"human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
|
||||
"human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'",
|
||||
"human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'",
|
||||
"as.factor(source)c3" = "HTTP-deprecation (factor)",
|
||||
"as.factor(source)c2" = "HTTPS-login (factor)",
|
||||
"as.factor(isAuthorWMF)TRUE" = "WMF-affiliated Author (factor)",
|
||||
"as.factor(isAuthorWMF)FALSE" = "Nonaffiliated Author (factor)",
|
||||
"as.factor(isAuthorWMF)FALSE:as.factor(source)c2" = "Nonaffiliated Author:HTTPS-login",
|
||||
"as.factor(isAuthorWMF)FALSE:as.factor(source)c3" = "Nonaffiliated Author:HTTP-deprecation",
|
||||
"as.factor(isAuthorWMF)TRUE:as.factor(source)c2" = "WMF-affiliated Author:HTTPS-login",
|
||||
"as.factor(isAuthorWMF)TRUE:as.factor(source)c3" = "WMF-affiliated Author:HTTP-deprecation",
|
||||
),
|
||||
term = factor(term, levels = rev(c(
|
||||
"Intercept",
|
||||
"% of sentences discussing 'Existent Problems'",
|
||||
"% of sentences discussing 'Solutions'",
|
||||
"% of sentences discussing 'Record Keeping'",
|
||||
"Median Author PC4 Pre-resolution",
|
||||
"Median Author PC3 Pre-resolution",
|
||||
"# of comments prior to resolution",
|
||||
"Median # of Code Reviewers (Gerrit)",
|
||||
"Weeks from deployment",
|
||||
"HTTPS-login (factor)",
|
||||
"HTTP-deprecation (factor)",
|
||||
"Nonaffiliated Author (factor)",
|
||||
"WMF-affiliated Author (factor)",
|
||||
"Nonaffiliated Author:HTTPS-login",
|
||||
"WMF-affiliated Author:HTTPS-login",
|
||||
"Nonaffiliated Author:HTTP-deprecation",
|
||||
"WMF-affiliated Author:HTTP-deprecation"
|
||||
)))
|
||||
)
|
||||
# Coefficient plot: point estimate per term with a normal-approximation 95%
# interval (estimate +/- 1.96 * std.error) and a dashed reference line at 0.
dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
  geom_point(size = 1) +
  # geom_errorbar() sizes its end caps with `width`, also when the bar is
  # drawn horizontally via xmin/xmax; `height` is a geom_errorbarh() argument
  # and was being ignored here.
  geom_errorbar(
    aes(
      xmin = estimate - 1.96 * std.error,
      xmax = estimate + 1.96 * std.error
    ),
    width = 0.2
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Log-odds Coefficient Estimate",
    y = "Variable"
  ) +
  theme_minimal()
|
||||
dsl_coefs
|
||||
ggsave(
|
||||
filename = "121625_dsl_coefs.png",
|
||||
plot = dsl_coefs,
|
||||
width = 6, # inches
|
||||
height = 6, # inches
|
||||
dpi = 800 # high resolution
|
||||
)
|
||||
BIN
dsl/121625_logit_dsl.RDS
Normal file
BIN
dsl/121625_logit_dsl.RDS
Normal file
Binary file not shown.
|
Can't render this file because it is too large.
|
|
Can't render this file because it is too large.
|
|
Can't render this file because it is too large.
|
10
dsl/dsl.R
10
dsl/dsl.R
@ -1,7 +1,7 @@
|
||||
library(tidyverse)
|
||||
library(dsl)
|
||||
|
||||
dsl_csv <-"~/dsl/120725_DSL_frame.csv"
|
||||
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
|
||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||
|
||||
dsl_df <- dsl_df |>
|
||||
@ -69,7 +69,7 @@ summary(felm_model)
|
||||
dev_model <- dsl(
|
||||
model = "logit",
|
||||
formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
|
||||
+ median_PC4_adac + median_PC3_adac + n_comments_before
|
||||
+ median_PC4_adac + median_PC3_adac + median_PC1_adac + n_comments_before
|
||||
+ median_gerrit_reviewers + median_gerrit_loc_delta
|
||||
+ week_index + as.factor(isAuthorWMF) * as.factor(source),
|
||||
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
|
||||
@ -80,9 +80,9 @@ dev_model <- dsl(
|
||||
sample_split = 20,
|
||||
data=dsl_df
|
||||
)
|
||||
#summary(dev_model)
|
||||
summary(dev_model)
|
||||
#saveRDS(dev_model, "120725_logit_dsl.RDS")
|
||||
dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
|
||||
#dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
|
||||
library(broom)
|
||||
library(dplyr)
|
||||
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
|
||||
@ -109,6 +109,7 @@ coef_df <- coef_df |>
|
||||
"n_comments_before" = "# of comments prior to resolution",
|
||||
"median_PC4_adac" = "Median Author PC4 Pre-resolution",
|
||||
"median_PC3_adac" = "Median Author PC3 Pre-resolution",
|
||||
"median_PC1_adac" = "Median Author PC1 Pre-resolution",
|
||||
"median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
|
||||
"median_gerrit_loc_delta" = "Median LoC Changed (Gerrit)",
|
||||
"human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
|
||||
@ -127,6 +128,7 @@ coef_df <- coef_df |>
|
||||
"% of sentences discussing 'Record Keeping'",
|
||||
"Median Author PC4 Pre-resolution",
|
||||
"Median Author PC3 Pre-resolution",
|
||||
"Median Author PC1 Pre-resolution",
|
||||
"# of comments prior to resolution",
|
||||
"Median # of Code Reviewers (Gerrit)",
|
||||
"Median LoC Changed (Gerrit)",
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
library(tidyverse)
|
||||
|
||||
unified_csv <-"~/analysis_data/120725_unified.csv"
|
||||
unified_csv <-"~/analysis_data/121625_unified.csv"
|
||||
unified_df <- read.csv(unified_csv, header = TRUE)
|
||||
|
||||
# 1. aggregate to the task level
|
||||
@ -9,7 +9,7 @@ unified_df <- read.csv(unified_csv, header = TRUE)
|
||||
# 1c.
|
||||
valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
|
||||
'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
|
||||
'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
|
||||
'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN', 'FUTURE PLANS',
|
||||
'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
|
||||
'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
|
||||
'SOCIAL CONVERSATION')
|
||||
@ -204,15 +204,15 @@ task_level_variables <- unified_df |>
|
||||
group_by(TaskPHID) |>
|
||||
summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
|
||||
median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
|
||||
median_PC3 = median(PC3),
|
||||
median_PC3_adac = median(PC3[ADAC==1]),
|
||||
median_PC3_no_adac = median(PC3[ADAC==0]),
|
||||
median_PC1 = median(PC1),
|
||||
median_PC1_adac = median(PC1[ADAC==1]),
|
||||
median_PC1_no_adac = median(PC1[ADAC==0]),
|
||||
median_PC4 = median(PC4),
|
||||
median_PC4_adac = median(PC4[ADAC==1]),
|
||||
median_PC4_no_adac = median(PC4[ADAC==0]),
|
||||
median_PC3 = median(PC3, na.rm = TRUE),
|
||||
median_PC3_adac = median(PC3[ADAC==1], na.rm = TRUE),
|
||||
median_PC3_no_adac = median(PC3[ADAC==0], na.rm = TRUE),
|
||||
median_PC1 = median(PC1, na.rm = TRUE),
|
||||
median_PC1_adac = median(PC1[ADAC==1], na.rm = TRUE),
|
||||
median_PC1_no_adac = median(PC1[ADAC==0], na.rm = TRUE),
|
||||
median_PC4 = median(PC4, na.rm = TRUE),
|
||||
median_PC4_adac = median(PC4[ADAC==1], na.rm = TRUE),
|
||||
median_PC4_no_adac = median(PC4[ADAC==0], na.rm = TRUE),
|
||||
n_comments = sum(!is.na(id)),
|
||||
n_comments_before = sum(before_close)
|
||||
)
|
||||
@ -221,7 +221,7 @@ descriptions <- unified_df |>
|
||||
filter(comment_type == "task_description")|>
|
||||
select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
|
||||
source, phase, week_index, author_closer, resolution_outcome, priority,
|
||||
gerrit_repo, task_status)
|
||||
gerrit_repo, status)
|
||||
|
||||
task_level_variables <- task_level_variables |>
|
||||
left_join(
|
||||
@ -242,7 +242,7 @@ task_level_variables <- task_level_variables |>
|
||||
)
|
||||
# 2. assign sampling prob for different tasks
|
||||
# need to ID those selected in the first round of sampling that were removed for the second round of sampling
|
||||
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
|
||||
large_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102025_human_labels.csv"
|
||||
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
|
||||
first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))
|
||||
# refer to DSL specification sheet
|
||||
@ -258,37 +258,10 @@ task_level_variables <- task_level_variables |>
|
||||
) |>
|
||||
select(-isFirstSample) |>
|
||||
mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0)) |>
|
||||
mutate(TTR = (date_closed - date_created)/3600)
|
||||
mutate(TTR_hours = (date_closed - date_created)/3600)
|
||||
# 3. check validity of different aggregate variables
|
||||
mean(task_level_variables$sampling_prob)
|
||||
table(task_level_variables$resolution_outcome)
|
||||
# look at bivariate plots
|
||||
ggplot(task_level_variables, aes(
|
||||
x = as.factor(source),
|
||||
y = week_index,
|
||||
fill = resolution_outcome
|
||||
)) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "Boxplot of week_index against Resolution Outcome",
|
||||
x = "Case",
|
||||
y = "Week Index",
|
||||
fill = "Resolution Outcome"
|
||||
)
|
||||
|
||||
|
||||
ggplot(task_level_variables,
|
||||
aes(
|
||||
x=as.factor(source),
|
||||
y=olmo_RK_prop,
|
||||
fill=as.factor(source)
|
||||
)) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal()
|
||||
|
||||
# 4. save
|
||||
write.csv(task_level_variables, "120725_DSL_frame.csv", row.names = FALSE)
|
||||
write.csv(task_level_variables, "121625_DSL_frame.csv", row.names = FALSE)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
library(tidyverse)
|
||||
#library(dsl)
|
||||
library(dplyr)
|
||||
dsl_csv <-"~/dsl/120725_DSL_frame.csv"
|
||||
# Path to the task-level DSL frame; the date stamp is 121625 (was mistyped
# as 126725, which does not match any exported frame).
dsl_csv <- "~/dsl/121625_DSL_frame.csv"
|
||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||
|
||||
dsl_df <- dsl_df |>
|
||||
|
||||
53
dsl/rq2_plot.R
Normal file
53
dsl/rq2_plot.R
Normal file
@ -0,0 +1,53 @@
|
||||
library(tidyverse)
|
||||
#library(dsl)
|
||||
library(dplyr)
|
||||
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
|
||||
dsl_df <- read.csv(dsl_csv, header = TRUE)
|
||||
|
||||
dsl_df <- dsl_df |>
|
||||
filter(isAuthorWMF != "BzImport")
|
||||
|
||||
# Stack the three olmo_*_prop_adac columns into long form (one row per input
# row and tag), then swap the short tag code for its display label.
dsl_df_long <- dsl_df |>
  pivot_longer(
    cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
    names_to = "tag",
    values_to = "proportion"
  ) |>
  mutate(
    # "olmo_EP_prop_adac" -> "EP"
    tag = gsub("olmo_|_prop_adac", "", tag),
    # named lookup; any code not listed becomes NA
    tag = unname(c(
      "EP" = "Existent Problem",
      "RK" = "Record Keeping",
      "TSOL" = "Solutions"
    )[tag])
  )
|
||||
|
||||
olmo_comparison <- ggplot(
|
||||
dsl_df_long,
|
||||
aes(
|
||||
x = tag,
|
||||
y = proportion,
|
||||
fill = isAuthorWMF,
|
||||
)
|
||||
) +
|
||||
facet_grid(source ~ .,
|
||||
scales = "free_y",
|
||||
labeller = labeller(source = c("c1" = "VisualEditor",
|
||||
"c2" = "HTTPS-login",
|
||||
"c3" = "HTTP-deprecation"))) +
|
||||
geom_boxplot() +
|
||||
theme_minimal() +
|
||||
scale_fill_viridis_d() +
|
||||
labs(
|
||||
x = "Tag",
|
||||
y = "% of sentences tagged",
|
||||
color = "Is Author WMF?",
|
||||
fill = "Is Author WMF?"
|
||||
) +
|
||||
theme(legend.position = "top")
|
||||
olmo_comparison
|
||||
ggsave(
|
||||
filename = "121625_machine_label_comparison.png",
|
||||
plot = olmo_comparison,
|
||||
width = 12, # inches
|
||||
height = 6, # inches
|
||||
dpi = 800 # high resolution
|
||||
)
|
||||
@ -1,17 +0,0 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3439:46483 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: NeI7LSiR2rI9GCHZLNWB
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 31856137
|
||||
@ -1,30 +1,65 @@
|
||||
library(tidyverse)
|
||||
library(dplyr)
|
||||
#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
|
||||
#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
|
||||
#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
|
||||
#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
|
||||
#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
|
||||
#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||
|
||||
main_csv <- "~/analysis_data/120725_unified.csv"
|
||||
main_csv <- "~/analysis_data/121625_unified.csv"
|
||||
main_df <- read.csv(main_csv , header = TRUE)
|
||||
length(unique(main_df$id))
|
||||
main_df |>
|
||||
|
||||
# Mask noisy content in one comment so the remaining prose is easier to read:
# code, quoted lines, URLs and @-mentions are replaced by placeholder tokens.
#
# @param message A length-one character string holding the raw comment text.
# @return The masked comment as a length-one character string, or
#   NA_character_ when `message` is NA.
preprocess_comment <- function(message) {
  # Without this guard an NA comment would come back as the literal "NA".
  if (length(message) == 1L && is.na(message)) {
    return(NA_character_)
  }
  comment_text <- message

  # 1. replace code with CODE
  # Fenced blocks (```...```) must be handled before inline spans: the inline
  # pattern otherwise matches inside the fence and leaves "``CODE``" behind.
  comment_text <- stringr::str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
  # Inline code: `...`
  comment_text <- stringr::str_replace_all(comment_text, "`[^`]+`", "CODE")

  # 2. replace quotes with QUOTE (any line whose first non-blank char is ">")
  lines <- unlist(strsplit(comment_text, "\n"))
  lines <- ifelse(stringr::str_detect(stringr::str_trim(lines), "^>"), "QUOTE", lines)
  comment_text <- paste(lines, collapse = "\n")

  # 3. replace Gerrit change URLs with GERRIT_URL, then any other URL with URL
  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
  comment_text <- stringr::str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
  url_pattern <- "https?://[^\\s]+"
  comment_text <- stringr::str_replace_all(comment_text, url_pattern, "URL")

  # 4. replace @screenname with SCREEN_NAME
  # "\\1" puts back the whitespace (or start-of-string) captured before "@",
  # so the token does not get glued to the preceding word.
  stringr::str_replace_all(comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME")
}
|
||||
main_df$cleaned_comment <- sapply(main_df$comment_text, preprocess_comment)
|
||||
|
||||
# Look at representative comments for PC3: the comments ranked 250-260 from
# the high end and from the low end of the PC3 scores.
# NOTE: despite their names, `top5` and `bottom5` each hold the 11 comments
# selected by slice(250:260); the names are kept so later references resolve.
top5 <- main_df |>
  arrange(desc(PC3)) |>
  slice(250:260) |>
  pull(cleaned_comment)

bottom5 <- main_df |>
  arrange(PC3) |>
  slice(250:260) |>
  pull(cleaned_comment)

# Labels now name the component and rank window actually used above
# (previously they said "300:310" and "PC2").
cat("Top 250:260 cleaned_comment by PC3 score:\n")
print(top5)

cat("\nBottom 250:260 cleaned_comment by PC3 score:\n")
print(bottom5)
|
||||
|
||||
|
||||
comments_style <- main_df |>
|
||||
ggplot(
|
||||
aes(
|
||||
x = PC4,
|
||||
y = PC3,
|
||||
x = PC1,
|
||||
y = PC4,
|
||||
fill = comment_type
|
||||
)
|
||||
) +
|
||||
facet_grid(~source, scales="fixed",
|
||||
labeller = as_labeller(c(
|
||||
"c1" = "VisualEditor (c1)",
|
||||
"c2" = "HTTPS-as-default (c2)",
|
||||
"c3" = "HTTP-deprecation (c3)"
|
||||
"c1" = "VisualEditor",
|
||||
"c2" = "HTTPS-login",
|
||||
"c3" = "HTTP-deprecation"
|
||||
))) +
|
||||
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||
xlim(-50, 50) +
|
||||
@ -36,40 +71,54 @@ main_df |>
|
||||
theme_minimal() +
|
||||
theme(legend.position = "top") +
|
||||
labs(
|
||||
title = "PCs for Task Comments by comment type and case",
|
||||
x = "Casual v. Formal Updates (PC3)",
|
||||
y = "Technical-matter v. Procedural Commentary (PC4)",
|
||||
x = "Lengthy Discussion v. Brief Updates (PC1)",
|
||||
y = "Technical Jargon v. Non-technical Observations (PC4)",
|
||||
)
|
||||
ggsave(
|
||||
filename = "121625_comments_style.png",
|
||||
plot = comments_style,
|
||||
width = 12, # inches
|
||||
height = 8, # inches
|
||||
dpi = 800 # high resolution
|
||||
)
|
||||
|
||||
main_df |>
|
||||
adac_style <- main_df |>
|
||||
filter(ADAC == 1) |>
|
||||
ggplot(
|
||||
aes(
|
||||
x = PC4,
|
||||
y = PC3,
|
||||
fill = as.factor(ADAC)
|
||||
x = PC3,
|
||||
y = PC4,
|
||||
fill = as.factor(isAuthorWMF)
|
||||
)
|
||||
) +
|
||||
facet_grid(comment_type~source,
|
||||
facet_grid(~source,
|
||||
labeller = as_labeller(c(
|
||||
"c1" = "VisualEditor (c1)",
|
||||
"c2" = "HTTPS-as-default (c2)",
|
||||
"c3" = "HTTP-deprecation (c3)",
|
||||
"c1" = "VisualEditor",
|
||||
"c2" = "HTTPS-login",
|
||||
"c3" = "HTTP-deprecation",
|
||||
"task_description" = "Task Description",
|
||||
"task_subcomment" = "Follow-up Reply"
|
||||
))) +
|
||||
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||
scale_fill_viridis_d(
|
||||
name = "Comment Author Affiliation",
|
||||
labels = c("Nonaffiliated", "WMF-affiliated"))+
|
||||
xlim(-50, 50) +
|
||||
ylim(-50, 50) +
|
||||
scale_fill_viridis_d()+
|
||||
theme_minimal() +
|
||||
theme(legend.position = "top") +
|
||||
labs(
|
||||
title = "PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)",
|
||||
x = "Casual v. Formal Updates (PC3)",
|
||||
y = "Technical-matter v. Procedural Commentary (PC4)",
|
||||
x = "Expressive, first-person v. Dry, third-person (PC3)",
|
||||
y = "Technical Jargon v. Non-technical Observations (PC4)",
|
||||
)
|
||||
#"PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)"
|
||||
ggsave(
|
||||
filename = "121625_adac_affil_style.png",
|
||||
plot = adac_style,
|
||||
width = 12, # inches
|
||||
height = 8, # inches
|
||||
dpi = 800 # high resolution
|
||||
)
|
||||
|
||||
|
||||
|
||||
main_df |>
|
||||
filter(comment_type=="task_subcomment") |>
|
||||
@ -104,221 +153,3 @@ main_df <- main_df |>
|
||||
comment_wordcount = as.integer(stringr::str_count(tidyr::replace_na(as.character(comment_text), ""), "\\S+"))
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
description_df <- main_df |>
|
||||
filter(comment_type == "task_description")
|
||||
|
||||
replies_df <- main_df |>
|
||||
filter(comment_type == "task_subcomment") |>
|
||||
filter(isGerritBot != TRUE)
|
||||
|
||||
library(ggplot2)
|
||||
ggplot(replies_df, aes(x = PC3, y = PC4, fill = isAuthorWMF)) +
|
||||
facet_grid(ADAC~source, scales="fixed") +
|
||||
geom_point(shape = 21, alpha=0.15, size=3) +
|
||||
xlim(-50, 50) +
|
||||
ylim(-50, 50) +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "PCs for Task Comments (Faceted by source (column))",
|
||||
x = "PC3",
|
||||
y = "PC4",
|
||||
)
|
||||
|
||||
|
||||
replies_df |>
|
||||
ggplot(aes(
|
||||
x = as.factor(author_closer.y), # x-axis grouping
|
||||
y = PC1.x,
|
||||
fill = reso
|
||||
)) +
|
||||
ylim(-30, 30) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
facet_grid(. ~ source.x, scales = "fixed") +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "Boxplot of PC4",
|
||||
x = "Comment_type",
|
||||
y = "PC4",
|
||||
fill = "isAuthorWMF?"
|
||||
)
|
||||
|
||||
description_df |>
|
||||
ggplot(aes(
|
||||
x = as.factor(author_closer), # x-axis grouping
|
||||
y = PC4,
|
||||
fill = resolution_outcome
|
||||
)) +
|
||||
facet_grid( ~ source, scales = "fixed") +
|
||||
ylim(-40, 40) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "Boxplot of PC4",
|
||||
x = "Comment_type",
|
||||
y = "PC4",
|
||||
fill = "isAuthorWMF?"
|
||||
)
|
||||
|
||||
|
||||
main_df <- main_df |>
|
||||
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
|
||||
# Join main_df to neurobiber_description_pca_df
|
||||
description_joined <- main_df |>
|
||||
right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
|
||||
filter(comment_text != "nan") #TODO: look at this more in depth
|
||||
|
||||
# Join main_df to neurobiber_subcomment_pca_df
|
||||
subcomment_joined <- main_df |>
|
||||
right_join(neurobiber_subcomment_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
|
||||
filter(comment_text != "nan") #TODO: look at this more in depth
|
||||
|
||||
total_joined <- main_df |>
|
||||
right_join(pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
|
||||
filter(comment_text != "nan") #TODO: look at this more in depth
|
||||
|
||||
|
||||
preprocess_comment <- function(message) {
|
||||
library(stringr)
|
||||
comment_text <- message
|
||||
# 1. replace code with CODE
|
||||
# Inline code: `...`
|
||||
comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
|
||||
# Block code: ```...```
|
||||
comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
|
||||
# 2. replace quotes with QUOTE
|
||||
lines <- unlist(strsplit(comment_text, "\n"))
|
||||
lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
|
||||
comment_text <- paste(lines, collapse = "\n")
|
||||
# 3. replace Gerrit URLs with GERRIT_URL
|
||||
gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
|
||||
comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
|
||||
# replace URL with URL
|
||||
url_pattern <- "https?://[^\\s]+"
|
||||
comment_text <- str_replace_all(comment_text, url_pattern, "URL")
|
||||
# 4. replace @screenname with SCREEN_NAME
|
||||
cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "SCREEN_NAME")
|
||||
return(cleaned_message)
|
||||
}
|
||||
|
||||
# Add comment_type column to each df
|
||||
neurobiber_description_pca_df$comment_type <- "task_description"
|
||||
neurobiber_subcomment_pca_df$comment_type <- "subcomment"
|
||||
|
||||
#clean the messages
|
||||
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
|
||||
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
|
||||
total_joined$cleaned_comment <- sapply(total_joined$text, preprocess_comment)
|
||||
|
||||
subcomment_joined <- subcomment_joined %>%
|
||||
mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
|
||||
paste(neurobiber_description_pca_df$AuthorPHID,
|
||||
neurobiber_description_pca_df$TaskPHID)))
|
||||
|
||||
# look at correlation between PC1, PC2, and different outcome variables
|
||||
description_anova_results <- neurobiber_description_pca_df %>%
|
||||
group_by(source) %>%
|
||||
group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
|
||||
description_anova_results
|
||||
|
||||
discussion_anova_results <- neurobiber_subcomment_pca_df %>%
|
||||
group_by(source) %>%
|
||||
group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
|
||||
discussion_anova_results
|
||||
|
||||
# look at the representative comments for PC1 and PC2
|
||||
top5 <- total_joined %>%
|
||||
arrange(desc(PC4)) %>%
|
||||
slice(300:310) %>%
|
||||
pull(cleaned_comment)
|
||||
|
||||
bottom5 <- total_joined %>%
|
||||
arrange(PC4) %>%
|
||||
slice(300:310) %>%
|
||||
pull(cleaned_comment)
|
||||
|
||||
cat("Top 300:310 comment_text by PC2 score:\n")
|
||||
print(top5)
|
||||
|
||||
cat("\nBottom 300:310 comment_text by PC2 score:\n")
|
||||
print(bottom5)
|
||||
|
||||
|
||||
library(scales)
|
||||
library(ggplot2)
|
||||
|
||||
|
||||
affiliationColors <-
|
||||
setNames( c('#5da2d8', '#c7756a')
|
||||
,c("False", "True"))
|
||||
|
||||
subcomment_joined_no_gerrit <- subcomment_joined |>
|
||||
filter(isGerritBot != "TRUE") |>
|
||||
left_join(neurobiber_description_pca_df |> select(TaskPHID, priority), by = "TaskPHID")
|
||||
|
||||
|
||||
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
|
||||
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
|
||||
# geom_point(shape = 21, alpha=0.4, size=2) +
|
||||
# geom_bin_2d() +
|
||||
|
||||
sampled_authors <- subcomment_joined_no_gerrit %>%
|
||||
distinct(AuthorPHID) %>%
|
||||
sample_n(100) %>%
|
||||
pull(AuthorPHID)
|
||||
|
||||
# 2. Filter original data to just those authors
|
||||
sub_sample <- subcomment_joined_no_gerrit %>%
|
||||
filter(AuthorPHID %in% sampled_authors)
|
||||
|
||||
description_sampled_authors <- description_joined %>%
|
||||
distinct(AuthorPHID) %>%
|
||||
sample_n(8) %>%
|
||||
pull(AuthorPHID)
|
||||
|
||||
# 2. Filter original data to just those authors
|
||||
description_sub_sample <- description_joined %>%
|
||||
filter(AuthorPHID %in% description_sampled_authors)
|
||||
|
||||
ggplot(total_joined, aes(x = PC4, y = PC3, fill = comment_type)) +
|
||||
facet_grid(source~phase, scales="fixed") +
|
||||
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||
xlim(-30, 30) +
|
||||
ylim(-30, 30) +
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "PCs for Task Comments (Faceted by source and phase)",
|
||||
x = "PC4",
|
||||
y = "PC3",
|
||||
)
|
||||
|
||||
priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")
|
||||
|
||||
subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>%
|
||||
mutate(priority = factor(priority, levels = priority_order))
|
||||
|
||||
description_joined <- description_joined %>%
|
||||
mutate(priority = factor(priority.y, levels = priority_order))
|
||||
|
||||
ggplot(total_joined, aes(
|
||||
x = as.factor(comment_type), # x-axis grouping
|
||||
y = PC3,
|
||||
fill = isAuthorWMF
|
||||
)) +
|
||||
ylim(-30, 30) +
|
||||
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
|
||||
scale_fill_viridis_d() +
|
||||
theme_minimal() +
|
||||
labs(
|
||||
title = "Boxplot of PC4",
|
||||
x = "Comment_type",
|
||||
y = "PC4",
|
||||
fill = "isAuthorWMF?"
|
||||
)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user