
updating new analysis with re-labeled data; Gerrit is out and bzimport is its own thing

Matthew Gaughan 2025-12-16 17:55:51 -08:00
parent df1dcf1224
commit 1584e2cd5f
21 changed files with 381134 additions and 322 deletions


@@ -82,19 +82,20 @@ main_df <- main_df |>
   )
 #getting PC values (need todo after revised pass)
-pca_csv <- "~/analysis_data/121625_constituent_dfs/121525_total_pca_df.csv"
+pca_csv <- "~/analysis_data/121625_constituent_dfs/121625_total_pca_df.csv"
 pca_df <- read.csv(pca_csv, header = TRUE)
 length(unique(pca_df$id))
 pca_df <- pca_df |>
   select(starts_with("PC"),
          id)
-#first_join <- main_df|>
-#  left_join(
-#    pca_df,
-#    by = "id"
-#  )
+first_join <- main_df|>
+  left_join(
+    pca_df,
+    by = "id"
+  )
+length(unique(first_join$id))
 olmo_csv <- "~/analysis_data/121625_constituent_dfs/all_120525_olmo_batched_categorized.csv"
 olmo_df <- read.csv(olmo_csv, header = TRUE)
@@ -103,7 +104,7 @@ olmo_df <- olmo_df |>
     olmo_sentence_labels = sentence_categories)|>
   select(id, olmo_cleaned_sentences, olmo_sentence_labels)
-second_join <- main_df|>
+second_join <- first_join |>
   left_join(
     olmo_df,
     by = "id"
@@ -163,4 +164,4 @@ unified_df <- unified_df |>
     gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+")
   )
-write.csv(unified_df, "forPCA_121625_unified.csv", row.names = FALSE)
+write.csv(unified_df, "121625_unified.csv", row.names = FALSE)
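
Not part of the commit: left_join() silently fans out rows whenever the key is not unique on the right-hand side, so a minimal sanity check after the two joins above (assuming "id" is the comment-level key, as the length(unique(...)) calls suggest) could be:

stopifnot(nrow(first_join) == nrow(main_df))   # PCA join added no duplicate rows
stopifnot(nrow(second_join) == nrow(main_df))  # olmo-label join added no duplicate rows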

File diff suppressed because one or more lines are too long

Binary file not shown (new image, 560 KiB).

Binary file not shown (new image, 5.1 MiB).

Binary file not shown (new image, 11 MiB).

Binary file not shown (new image, 484 KiB).

dsl/121625_DSL_frame.csv (new file, 3130 lines)

File diff suppressed because it is too large.

dsl/121625_final_dsl.R (new file, 100 lines)

@@ -0,0 +1,100 @@
library(tidyverse)
library(dsl)
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
dplyr::mutate(ttr_days = TTR_hours / 24) |>
dplyr::mutate(task_resolution = dsl_score)
dev_model <- dsl(
model = "logit",
formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
+ median_PC4_adac + median_PC3_adac + n_comments_before
+ median_gerrit_reviewers + week_index + as.factor(isAuthorWMF) * as.factor(source),
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
sample_prob = "sampling_prob",
cluster="source",
cross_fit = 3,
sample_split = 20,
data=dsl_df
)
summary(dev_model)
#saveRDS(dev_model, "121625_logit_dsl.RDS")
dev_model <- readRDS("dsl/121625_logit_dsl.RDS")
library(broom)
library(dplyr)
# broom-style tidy() method for dsl model objects: lifts the coefficient table
# out of dsl:::summary.dsl() and returns it as a tibble
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
  res <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...))
terms <- row.names(res)
cols <- c("estimate" = "Estimate", "std.error" = "Std. Error", "p.value" = "p value")
if (conf.int) {
cols <- c(cols, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
}
out <- as.list(res)[cols]
names(out) <- names(cols)
out <- as_tibble(as.data.frame(out))
out <- dplyr::bind_cols(term = terms, out)
if (exponentiate)
out <- broom:::exponentiate(out)
return(out)
}
coef_df <- tidy.dsl(dev_model)
coef_df <- coef_df |>
mutate(
term = recode(term,
"week_index" = "Weeks from deployment",
"(Intercept)" = "Intercept",
"n_comments_before" = "# of comments prior to resolution",
"median_PC4_adac" = "Median Author PC4 Pre-resolution",
"median_PC3_adac" = "Median Author PC3 Pre-resolution",
"median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
"human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
"human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'",
"human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'",
"as.factor(source)c3" = "HTTP-deprecation (factor)",
"as.factor(source)c2" = "HTTPS-login (factor)",
"as.factor(isAuthorWMF)TRUE" = "WMF-affiliated Author (factor)",
"as.factor(isAuthorWMF)FALSE" = "Nonaffiliated Author (factor)",
"as.factor(isAuthorWMF)FALSE:as.factor(source)c2" = "Nonaffiliated Author:HTTPS-login",
"as.factor(isAuthorWMF)FALSE:as.factor(source)c3" = "Nonaffiliated Author:HTTP-deprecation",
"as.factor(isAuthorWMF)TRUE:as.factor(source)c2" = "WMF-affiliated Author:HTTPS-login",
"as.factor(isAuthorWMF)TRUE:as.factor(source)c3" = "WMF-affiliated Author:HTTP-deprecation",
),
term = factor(term, levels = rev(c(
"Intercept",
"% of sentences discussing 'Existent Problems'",
"% of sentences discussing 'Solutions'",
"% of sentences discussing 'Record Keeping'",
"Median Author PC4 Pre-resolution",
"Median Author PC3 Pre-resolution",
"# of comments prior to resolution",
"Median # of Code Reviewers (Gerrit)",
"Weeks from deployment",
"HTTPS-login (factor)",
"HTTP-deprecation (factor)",
"Nonaffiliated Author (factor)",
"WMF-affiliated Author (factor)",
"Nonaffiliated Author:HTTPS-login",
"WMF-affiliated Author:HTTPS-login",
"Nonaffiliated Author:HTTP-deprecation",
"WMF-affiliated Author:HTTP-deprecation"
)))
)
dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
geom_point(size = 1) +
geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96*std.error), width = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Log-odds Coefficient Estimate",
y = "Variable") +
theme_minimal()
dsl_coefs
ggsave(
filename = "121625_dsl_coefs.png",
plot = dsl_coefs,
width = 6, # inches
height = 6, # inches
dpi = 800 # high resolution
)
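
Not part of the commit: because the model above is a logit, the plotted estimates are log-odds. A minimal sketch (assuming the tidy.dsl() helper defined in this file) of reading them as odds ratios with approximate 95% Wald intervals:

coef_or <- tidy.dsl(dev_model) |>
  dplyr::mutate(
    odds_ratio = exp(estimate),                     # log-odds -> odds ratio
    or_low     = exp(estimate - 1.96 * std.error),  # Wald lower bound
    or_high    = exp(estimate + 1.96 * std.error)   # Wald upper bound
  )
coef_or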

dsl/121625_logit_dsl.RDS (new binary file)

Binary file not shown.

Can't render this file because it is too large.

Can't render this file because it is too large.

Can't render this file because it is too large.

@@ -1,7 +1,7 @@
 library(tidyverse)
 library(dsl)
-dsl_csv <-"~/dsl/120725_DSL_frame.csv"
+dsl_csv <-"~/dsl/121625_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE)
 dsl_df <- dsl_df |>
@@ -69,7 +69,7 @@ summary(felm_model)
 dev_model <- dsl(
   model = "logit",
   formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
-  + median_PC4_adac + median_PC3_adac + n_comments_before
+  + median_PC4_adac + median_PC3_adac + median_PC1_adac + n_comments_before
   + median_gerrit_reviewers + median_gerrit_loc_delta
   + week_index + as.factor(isAuthorWMF) * as.factor(source),
   predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
@@ -80,9 +80,9 @@ dev_model <- dsl(
   sample_split = 20,
   data=dsl_df
 )
-#summary(dev_model)
+summary(dev_model)
 #saveRDS(dev_model, "120725_logit_dsl.RDS")
-dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
+#dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
 library(broom)
 library(dplyr)
 tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
@@ -109,6 +109,7 @@ coef_df <- coef_df |>
   "n_comments_before" = "# of comments prior to resolution",
   "median_PC4_adac" = "Median Author PC4 Pre-resolution",
   "median_PC3_adac" = "Median Author PC3 Pre-resolution",
+  "median_PC1_adac" = "Median Author PC1 Pre-resolution",
   "median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
   "median_gerrit_loc_delta" = "Median LoC Changed (Gerrit)",
   "human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
@@ -127,6 +128,7 @@ coef_df <- coef_df |>
   "% of sentences discussing 'Record Keeping'",
   "Median Author PC4 Pre-resolution",
   "Median Author PC3 Pre-resolution",
+  "Median Author PC1 Pre-resolution",
   "# of comments prior to resolution",
   "Median # of Code Reviewers (Gerrit)",
   "Median LoC Changed (Gerrit)",

@@ -1,6 +1,6 @@
 library(tidyverse)
-unified_csv <-"~/analysis_data/120725_unified.csv"
+unified_csv <-"~/analysis_data/121625_unified.csv"
 unified_df <- read.csv(unified_csv, header = TRUE)
 # 1. aggregate to the task level
@@ -9,7 +9,7 @@ unified_df <- read.csv(unified_csv, header = TRUE)
 # 1c.
 valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
                       'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
-                      'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
+                      'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN', 'FUTURE PLANS',
                       'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
                       'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
                       'SOCIAL CONVERSATION')
@@ -204,15 +204,15 @@ task_level_variables <- unified_df |>
   group_by(TaskPHID) |>
   summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
             median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
-            median_PC3 = median(PC3),
-            median_PC3_adac = median(PC3[ADAC==1]),
-            median_PC3_no_adac = median(PC3[ADAC==0]),
-            median_PC1 = median(PC1),
-            median_PC1_adac = median(PC1[ADAC==1]),
-            median_PC1_no_adac = median(PC1[ADAC==0]),
-            median_PC4 = median(PC4),
-            median_PC4_adac = median(PC4[ADAC==1]),
-            median_PC4_no_adac = median(PC4[ADAC==0]),
+            median_PC3 = median(PC3, na.rm = TRUE),
+            median_PC3_adac = median(PC3[ADAC==1], na.rm = TRUE),
+            median_PC3_no_adac = median(PC3[ADAC==0], na.rm = TRUE),
+            median_PC1 = median(PC1, na.rm = TRUE),
+            median_PC1_adac = median(PC1[ADAC==1], na.rm = TRUE),
+            median_PC1_no_adac = median(PC1[ADAC==0], na.rm = TRUE),
+            median_PC4 = median(PC4, na.rm = TRUE),
+            median_PC4_adac = median(PC4[ADAC==1], na.rm = TRUE),
+            median_PC4_no_adac = median(PC4[ADAC==0], na.rm = TRUE),
             n_comments = sum(!is.na(id)),
             n_comments_before = sum(before_close)
   )
@@ -221,7 +221,7 @@ descriptions <- unified_df |>
   filter(comment_type == "task_description")|>
   select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
          source, phase, week_index, author_closer, resolution_outcome, priority,
-         gerrit_repo, task_status)
+         gerrit_repo, status)
 task_level_variables <- task_level_variables |>
   left_join(
@@ -242,7 +242,7 @@ task_level_variables <- task_level_variables |>
   )
 # 2. assign sampling prob for different tasks
 # need to ID those selected in the first round of sampling that were removed for the second round of sampling
-large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
+large_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102025_human_labels.csv"
 large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
 first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))
 # refer to DSL specification sheet
@@ -258,37 +258,10 @@ task_level_variables <- task_level_variables |>
   ) |>
   select(-isFirstSample) |>
   mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0)) |>
-  mutate(TTR = (date_closed - date_created)/3600)
+  mutate(TTR_hours = (date_closed - date_created)/3600)
 # 3. check validity of different aggregate variables
 mean(task_level_variables$sampling_prob)
 table(task_level_variables$resolution_outcome)
-# look at bivariate plots
-ggplot(task_level_variables, aes(
-  x = as.factor(source),
-  y = week_index,
-  fill = resolution_outcome
-)) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Boxplot of week_index against Resolution Outcome",
-    x = "Case",
-    y = "Week Index",
-    fill = "Resolution Outcome"
-  )
-ggplot(task_level_variables,
-       aes(
-         x=as.factor(source),
-         y=olmo_RK_prop,
-         fill=as.factor(source)
-       )) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  scale_fill_viridis_d() +
-  theme_minimal()
 # 4. save
-write.csv(task_level_variables, "120725_DSL_frame.csv", row.names = FALSE)
+write.csv(task_level_variables, "121625_DSL_frame.csv", row.names = FALSE)
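
Not part of the commit: the "# 2. assign sampling prob" step above follows a two-stage pattern, flagging tasks drawn in the first sampling round and giving each round its own inclusion probability. A sketch with hypothetical probabilities p_first and p_second (the real values come from the DSL specification sheet):

p_first  <- 0.10  # hypothetical inclusion probability, first sampling round
p_second <- 0.05  # hypothetical inclusion probability, second sampling round
task_level_variables <- task_level_variables |>
  mutate(isFirstSample = TaskPHID %in% first_sample_tasks,
         sampling_prob = ifelse(isFirstSample, p_first, p_second))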

@@ -1,7 +1,7 @@
 library(tidyverse)
 #library(dsl)
 library(dplyr)
-dsl_csv <-"~/dsl/120725_DSL_frame.csv"
+dsl_csv <-"~/dsl/121625_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE)
 dsl_df <- dsl_df |>

dsl/rq2_plot.R (new file, 53 lines)

@@ -0,0 +1,53 @@
library(tidyverse)
#library(dsl)
library(dplyr)
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
filter(isAuthorWMF != "BzImport")
dsl_df_long <- dsl_df %>%
pivot_longer(
cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
names_to = "tag",
values_to = "proportion"
) %>%
mutate(tag = gsub("olmo_|_prop_adac", "", tag),
tag = case_when(
tag == "EP" ~ "Existent Problem",
tag == "RK" ~ "Record Keeping",
tag =="TSOL" ~ "Solutions"
))
olmo_comparison <- ggplot(
dsl_df_long,
aes(
x = tag,
y = proportion,
fill = isAuthorWMF,
)
) +
facet_grid(source ~ .,
scales = "free_y",
labeller = labeller(source = c("c1" = "VisualEditor",
"c2" = "HTTPS-login",
"c3" = "HTTP-deprecation"))) +
geom_boxplot() +
theme_minimal() +
scale_fill_viridis_d() +
labs(
x = "Tag",
y = "% of sentences tagged",
color = "Is Author WMF?",
fill = "Is Author WMF?"
) +
theme(legend.position = "top")
olmo_comparison
ggsave(
filename = "121625_machine_label_comparison.png",
plot = olmo_comparison,
width = 12, # inches
height = 6, # inches
dpi = 800 # high resolution
)
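
Not part of the commit: a companion summary table for the boxplots above, assuming the dsl_df_long frame built in this file:

dsl_df_long |>
  group_by(source, isAuthorWMF, tag) |>
  summarise(median_prop = median(proportion, na.rm = TRUE),  # center of each box
            n = dplyr::n(), .groups = "drop") |>
  arrange(source, tag)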

@@ -1,17 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-ssh -N -L 8787:n3439:46483 mjilg@klone.hyak.uw.edu
-and point your web browser to http://localhost:8787
-2. log in to RStudio Server using the following credentials:
-user: mjilg
-password: NeI7LSiR2rI9GCHZLNWB
-When done using RStudio Server, terminate the job by:
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-scancel -f 31856137

@@ -1,30 +1,65 @@
 library(tidyverse)
 library(dplyr)
-#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
-#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
-#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
-#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
-#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
-#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
-main_csv <- "~/analysis_data/120725_unified.csv"
+main_csv <- "~/analysis_data/121625_unified.csv"
 main_df <- read.csv(main_csv , header = TRUE)
 length(unique(main_df$id))
-main_df |>
+preprocess_comment <- function(message) {
+  library(stringr)
+  comment_text <- message
+  # 1. replace code with CODE
+  # Block code first, so ``` fences are not eaten by the inline pattern: ```...```
+  comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
+  # Inline code: `...`
+  comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
+  # 2. replace quotes with QUOTE
+  lines <- unlist(strsplit(comment_text, "\n"))
+  lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
+  comment_text <- paste(lines, collapse = "\n")
+  # 3. replace Gerrit URLs with GERRIT_URL
+  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
+  comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
+  # replace other URLs with URL
+  url_pattern <- "https?://[^\\s]+"
+  comment_text <- str_replace_all(comment_text, url_pattern, "URL")
+  # 4. replace @screenname with SCREEN_NAME ("\\1" keeps the leading whitespace)
+  cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "\\1SCREEN_NAME")
+  return(cleaned_message)
+}
+main_df$cleaned_comment <- sapply(main_df$comment_text, preprocess_comment)
+# look at the representative comments for PC3
+top5 <- main_df %>%
+  arrange(desc(PC3)) %>%
+  slice(250:260) %>%
+  pull(cleaned_comment)
+bottom5 <- main_df %>%
+  arrange(PC3) %>%
+  slice(250:260) %>%
+  pull(cleaned_comment)
+cat("Top 250:260 comment_text by PC3 score:\n")
+print(top5)
+cat("\nBottom 250:260 comment_text by PC3 score:\n")
+print(bottom5)
+comments_style <- main_df |>
   ggplot(
     aes(
-      x = PC4,
-      y = PC3,
+      x = PC1,
+      y = PC4,
       fill = comment_type
     )
   ) +
   facet_grid(~source, scales="fixed",
              labeller = as_labeller(c(
-               "c1" = "VisualEditor (c1)",
-               "c2" = "HTTPS-as-default (c2)",
-               "c3" = "HTTP-deprecation (c3)"
+               "c1" = "VisualEditor",
+               "c2" = "HTTPS-login",
+               "c3" = "HTTP-deprecation"
              ))) +
   geom_point(shape = 21, alpha=0.3, size=2) +
   xlim(-50, 50) +
@@ -36,40 +71,54 @@ main_df |>
   theme_minimal() +
   theme(legend.position = "top") +
   labs(
-    title = "PCs for Task Comments by comment type and case",
-    x = "Casual v. Formal Updates (PC3)",
-    y = "Technical-matter v. Procedural Commentary (PC4)",
+    x = "Lengthy Discussion v. Brief Updates (PC1)",
+    y = "Technical Jargon v. Non-technical Observations (PC4)",
   )
+ggsave(
+  filename = "121625_comments_style.png",
+  plot = comments_style,
+  width = 12,  # inches
+  height = 8,  # inches
+  dpi = 800    # high resolution
+)
-main_df |>
+adac_style <- main_df |>
   filter(ADAC == 1) |>
   ggplot(
     aes(
-      x = PC4,
-      y = PC3,
-      fill = as.factor(ADAC)
+      x = PC3,
+      y = PC4,
+      fill = as.factor(isAuthorWMF)
     )
   ) +
-  facet_grid(comment_type~source,
+  facet_grid(~source,
              labeller = as_labeller(c(
-               "c1" = "VisualEditor (c1)",
-               "c2" = "HTTPS-as-default (c2)",
-               "c3" = "HTTP-deprecation (c3)",
+               "c1" = "VisualEditor",
+               "c2" = "HTTPS-login",
+               "c3" = "HTTP-deprecation",
                "task_description" = "Task Description",
                "task_subcomment" = "Follow-up Reply"
              ))) +
   geom_point(shape = 21, alpha=0.3, size=2) +
-  scale_fill_viridis_d(
-    name = "Comment Author Affiliation",
-    labels = c("Nonaffiliated", "WMF-affiliated"))+
+  xlim(-50, 50) +
+  ylim(-50, 50) +
+  scale_fill_viridis_d()+
   theme_minimal() +
   theme(legend.position = "top") +
   labs(
-    title = "PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)",
-    x = "Casual v. Formal Updates (PC3)",
-    y = "Technical-matter v. Procedural Commentary (PC4)",
+    x = "Expressive, first-person v. Dry, third-person (PC3)",
+    y = "Technical Jargon v. Non-technical Observations (PC4)",
   )
 #"PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)"
+ggsave(
+  filename = "121625_adac_affil_style.png",
+  plot = adac_style,
+  width = 12,  # inches
+  height = 8,  # inches
+  dpi = 800    # high resolution
+)
 main_df |>
   filter(comment_type=="task_subcomment") |>
@@ -104,221 +153,3 @@ main_df <- main_df |>
     comment_wordcount = as.integer(stringr::str_count(tidyr::replace_na(as.character(comment_text), ""), "\\S+"))
   )
-description_df <- main_df |>
-  filter(comment_type == "task_description")
-replies_df <- main_df |>
-  filter(comment_type == "task_subcomment") |>
-  filter(isGerritBot != TRUE)
-library(ggplot2)
-ggplot(replies_df, aes(x = PC3, y = PC4, fill = isAuthorWMF)) +
-  facet_grid(ADAC~source, scales="fixed") +
-  geom_point(shape = 21, alpha=0.15, size=3) +
-  xlim(-50, 50) +
-  ylim(-50, 50) +
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "PCs for Task Comments (Faceted by source (column))",
-    x = "PC3",
-    y = "PC4",
-  )
-replies_df |>
-  ggplot(aes(
-    x = as.factor(author_closer.y), # x-axis grouping
-    y = PC1.x,
-    fill = reso
-  )) +
-  ylim(-30, 30) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  facet_grid(. ~ source.x, scales = "fixed") +
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Boxplot of PC4",
-    x = "Comment_type",
-    y = "PC4",
-    fill = "isAuthorWMF?"
-  )
-description_df |>
-  ggplot(aes(
-    x = as.factor(author_closer), # x-axis grouping
-    y = PC4,
-    fill = resolution_outcome
-  )) +
-  facet_grid( ~ source, scales = "fixed") +
-  ylim(-40, 40) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Boxplot of PC4",
-    x = "Comment_type",
-    y = "PC4",
-    fill = "isAuthorWMF?"
-  )
-main_df <- main_df |>
-  select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
-# Join main_df to neurobiber_description_pca_df
-description_joined <- main_df |>
-  right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
-  filter(comment_text != "nan") #TODO: look at this more in depth
-# Join main_df to neurobiber_subcomment_pca_df
-subcomment_joined <- main_df |>
-  right_join(neurobiber_subcomment_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
-  filter(comment_text != "nan") #TODO: look at this more in depth
-total_joined <- main_df |>
-  right_join(pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
-  filter(comment_text != "nan") #TODO: look at this more in depth
-preprocess_comment <- function(message) {
-  library(stringr)
-  comment_text <- message
-  # 1. replace code with CODE
-  # Inline code: `...`
-  comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
-  # Block code: ```...```
-  comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
-  # 2. replace quotes with QUOTE
-  lines <- unlist(strsplit(comment_text, "\n"))
-  lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
-  comment_text <- paste(lines, collapse = "\n")
-  # 3. replace Gerrit URLs with GERRIT_URL
-  gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
-  comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
-  # replace URL with URL
-  url_pattern <- "https?://[^\\s]+"
-  comment_text <- str_replace_all(comment_text, url_pattern, "URL")
-  # 4. replace @screenname with SCREEN_NAME
-  cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "SCREEN_NAME")
-  return(cleaned_message)
-}
-# Add comment_type column to each df
-neurobiber_description_pca_df$comment_type <- "task_description"
-neurobiber_subcomment_pca_df$comment_type <- "subcomment"
-#clean the messages
-neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
-neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
-total_joined$cleaned_comment <- sapply(total_joined$text, preprocess_comment)
-subcomment_joined <- subcomment_joined %>%
-  mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
-                                  paste(neurobiber_description_pca_df$AuthorPHID,
-                                        neurobiber_description_pca_df$TaskPHID)))
-# look at correlation between PC1, PC2, and different outcome variables
-description_anova_results <- neurobiber_description_pca_df %>%
-  group_by(source) %>%
-  group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
-description_anova_results
-discussion_anova_results <- neurobiber_subcomment_pca_df %>%
-  group_by(source) %>%
-  group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
-discussion_anova_results
-# look at the representative comments for PC1 and PC2
-top5 <- total_joined %>%
-  arrange(desc(PC4)) %>%
-  slice(300:310) %>%
-  pull(cleaned_comment)
-bottom5 <- total_joined %>%
-  arrange(PC4) %>%
-  slice(300:310) %>%
-  pull(cleaned_comment)
-cat("Top 300:310 comment_text by PC2 score:\n")
-print(top5)
-cat("\nBottom 300:310 comment_text by PC2 score:\n")
-print(bottom5)
-library(scales)
-library(ggplot2)
-affiliationColors <-
-  setNames( c('#5da2d8', '#c7756a')
-            ,c("False", "True"))
-subcomment_joined_no_gerrit <- subcomment_joined |>
-  filter(isGerritBot != "TRUE") |>
-  left_join(neurobiber_description_pca_df |> select(TaskPHID, priority), by = "TaskPHID")
-#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
-#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
-# geom_point(shape = 21, alpha=0.4, size=2) +
-# geom_bin_2d() +
-sampled_authors <- subcomment_joined_no_gerrit %>%
-  distinct(AuthorPHID) %>%
-  sample_n(100) %>%
-  pull(AuthorPHID)
-# 2. Filter original data to just those authors
-sub_sample <- subcomment_joined_no_gerrit %>%
-  filter(AuthorPHID %in% sampled_authors)
-description_sampled_authors <- description_joined %>%
-  distinct(AuthorPHID) %>%
-  sample_n(8) %>%
-  pull(AuthorPHID)
-# 2. Filter original data to just those authors
-description_sub_sample <- description_joined %>%
-  filter(AuthorPHID %in% description_sampled_authors)
-ggplot(total_joined, aes(x = PC4, y = PC3, fill = comment_type)) +
-  facet_grid(source~phase, scales="fixed") +
-  geom_point(shape = 21, alpha=0.3, size=2) +
-  xlim(-30, 30) +
-  ylim(-30, 30) +
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "PCs for Task Comments (Faceted by source and phase)",
-    x = "PC4",
-    y = "PC3",
-  )
-priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")
-subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>%
-  mutate(priority = factor(priority, levels = priority_order))
-description_joined <- description_joined %>%
-  mutate(priority = factor(priority.y, levels = priority_order))
-ggplot(total_joined, aes(
-  x = as.factor(comment_type), # x-axis grouping
-  y = PC3,
-  fill = isAuthorWMF
-)) +
-  ylim(-30, 30) +
-  geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
-  facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
-  scale_fill_viridis_d() +
-  theme_minimal() +
-  labs(
-    title = "Boxplot of PC4",
-    x = "Comment_type",
-    y = "PC4",
-    fill = "isAuthorWMF?"
-  )
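
Not part of the commit: a quick usage example of the preprocess_comment() helper added in the new version of this file, showing each placeholder substitution:

preprocess_comment("See `config` at https://gerrit.wikimedia.org/r/12345\n> old reply\nthanks @mjilg")
# expected: "See CODE at GERRIT_URL\nQUOTE\nthanks SCREEN_NAME"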