
updating new analysis with re-labeled data; Gerrit is out and BzImport is its own thing
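A minimal sketch of the relabeling this implies, assuming the isAuthorWMF column now carries a third "BzImport" level (the filter in dsl/rq2_plot.R below suggests it does); the isBzImport flag here is hypothetical:

# hypothetical recode: keep the Bugzilla import bot out of the TRUE/FALSE affiliation split
df$isAuthorWMF <- ifelse(df$isBzImport, "BzImport", as.character(df$isAuthorWMF))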

Matthew Gaughan 2025-12-16 17:55:51 -08:00
parent df1dcf1224
commit 1584e2cd5f
21 changed files with 381134 additions and 322 deletions


@@ -82,19 +82,20 @@ main_df <- main_df |>
)
#getting PC values (need to do after revised pass)
pca_csv <- "~/analysis_data/121625_constituent_dfs/121525_total_pca_df.csv"
pca_csv <- "~/analysis_data/121625_constituent_dfs/121625_total_pca_df.csv"
pca_df <- read.csv(pca_csv, header = TRUE)
length(unique(pca_df$id))
pca_df <- pca_df |>
select(starts_with("PC"),
id)
#first_join <- main_df|>
# left_join(
# pca_df,
# by = "id"
# )
first_join <- main_df|>
left_join(
pca_df,
by = "id"
)
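# sanity check: the left join on id should be one-to-one and not duplicate comments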
length(unique(first_join$id))
olmo_csv <- "~/analysis_data/121625_constituent_dfs/all_120525_olmo_batched_categorized.csv"
olmo_df <- read.csv(olmo_csv, header = TRUE)
@@ -103,7 +104,7 @@ olmo_df <- olmo_df |>
olmo_sentence_labels = sentence_categories)|>
select(id, olmo_cleaned_sentences, olmo_sentence_labels)
second_join <- main_df|>
second_join <- first_join |>
left_join(
olmo_df,
by = "id"
@@ -163,4 +164,4 @@ unified_df <- unified_df |>
gerrit_repo = str_extract(selected_gerrit_results, "(?<='project': ')[^']+")
)
write.csv(unified_df, "forPCA_121625_unified.csv", row.names = FALSE)
write.csv(unified_df, "121625_unified.csv", row.names = FALSE)

File diff suppressed because one or more lines are too long

Binary file not shown (new image, 560 KiB).

Binary file not shown (new image, 5.1 MiB).

Binary file not shown (new image, 11 MiB).

Binary file not shown (new image, 484 KiB).

dsl/121625_DSL_frame.csv (new file, 3130 lines)

File diff suppressed because it is too large.

dsl/121625_final_dsl.R (new file, 100 lines)

@@ -0,0 +1,100 @@
library(tidyverse)
library(dsl)
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
dplyr::mutate(ttr_days = TTR_hours / 24) |>
dplyr::mutate(task_resolution = dsl_score)
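# DSL setup below: human_*_prop_adac are the hand-labeled (gold) sentence
# proportions, olmo_*_prop_adac are the machine predictions standing in for
# them on unlabeled tasks, and sampling_prob records which tasks were
# selected for hand-labeling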
dev_model <- dsl(
model = "logit",
formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
+ median_PC4_adac + median_PC3_adac + n_comments_before
+ median_gerrit_reviewers + week_index + as.factor(isAuthorWMF) * as.factor(source),
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
sample_prob = "sampling_prob",
cluster="source",
cross_fit = 3,
sample_split = 20,
data=dsl_df
)
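# summary() reports SEs that account for using the olmo_* predictions in
# place of missing human labels (the point of the dsl estimator)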
summary(dev_model)
#saveRDS(dev_model, "121625_logit_dsl.RDS")
dev_model <- readRDS("dsl/121625_logit_dsl.RDS")
library(broom)
library(dplyr)
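# broom-style tidier for dsl fits: pulls estimates, std. errors, p-values
# (and optionally CIs) out of dsl:::summary.dsl into a tibble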
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
res <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...))
terms <- row.names(res)
cols <- c("estimate" = "Estimate", "std.error" = "Std. Error", "p.value" = "p value")
if (conf.int) {
cols <- c(cols, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
}
out <- as.list(res)[cols]
names(out) <- names(cols)
out <- as_tibble(as.data.frame(out))
out <- dplyr::bind_cols(term = terms, out)
if (exponentiate)
out <- broom:::exponentiate(out)
return(out)
}
coef_df <- tidy.dsl(dev_model)
coef_df <- coef_df |>
mutate(
term = recode(term,
"week_index" = "Weeks from deployment",
"(Intercept)" = "Intercept",
"n_comments_before" = "# of comments prior to resolution",
"median_PC4_adac" = "Median Author PC4 Pre-resolution",
"median_PC3_adac" = "Median Author PC3 Pre-resolution",
"median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
"human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
"human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'",
"human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'",
"as.factor(source)c3" = "HTTP-deprecation (factor)",
"as.factor(source)c2" = "HTTPS-login (factor)",
"as.factor(isAuthorWMF)TRUE" = "WMF-affiliated Author (factor)",
"as.factor(isAuthorWMF)FALSE" = "Nonaffiliated Author (factor)",
"as.factor(isAuthorWMF)FALSE:as.factor(source)c2" = "Nonaffiliated Author:HTTPS-login",
"as.factor(isAuthorWMF)FALSE:as.factor(source)c3" = "Nonaffiliated Author:HTTP-deprecation",
"as.factor(isAuthorWMF)TRUE:as.factor(source)c2" = "WMF-affiliated Author:HTTPS-login",
"as.factor(isAuthorWMF)TRUE:as.factor(source)c3" = "WMF-affiliated Author:HTTP-deprecation",
),
term = factor(term, levels = rev(c(
"Intercept",
"% of sentences discussing 'Existent Problems'",
"% of sentences discussing 'Solutions'",
"% of sentences discussing 'Record Keeping'",
"Median Author PC4 Pre-resolution",
"Median Author PC3 Pre-resolution",
"# of comments prior to resolution",
"Median # of Code Reviewers (Gerrit)",
"Weeks from deployment",
"HTTPS-login (factor)",
"HTTP-deprecation (factor)",
"Nonaffiliated Author (factor)",
"WMF-affiliated Author (factor)",
"Nonaffiliated Author:HTTPS-login",
"WMF-affiliated Author:HTTPS-login",
"Nonaffiliated Author:HTTP-deprecation",
"WMF-affiliated Author:HTTP-deprecation"
)))
)
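# coefficient plot: points are estimates, bars are 95% Wald intervals
# (estimate +/- 1.96 * SE)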
dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
geom_point(size = 1) +
geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96*std.error), width = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Log-odds Coefficient Estimate",
y = "Variable") +
theme_minimal()
dsl_coefs
ggsave(
filename = "121625_dsl_coefs.png",
plot = dsl_coefs,
width = 6, # inches
height = 6, # inches
dpi = 800 # high resolution
)
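# note: estimates are log-odds; exp(estimate) gives an odds ratio, e.g. a
# coefficient of 0.5 implies exp(0.5) ~ 1.65x the odds of resolution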

dsl/121625_logit_dsl.RDS (new binary file, not shown)


Can't render this file because it is too large.


Can't render this file because it is too large.


Can't render this file because it is too large.


@@ -1,7 +1,7 @@
library(tidyverse)
library(dsl)
dsl_csv <-"~/dsl/120725_DSL_frame.csv"
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>
@@ -69,7 +69,7 @@ summary(felm_model)
dev_model <- dsl(
model = "logit",
formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
+ median_PC4_adac + median_PC3_adac + n_comments_before
+ median_PC4_adac + median_PC3_adac + median_PC1_adac + n_comments_before
+ median_gerrit_reviewers + median_gerrit_loc_delta
+ week_index + as.factor(isAuthorWMF) * as.factor(source),
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
@@ -80,9 +80,9 @@ dev_model <- dsl(
sample_split = 20,
data=dsl_df
)
#summary(dev_model)
summary(dev_model)
#saveRDS(dev_model, "120725_logit_dsl.RDS")
dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
#dev_model <- readRDS("dsl/120725_logit_dsl.RDS")
library(broom)
library(dplyr)
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
@@ -109,6 +109,7 @@ coef_df <- coef_df |>
"n_comments_before" = "# of comments prior to resolution",
"median_PC4_adac" = "Median Author PC4 Pre-resolution",
"median_PC3_adac" = "Median Author PC3 Pre-resolution",
"median_PC1_adac" = "Median Author PC1 Pre-resolution",
"median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
"median_gerrit_loc_delta" = "Median LoC Changed (Gerrit)",
"human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
@@ -127,6 +128,7 @@ coef_df <- coef_df |>
"% of sentences discussing 'Record Keeping'",
"Median Author PC4 Pre-resolution",
"Median Author PC3 Pre-resolution",
"Median Author PC1 Pre-resolution",
"# of comments prior to resolution",
"Median # of Code Reviewers (Gerrit)",
"Median LoC Changed (Gerrit)",


@@ -1,6 +1,6 @@
library(tidyverse)
unified_csv <-"~/analysis_data/120725_unified.csv"
unified_csv <-"~/analysis_data/121625_unified.csv"
unified_df <- read.csv(unified_csv, header = TRUE)
# 1. aggregate to the task level
@@ -9,7 +9,7 @@ unified_df <- read.csv(unified_csv, header = TRUE)
# 1c.
valid_categories <- c('EXPECTED BEHAVIOR', 'MOTIVATION','OBSERVED BUG BEHAVIOR',
'BUG REPRODUCTION', 'INVESTIGATION AND EXPLORATION', 'SOLUTION DISCUSSION',
'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN',
'CONTRIBUTION AND COMMITMENT', 'TASK PROGRESS', 'TESTING', 'FUTURE PLAN', 'FUTURE PLANS',
'POTENTIAL NEW ISSUES AND REQUESTS', 'SOLUTION USAGE',
'WORKAROUNDS', 'ISSUE CONTENT MANAGEMENT', 'ACTION ON ISSUE',
'SOCIAL CONVERSATION')
@@ -204,15 +204,15 @@ task_level_variables <- unified_df |>
group_by(TaskPHID) |>
summarise(median_gerrit_loc_delta = median(gerrit_code_insertions + gerrit_code_deletions, na.rm = TRUE),
median_gerrit_reviewers = median(gerrit_reviewer_count, na.rm = TRUE),
median_PC3 = median(PC3),
median_PC3_adac = median(PC3[ADAC==1]),
median_PC3_no_adac = median(PC3[ADAC==0]),
median_PC1 = median(PC1),
median_PC1_adac = median(PC1[ADAC==1]),
median_PC1_no_adac = median(PC1[ADAC==0]),
median_PC4 = median(PC4),
median_PC4_adac = median(PC4[ADAC==1]),
median_PC4_no_adac = median(PC4[ADAC==0]),
median_PC3 = median(PC3, na.rm = TRUE),
median_PC3_adac = median(PC3[ADAC==1], na.rm = TRUE),
median_PC3_no_adac = median(PC3[ADAC==0], na.rm = TRUE),
median_PC1 = median(PC1, na.rm = TRUE),
median_PC1_adac = median(PC1[ADAC==1], na.rm = TRUE),
median_PC1_no_adac = median(PC1[ADAC==0], na.rm = TRUE),
median_PC4 = median(PC4, na.rm = TRUE),
median_PC4_adac = median(PC4[ADAC==1], na.rm = TRUE),
median_PC4_no_adac = median(PC4[ADAC==0], na.rm = TRUE),
n_comments = sum(!is.na(id)),
n_comments_before = sum(before_close)
)
@@ -221,7 +221,7 @@ descriptions <- unified_df |>
filter(comment_type == "task_description")|>
select(TaskPHID, task_title, date_created, date_closed, isAuthorWMF,
source, phase, week_index, author_closer, resolution_outcome, priority,
gerrit_repo, task_status)
gerrit_repo, status)
task_level_variables <- task_level_variables |>
left_join(
@@ -242,7 +242,7 @@ task_level_variables <- task_level_variables |>
)
# 2. assign sampling prob for different tasks
# need to ID those selected in the first round of sampling that were removed for the second round of sampling
large_human_labels_csv <- "~/analysis_data/102125_constituent_dfs/102025_human_labels.csv"
large_human_labels_csv <- "~/analysis_data/121625_constituent_dfs/102025_human_labels.csv"
large_human_labels_df <- read.csv(large_human_labels_csv, header = TRUE)
first_sample_tasks <- unique(as.character(large_human_labels_df$TaskPHID))
# refer to DSL specification sheet
@@ -258,37 +258,10 @@ task_level_variables <- task_level_variables |>
) |>
select(-isFirstSample) |>
mutate(dsl_score = ifelse(resolution_outcome == "TRUE", 1, 0)) |>
mutate(TTR = (date_closed - date_created)/3600)
mutate(TTR_hours = (date_closed - date_created)/3600)
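# date_closed and date_created are epoch seconds, so /3600 gives hours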
# 3. check validity of different aggregate variables
mean(task_level_variables$sampling_prob)
table(task_level_variables$resolution_outcome)
# look at bivariate plots
ggplot(task_level_variables, aes(
x = as.factor(source),
y = week_index,
fill = resolution_outcome
)) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of week_index against Resolution Outcome",
x = "Case",
y = "Week Index",
fill = "Resolution Outcome"
)
ggplot(task_level_variables,
aes(
x=as.factor(source),
y=olmo_RK_prop,
fill=as.factor(source)
)) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
scale_fill_viridis_d() +
theme_minimal()
# 4. save
write.csv(task_level_variables, "120725_DSL_frame.csv", row.names = FALSE)
write.csv(task_level_variables, "121625_DSL_frame.csv", row.names = FALSE)


@@ -1,7 +1,7 @@
library(tidyverse)
#library(dsl)
library(dplyr)
dsl_csv <-"~/dsl/120725_DSL_frame.csv"
dsl_csv <-"~/dsl/126725_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
dsl_df <- dsl_df |>

dsl/rq2_plot.R (new file, 53 lines)

@@ -0,0 +1,53 @@
library(tidyverse)
#library(dsl)
library(dplyr)
dsl_csv <-"~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)
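# BzImport (the Bugzilla import bot) is its own affiliation level, so drop it
# from the WMF vs. non-WMF comparison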
dsl_df <- dsl_df |>
filter(isAuthorWMF != "BzImport")
dsl_df_long <- dsl_df %>%
pivot_longer(
cols = c(olmo_EP_prop_adac, olmo_RK_prop_adac, olmo_TSOL_prop_adac),
names_to = "tag",
values_to = "proportion"
) %>%
mutate(tag = gsub("olmo_|_prop_adac", "", tag),
tag = case_when(
tag == "EP" ~ "Existent Problem",
tag == "RK" ~ "Record Keeping",
tag =="TSOL" ~ "Solutions"
))
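# long format: one row per (task, tag) pair so the boxplots can group by tag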
olmo_comparison <- ggplot(
dsl_df_long,
aes(
x = tag,
y = proportion,
fill = isAuthorWMF,
)
) +
facet_grid(source ~ .,
scales = "free_y",
labeller = labeller(source = c("c1" = "VisualEditor",
"c2" = "HTTPS-login",
"c3" = "HTTP-deprecation"))) +
geom_boxplot() +
theme_minimal() +
scale_fill_viridis_d() +
labs(
x = "Tag",
y = "% of sentences tagged",
color = "Is Author WMF?",
fill = "Is Author WMF?"
) +
theme(legend.position = "top")
olmo_comparison
ggsave(
filename = "121625_machine_label_comparison.png",
plot = olmo_comparison,
width = 12, # inches
height = 6, # inches
dpi = 800 # high resolution
)


@@ -1,17 +0,0 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:46483 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: NeI7LSiR2rI9GCHZLNWB
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 31856137


@@ -1,30 +1,65 @@
library(tidyverse)
library(dplyr)
#neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
#neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
#neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
#neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
#pca_csv <- "~/p2/quest/102025_total_pca_df.csv"
#pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text)
main_csv <- "~/analysis_data/120725_unified.csv"
main_csv <- "~/analysis_data/121625_unified.csv"
main_df <- read.csv(main_csv , header = TRUE)
length(unique(main_df$id))
main_df |>
preprocess_comment <- function(message) {
library(stringr)
comment_text <- message
# 1. replace code with CODE
# Inline code: `...`
comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
# Block code: ```...```
comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
# 2. replace quotes with QUOTE
lines <- unlist(strsplit(comment_text, "\n"))
lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
comment_text <- paste(lines, collapse = "\n")
# 3. replace Gerrit URLs with GERRIT_URL
gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
# replace URL with URL
url_pattern <- "https?://[^\\s]+"
comment_text <- str_replace_all(comment_text, url_pattern, "URL")
# 4. replace @screenname with SCREEN_NAME
cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "SCREEN_NAME")
return(cleaned_message)
}
main_df$cleaned_comment <- sapply(main_df$comment_text, preprocess_comment)
# look at the representative comments at the high and low ends of PC3
top5 <- main_df %>%
arrange(desc(PC3)) %>%
slice(250:260) %>%
pull(cleaned_comment)
bottom5 <- main_df %>%
arrange(PC3) %>%
slice(250:260) %>%
pull(cleaned_comment)
cat("Top 300:310 comment_text by PC2 score:\n")
print(top5)
cat("\nBottom 300:310 comment_text by PC2 score:\n")
print(bottom5)
comments_style <- main_df |>
ggplot(
aes(
x = PC4,
y = PC3,
x = PC1,
y = PC4,
fill = comment_type
)
) +
facet_grid(~source, scales="fixed",
labeller = as_labeller(c(
"c1" = "VisualEditor (c1)",
"c2" = "HTTPS-as-default (c2)",
"c3" = "HTTP-deprecation (c3)"
"c1" = "VisualEditor",
"c2" = "HTTPS-login",
"c3" = "HTTP-deprecation"
))) +
geom_point(shape = 21, alpha=0.3, size=2) +
xlim(-50, 50) +
@@ -36,40 +71,54 @@ main_df |>
theme_minimal() +
theme(legend.position = "top") +
labs(
title = "PCs for Task Comments by comment type and case",
x = "Casual v. Formal Updates (PC3)",
y = "Technical-matter v. Procedural Commentary (PC4)",
x = "Lengthy Discussion v. Brief Updates (PC1)",
y = "Technical Jargon v. Non-technical Observations (PC4)",
)
ggsave(
filename = "121625_comments_style.png",
plot = comments_style,
width = 12, # inches
height = 8, # inches
dpi = 800 # high resolution
)
main_df |>
adac_style <- main_df |>
filter(ADAC == 1) |>
ggplot(
aes(
x = PC4,
y = PC3,
fill = as.factor(ADAC)
x = PC3,
y = PC4,
fill = as.factor(isAuthorWMF)
)
) +
facet_grid(comment_type~source,
facet_grid(~source,
labeller = as_labeller(c(
"c1" = "VisualEditor (c1)",
"c2" = "HTTPS-as-default (c2)",
"c3" = "HTTP-deprecation (c3)",
"c1" = "VisualEditor",
"c2" = "HTTPS-login",
"c3" = "HTTP-deprecation",
"task_description" = "Task Description",
"task_subcomment" = "Follow-up Reply"
))) +
geom_point(shape = 21, alpha=0.3, size=2) +
scale_fill_viridis_d(
name = "Comment Author Affiliation",
labels = c("Nonaffiliated", "WMF-affiliated"))+
xlim(-50, 50) +
ylim(-50, 50) +
scale_fill_viridis_d()+
theme_minimal() +
theme(legend.position = "top") +
labs(
title = "PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)",
x = "Casual v. Formal Updates (PC3)",
y = "Technical-matter v. Procedural Commentary (PC4)",
x = "Expressive, first-person v. Dry, third-person (PC3)",
y = "Technical Jargon v. Non-technical Observations (PC4)",
)
#"PCs for Pre-Resolution Comments Written by Task Author (by Author Affiliation, Case, and Comment Type)"
ggsave(
filename = "121625_adac_affil_style.png",
plot = adac_style,
width = 12, # inches
height = 8, # inches
dpi = 800 # high resolution
)
main_df |>
filter(comment_type=="task_subcomment") |>
@@ -104,221 +153,3 @@ main_df <- main_df |>
comment_wordcount = as.integer(stringr::str_count(tidyr::replace_na(as.character(comment_text), ""), "\\S+"))
)
description_df <- main_df |>
filter(comment_type == "task_description")
replies_df <- main_df |>
filter(comment_type == "task_subcomment") |>
filter(isGerritBot != TRUE)
library(ggplot2)
ggplot(replies_df, aes(x = PC3, y = PC4, fill = isAuthorWMF)) +
facet_grid(ADAC~source, scales="fixed") +
geom_point(shape = 21, alpha=0.15, size=3) +
xlim(-50, 50) +
ylim(-50, 50) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "PCs for Task Comments (Faceted by source (column))",
x = "PC3",
y = "PC4",
)
replies_df |>
ggplot(aes(
x = as.factor(author_closer.y), # x-axis grouping
y = PC1.x,
fill = resolution_outcome
)) +
ylim(-30, 30) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source.x, scales = "fixed") +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of PC1",
x = "Author closed task?",
y = "PC1",
fill = "Resolution Outcome"
)
description_df |>
ggplot(aes(
x = as.factor(author_closer), # x-axis grouping
y = PC4,
fill = resolution_outcome
)) +
facet_grid( ~ source, scales = "fixed") +
ylim(-40, 40) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of PC4",
x = "Comment_type",
y = "PC4",
fill = "isAuthorWMF?"
)
main_df <- main_df |>
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
# Join main_df to neurobiber_description_pca_df
description_joined <- main_df |>
right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
filter(comment_text != "nan") #TODO: look at this more in depth
# Join main_df to neurobiber_subcomment_pca_df
subcomment_joined <- main_df |>
right_join(neurobiber_subcomment_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
filter(comment_text != "nan") #TODO: look at this more in depth
total_joined <- main_df |>
right_join(pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
filter(comment_text != "nan") #TODO: look at this more in depth
preprocess_comment <- function(message) {
library(stringr)
comment_text <- message
# 1. replace code with CODE
# Inline code: `...`
comment_text <- str_replace_all(comment_text, "`[^`]+`", "CODE")
# Block code: ```...```
comment_text <- str_replace_all(comment_text, "```[\\s\\S]+?```", "CODE")
# 2. replace quotes with QUOTE
lines <- unlist(strsplit(comment_text, "\n"))
lines <- ifelse(str_detect(str_trim(lines), "^>"), "QUOTE", lines)
comment_text <- paste(lines, collapse = "\n")
# 3. replace Gerrit URLs with GERRIT_URL
gerrit_url_pattern <- "https://gerrit\\.wikimedia\\.org/r/\\d+"
comment_text <- str_replace_all(comment_text, gerrit_url_pattern, "GERRIT_URL")
# replace URL with URL
url_pattern <- "https?://[^\\s]+"
comment_text <- str_replace_all(comment_text, url_pattern, "URL")
# 4. replace @screenname with SCREEN_NAME
cleaned_message <- str_replace_all(comment_text, "(^|\\s)@\\w+", "SCREEN_NAME")
return(cleaned_message)
}
# Add comment_type column to each df
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"
#clean the messages
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
total_joined$cleaned_comment <- sapply(total_joined$text, preprocess_comment)
subcomment_joined <- subcomment_joined %>%
mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
paste(neurobiber_description_pca_df$AuthorPHID,
neurobiber_description_pca_df$TaskPHID)))
# look at correlation between PC1, PC2, and different outcome variables
description_anova_results <- neurobiber_description_pca_df %>%
group_by(source) %>%
group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
description_anova_results
discussion_anova_results <- neurobiber_subcomment_pca_df %>%
group_by(source) %>%
group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
discussion_anova_results
# look at the representative comments at the high and low ends of PC4
top5 <- total_joined %>%
arrange(desc(PC4)) %>%
slice(300:310) %>%
pull(cleaned_comment)
bottom5 <- total_joined %>%
arrange(PC4) %>%
slice(300:310) %>%
pull(cleaned_comment)
cat("Top 300:310 comment_text by PC2 score:\n")
print(top5)
cat("\nBottom 300:310 comment_text by PC2 score:\n")
print(bottom5)
library(scales)
library(ggplot2)
affiliationColors <-
setNames( c('#5da2d8', '#c7756a')
,c("False", "True"))
subcomment_joined_no_gerrit <- subcomment_joined |>
filter(isGerritBot != "TRUE") |>
left_join(neurobiber_description_pca_df |> select(TaskPHID, priority), by = "TaskPHID")
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
# geom_point(shape = 21, alpha=0.4, size=2) +
# geom_bin_2d() +
sampled_authors <- subcomment_joined_no_gerrit %>%
distinct(AuthorPHID) %>%
sample_n(100) %>%
pull(AuthorPHID)
# 2. Filter original data to just those authors
sub_sample <- subcomment_joined_no_gerrit %>%
filter(AuthorPHID %in% sampled_authors)
description_sampled_authors <- description_joined %>%
distinct(AuthorPHID) %>%
sample_n(8) %>%
pull(AuthorPHID)
# 2. Filter original data to just those authors
description_sub_sample <- description_joined %>%
filter(AuthorPHID %in% description_sampled_authors)
ggplot(total_joined, aes(x = PC4, y = PC3, fill = comment_type)) +
facet_grid(source~phase, scales="fixed") +
geom_point(shape = 21, alpha=0.3, size=2) +
xlim(-30, 30) +
ylim(-30, 30) +
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "PCs for Task Comments (Faceted by source and phase)",
x = "PC4",
y = "PC3",
)
priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")
subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>%
mutate(priority = factor(priority, levels = priority_order))
description_joined <- description_joined %>%
mutate(priority = factor(priority.y, levels = priority_order))
ggplot(total_joined, aes(
x = as.factor(comment_type), # x-axis grouping
y = PC3,
fill = isAuthorWMF
)) +
ylim(-30, 30) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
scale_fill_viridis_d() +
theme_minimal() +
labs(
title = "Boxplot of PC3",
x = "Comment_type",
y = "PC3",
fill = "isAuthorWMF?"
)