From acd8964e73073da8a2c8f7bcdf88e954f50ffaea Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login03.hyak.local>
Date: Thu, 25 Sep 2025 14:09:39 -0700
Subject: [PATCH] preliminary EDA on the PCA analysis

---
 mgaughan-rstudio-server_29836350.out | 17 +++++++++
 p2/quest/neurobiber_PCA_analysis.R   | 54 ++++++++++++++++------------
 2 files changed, 48 insertions(+), 23 deletions(-)
 create mode 100644 mgaughan-rstudio-server_29836350.out

diff --git a/mgaughan-rstudio-server_29836350.out b/mgaughan-rstudio-server_29836350.out
new file mode 100644
index 0000000..be772df
--- /dev/null
+++ b/mgaughan-rstudio-server_29836350.out
@@ -0,0 +1,17 @@
+1. SSH tunnel from your workstation using the following command:
+
+   ssh -N -L 8787:n3439:44313 mjilg@klone.hyak.uw.edu
+
+   and point your web browser to http://localhost:8787
+
+2. log in to RStudio Server using the following credentials:
+
+   user: mjilg
+   password: <REDACTED>
+
+When done using RStudio Server, terminate the job by:
+
+1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
+2. Issue the following command on the login node:
+
+      scancel -f 29836350
diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R
index b547d17..de74d4b 100644
--- a/p2/quest/neurobiber_PCA_analysis.R
+++ b/p2/quest/neurobiber_PCA_analysis.R
@@ -1,12 +1,19 @@
 library(tidyverse)
 
-neurobiber_description_pca_csv <-"~/p2/quest/090425_description_PCA_df.csv"
+neurobiber_description_pca_csv <-"~/p2/quest/092325_description_PCA_df.csv"
 neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE) 
 
-neurobiber_subcomment_pca_csv <-"~/p2/quest/090425_subcomment_PCA_df.csv"
+neurobiber_subcomment_pca_csv <-"~/p2/quest/092325_subcomment_PCA_df.csv"
 neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE)
 
 
+# Label every row with its origin so the two sources stay distinguishable after stacking
+neurobiber_description_pca_df[["comment_type"]] <- "task_description"
+neurobiber_subcomment_pca_df[["comment_type"]] <- "subcomment"
+
+# Stack both data frames row-wise; rbind() requires them to share the same column names
+unified_df <- rbind(neurobiber_description_pca_df, neurobiber_subcomment_pca_df)
+
 # look at correlation between PC1, PC2, and different outcome variables 
 library(dplyr)
 description_anova_results <- neurobiber_description_pca_df %>%
@@ -20,27 +27,25 @@ discussion_anova_results <- neurobiber_subcomment_pca_df %>%
 discussion_anova_results
 
 # look at the representative comments for PC1 and PC2
-top5 <- neurobiber_subcomment_pca_df %>%
-  filter(source=="c2") |>
-  arrange(desc(PC2)) %>%
-  slice(15:30) %>%
+top5 <- neurobiber_description_pca_df %>%  # task descriptions ranked by PC1, highest first
+  arrange(desc(PC1)) %>%
+  slice(500:510) %>%  # ranks 500-510; keep in sync with the cat() label below
   pull(text)
 
-bottom5 <- neurobiber_subcomment_pca_df %>%
-  filter(source=="c2") |>
-  arrange(PC2) %>%
-  slice(15:30) %>%
+bottom5 <- neurobiber_description_pca_df %>%  # same ranking, read from the low end of PC1
+  arrange(PC1) %>%
+  slice(500:510) %>%  # ranks 500-510; keep in sync with the cat() label below
   pull(text)
 
-cat("Top 15:30 comment_text by score:\n")
+cat("Top 500:510 comment_text by score:\n")
 print(top5)
 
-cat("\nBottom 15:30 comment_text by score:\n")
+cat("\nBottom 500:510 comment_text by score:\n")
 print(bottom5)
 
 
 aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
-  group_by(AuthorWMFAffil, week_index, source, priority) %>%
+  group_by(AuthorWMFAffil, week_index, source, priority, closed_relevance, phase) %>%
   summarise(mean_PC1 = median(PC1),
             mean_PC2 = median(PC2),
             mean_PC3 = median(PC3),
@@ -62,13 +67,16 @@ long_df <- aggregated_neurobiber_description_pca_df %>%
     values_to = "PC_value"
   )
 
-ggplot(long_df, aes(x = week_index, y = PC_value, color = AuthorWMFAffil, group = AuthorWMFAffil)) +
-  geom_line(size = 1) +
-  facet_grid(PC ~ source, scales = "free_y") +
-  scale_color_manual(values = affiliationColors, name = "WMF Affiliation") +
-  scale_x_continuous(breaks = pretty_breaks()) +
-  scale_y_continuous(limits = c(-10, 10)) + 
-  labs(x = "Week Index", y = "Mean PC Value",
-       title = "Weekly Median PC Values by Source and PC, Colored by WMF Affiliation") +
-  theme_minimal(base_size = 14) +
-  theme(legend.position = "top")
\ No newline at end of file
+# Two-level factor for the fill; toupper(as.character()) also covers read.csv parsing True/False as logical
+unified_df$AuthorWMFAffil <- factor(toupper(as.character(unified_df$AuthorWMFAffil)),
+                                    levels = c("FALSE", "TRUE"), labels = c("False", "True"))
+# Sort so "True" rows come last: geom_point draws in row order, keeping them on top
+unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
+ggplot(unified_df, aes(x = PC3, y = PC4, fill = AuthorWMFAffil)) +
+  geom_point(shape = 21, alpha = 0.3, size = 2) +
+  facet_grid(source ~ phase) +
+  scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
+  theme_minimal() +
+  labs(title = "PCs for All Comments (Faceted by Source and Phase)",
+       x = "PC3", y = "PC4",
+       fill = "WMF Affiliation") # legend shows AuthorWMFAffil, not comment_type