From 7f89fd1966628435e641e5e9c8d9d008768b9263 Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login01.hyak.local>
Date: Wed, 1 Oct 2025 20:58:55 -0700
Subject: [PATCH] updated PCA analysis, ready for rob tomorrow

---
 dsl/human_sampling.R                          | 23 ++++++++
 ...ut => mgaughan-rstudio-server_29944433.out |  6 +-
 p2/quest/neurobiber_PCA_analysis.R            | 56 ++++++++++---------
 3 files changed, 55 insertions(+), 30 deletions(-)
 create mode 100644 dsl/human_sampling.R
 rename mgaughan-rstudio-server_29920945.out => mgaughan-rstudio-server_29944433.out (77%)

diff --git a/dsl/human_sampling.R b/dsl/human_sampling.R
new file mode 100644
index 0000000..1d3b9e9
--- /dev/null
+++ b/dsl/human_sampling.R
@@ -0,0 +1,23 @@
+library(tidyverse)
+# Draw up to 30 tasks per source, then explode them into one row per extracted sentence.
+main_csv <- "~/analysis_data/092925_unified_phab.csv"
+main_df <- read.csv(main_csv, header = TRUE)
+
+set.seed(123) # fixed seed so the same tasks are drawn on every run
+# Cap the draw at the group's task count: sample() errors when size exceeds the population.
+sampled_df <- main_df %>%
+  group_by(source) %>%
+  mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), min(30, n_distinct(TaskPHID)))) %>%
+  ungroup() %>%
+  filter(sampled_TaskPHID) %>%
+  select(-sampled_TaskPHID)
+# Pull every single-quoted chunk out of olmo_cleaned_sentences; the ", " separators also match and are dropped.
+sentence_level_sample <- sampled_df |>
+  mutate(cleaned_sentences = str_extract_all(olmo_cleaned_sentences, "(?<=')[^']+(?=')")) |>
+  unnest(cleaned_sentences) |>
+  filter(cleaned_sentences != ", ") |>
+  select(-olmo_sentence_categories, -starts_with("normalized"), -starts_with("gerrit"))
+
+(nrow(sentence_level_sample) / 293) * 1.5 # NOTE(review): 293 and 1.5 are undocumented constants; confirm their meaning
+
+#write.csv(sentence_level_sample, "100125_human_info_sample.csv", row.names = FALSE)
\ No newline at end of file
diff --git a/mgaughan-rstudio-server_29920945.out b/mgaughan-rstudio-server_29944433.out
similarity index 77%
rename from mgaughan-rstudio-server_29920945.out
rename to mgaughan-rstudio-server_29944433.out
index c0d31d1..69c10fd 100644
--- a/mgaughan-rstudio-server_29920945.out
+++ b/mgaughan-rstudio-server_29944433.out
@@ -1,17 +1,17 @@
 1. SSH tunnel from your workstation using the following command:
 
-   ssh -N -L 8787:n3439:53255 mjilg@klone.hyak.uw.edu
+   ssh -N -L 8787:n3441:52613 mjilg@klone.hyak.uw.edu
 
    and point your web browser to http://localhost:8787
 
 2. log in to RStudio Server using the following credentials:
 
    user: mjilg
-   password: eSK3QbcwgGpUya1wJIvC
+   password: YBcIVAgxBCfkvg2tbQqI
 
 When done using RStudio Server, terminate the job by:
 
 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
 2. Issue the following command on the login node:
 
-      scancel -f 29920945
+      scancel -f 29944433
diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R
index 56e14d4..e05c8da 100644
--- a/p2/quest/neurobiber_PCA_analysis.R
+++ b/p2/quest/neurobiber_PCA_analysis.R
@@ -1,9 +1,9 @@
 library(tidyverse)
 
-neurobiber_description_pca_csv <-"~/p2/quest/092325_description_PCA_df.csv"
+neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv"
 neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE) 
 
-neurobiber_subcomment_pca_csv <-"~/p2/quest/092325_subcomment_PCA_df.csv"
+neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv"
 neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE)
 
 
@@ -38,6 +38,10 @@ neurobiber_subcomment_pca_df$comment_type <- "subcomment"
 neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
 neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
 
+# Flag subcomments whose (AuthorPHID, TaskPHID) pair also occurs in the description table.
+neurobiber_subcomment_pca_df$pair_in_description <- with(
+  neurobiber_subcomment_pca_df, paste(AuthorPHID, TaskPHID)) %in%
+  with(neurobiber_description_pca_df, paste(AuthorPHID, TaskPHID))
 
 # look at correlation between PC1, PC2, and different outcome variables 
 library(dplyr)
@@ -53,29 +57,22 @@ discussion_anova_results
 
 # look at the representative comments for PC1 and PC2
 top5 <- neurobiber_subcomment_pca_df %>%
-  arrange(desc(PC6)) %>%
+  arrange(desc(PC2)) %>%
   slice(300:310) %>%
   pull(cleaned_comment)
 
 bottom5 <- neurobiber_subcomment_pca_df %>%
-  arrange(PC6) %>%
+  arrange(PC2) %>%
   slice(300:310) %>%
   pull(cleaned_comment)
 
-cat("Top 300:310 comment_text by PC1 score:\n")
+cat("Top 300:310 comment_text by PC2 score:\n")
 print(top5)
 
 cat("\nBottom 300:310 comment_text by PC1 score:\n")
 print(bottom5)
 
 
-aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
-  group_by(AuthorWMFAffil, week_index, source, priority, closed_relevance, phase) %>%
-  summarise(mean_PC1 = median(PC1),
-            mean_PC2 = median(PC2),
-            mean_PC3 = median(PC3),
-            mean_PC4 = median(PC4),
-            mean_PC5 = median(PC5))
 library(scales)
 library(ggplot2)
 
@@ -85,23 +82,28 @@ affiliationColors <-
             ,c("False", "True"))
 
 
-long_df <- aggregated_neurobiber_description_pca_df %>%
-  tidyr::pivot_longer(
-    cols = starts_with("mean_PC"),
-    names_to = "PC",
-    values_to = "PC_value"
-  )
+# Attach each task's description-level priority to its subcomments; keep only the two extreme levels.
+neurobiber_subcomment_pca_df_x <- neurobiber_subcomment_pca_df %>%
+  left_join(
+    # one row per (TaskPHID, priority), so repeated description rows cannot multiply subcomment rows
+    distinct(neurobiber_description_pca_df, TaskPHID, priority),
+    by = "TaskPHID") %>% # NOTE(review): priority.y presumes the subcomment table has its own priority column
+  filter(priority.y %in% c("Lowest", "Unbreak Now!"))
 
-unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
-unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
-ggplot(neurobiber_description_pca_df, aes(x = PC1, y = PC3, fill = closed_relevance)) +
-  geom_point(shape = 21, alpha=0.3, size=2) +
-  facet_grid(source ~ phase) +
+neurobiber_description_pca_df <- neurobiber_description_pca_df |> # NOTE: overwrites the full table with the two-level subset
+  filter(priority %in% c("Lowest","Unbreak Now!"))
+#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
+#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
+ggplot(neurobiber_description_pca_df, aes(x = PC3, y = PC8, fill = priority)) + # PC3 vs PC8, filled by priority
+  geom_point(shape = 21, alpha=0.4, size=2) +
+  facet_grid(source ~ phase, scales="fixed") +
+  xlim(-10, 10) + # points outside [-10, 10] are removed from the plot rather than clipped
+  ylim(-10, 10) +
   scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
   theme_minimal() +
   labs(
-    title = "PCs for Task Subcomments (Faceted by Source and Phase)",
-    x = "PC1",
-    y = "PC3",
-    fill = "(tentative affiliation)"
+    title = "PCs for Task Descriptions (Faceted by Source and Phase)",
+    x = "PC3",
+    y = "PC8",
+    fill = "priority"
   )