preliminary EDA on the PCA analysis

2025-09-25 14:09:39 -07:00 · 2025-09-25 14:09:39 -07:00 · acd8964e73
commit acd8964e73
parent b21ecb02c3
2 changed files with 48 additions and 23 deletions
--- a/mgaughan-rstudio-server_29836350.out
+++ b/mgaughan-rstudio-server_29836350.out
@ -0,0 +1,17 @@
+1. SSH tunnel from your workstation using the following command:
+
+   ssh -N -L 8787:n3439:44313 mjilg@klone.hyak.uw.edu
+
+   and point your web browser to http://localhost:8787
+
+2. log in to RStudio Server using the following credentials:
+
+   user: mjilg
+   password: <redacted: session credentials should not be committed>
+
+When done using RStudio Server, terminate the job by:
+
+1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
+2. Issue the following command on the login node:
+
+      scancel -f 29836350
--- a/p2/quest/neurobiber_PCA_analysis.R
+++ b/p2/quest/neurobiber_PCA_analysis.R
@ -1,12 +1,19 @@
 library(tidyverse)

-neurobiber_description_pca_csv <-"~/p2/quest/090425_description_PCA_df.csv"
+neurobiber_description_pca_csv <-"~/p2/quest/092325_description_PCA_df.csv"
 neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE) 

-neurobiber_subcomment_pca_csv <-"~/p2/quest/090425_subcomment_PCA_df.csv"
+neurobiber_subcomment_pca_csv <-"~/p2/quest/092325_subcomment_PCA_df.csv"
 neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE)


+# Label every row with the kind of comment it came from, so the two sources
+# remain distinguishable once they are stacked into a single data frame.
+neurobiber_description_pca_df[["comment_type"]] <- "task_description"
+neurobiber_subcomment_pca_df[["comment_type"]] <- "subcomment"
+
+# Stack both frames row-wise. rbind() on data frames matches columns by name
+# and errors if the two frames do not share the same set of columns.
+unified_df <- do.call(
+  rbind,
+  list(neurobiber_description_pca_df, neurobiber_subcomment_pca_df)
+)
+
 # look at correlation between PC1, PC2, and different outcome variables 
 library(dplyr)
 description_anova_results <- neurobiber_description_pca_df %>%
@ -20,27 +27,25 @@ discussion_anova_results <- neurobiber_subcomment_pca_df %>%
 discussion_anova_results

 # look at the representative comments for PC1 and PC2
-top5 <- neurobiber_subcomment_pca_df %>%
-  filter(source=="c2") |>
-  arrange(desc(PC2)) %>%
-  slice(15:30) %>%
+# Task-description texts at ranks 500-510 when sorted by descending PC1.
+# Despite the name, `top5` holds up to 11 texts (fewer if the data frame has
+# fewer than 510 rows, because slice() ignores out-of-range positions).
+top5 <- neurobiber_description_pca_df %>%
+  arrange(desc(PC1)) %>%
+  slice(500:510) %>%
  pull(text)

-bottom5 <- neurobiber_subcomment_pca_df %>%
-  filter(source=="c2") |>
-  arrange(PC2) %>%
-  slice(15:30) %>%
+# Same rank window taken from the opposite end: sorted by ascending PC1.
+bottom5 <- neurobiber_description_pca_df %>%
+  arrange(PC1) %>%
+  slice(500:510) %>%
  pull(text)

-cat("Top 15:30 comment_text by score:\n")
+# The labels state the actual slice range and component being sorted on, so
+# the printed output cannot be mistaken for a different rank window.
+cat("Top 500:510 text by PC1:\n")
 print(top5)

-cat("\nBottom 15:30 comment_text by score:\n")
+cat("\nBottom 500:510 text by PC1:\n")
 print(bottom5)


 aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
-  group_by(AuthorWMFAffil, week_index, source, priority) %>%
+  group_by(AuthorWMFAffil, week_index, source, priority, closed_relevance, phase) %>%
  summarise(mean_PC1 = median(PC1),
            mean_PC2 = median(PC2),
            mean_PC3 = median(PC3),
@ -62,13 +67,16 @@ long_df <- aggregated_neurobiber_description_pca_df %>%
    values_to = "PC_value"
  )

-ggplot(long_df, aes(x = week_index, y = PC_value, color = AuthorWMFAffil, group = AuthorWMFAffil)) +
-  geom_line(size = 1) +
-  facet_grid(PC ~ source, scales = "free_y") +
-  scale_color_manual(values = affiliationColors, name = "WMF Affiliation") +
-  scale_x_continuous(breaks = pretty_breaks()) +
-  scale_y_continuous(limits = c(-10, 10)) + 
-  labs(x = "Week Index", y = "Mean PC Value",
-       title = "Weekly Median PC Values by Source and PC, Colored by WMF Affiliation") +
-  theme_minimal(base_size = 14) +
-  theme(legend.position = "top")
+# Build a two-level factor for the affiliation flag.
+# read.csv() converts a column made up only of "True"/"False" strings to
+# logical, and as.character(TRUE) is "TRUE", which would match neither
+# "False" nor "True" and turn every value into NA. Normalising the case first
+# gives the same factor whether the column arrives as logical or character.
+unified_df$AuthorWMFAffil <- factor(
+  tolower(as.character(unified_df$AuthorWMFAffil)),
+  levels = c("false", "true"),
+  labels = c("False", "True")
+)
+# Sort rows by the factor so "True" rows come after "False" rows; points are
+# drawn in row order, so the "True" points end up on top.
+unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
+# Scatter of PC3 against PC4, one panel per source (rows) and phase (columns).
+ggplot(unified_df, aes(x = PC3, y = PC4, fill = AuthorWMFAffil)) +
+  geom_point(shape = 21, alpha=0.3, size=2) +
+  facet_grid(source ~ phase) +
+  scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
+  theme_minimal() +
+  labs(
+    title = "PCs for All Comments (Faceted by Source and Phase)",
+    x = "PC3",
+    y = "PC4",
+    # fill is mapped to AuthorWMFAffil, so the legend is titled accordingly
+    fill = "WMF Affiliation"
+  )