From acd8964e73073da8a2c8f7bcdf88e954f50ffaea Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Thu, 25 Sep 2025 14:09:39 -0700 Subject: [PATCH] preliminary EDA on the PCA analysis --- mgaughan-rstudio-server_29836350.out | 17 +++++++++ p2/quest/neurobiber_PCA_analysis.R | 54 ++++++++++++++++------------ 2 files changed, 48 insertions(+), 23 deletions(-) create mode 100644 mgaughan-rstudio-server_29836350.out diff --git a/mgaughan-rstudio-server_29836350.out b/mgaughan-rstudio-server_29836350.out new file mode 100644 index 0000000..be772df --- /dev/null +++ b/mgaughan-rstudio-server_29836350.out @@ -0,0 +1,17 @@ +1. SSH tunnel from your workstation using the following command: + + ssh -N -L 8787:n3439:44313 mjilg@klone.hyak.uw.edu + + and point your web browser to http://localhost:8787 + +2. log in to RStudio Server using the following credentials: + + user: mjilg + password: inZLmycTjFs4aqmFIMs1 + +When done using RStudio Server, terminate the job by: + +1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) +2. Issue the following command on the login node: + + scancel -f 29836350 diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R index b547d17..de74d4b 100644 --- a/p2/quest/neurobiber_PCA_analysis.R +++ b/p2/quest/neurobiber_PCA_analysis.R @@ -1,12 +1,19 @@ library(tidyverse) -neurobiber_description_pca_csv <-"~/p2/quest/090425_description_PCA_df.csv" +neurobiber_description_pca_csv <-"~/p2/quest/092325_description_PCA_df.csv" neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) -neurobiber_subcomment_pca_csv <-"~/p2/quest/090425_subcomment_PCA_df.csv" +neurobiber_subcomment_pca_csv <-"~/p2/quest/092325_subcomment_PCA_df.csv" neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) +# Add comment_type column to each df +neurobiber_description_pca_df$comment_type <- "task_description" +neurobiber_subcomment_pca_df$comment_type <- "subcomment" + +# Combine them +unified_df <- rbind(neurobiber_description_pca_df, neurobiber_subcomment_pca_df) + # look at correlation between PC1, PC2, and different outcome variables library(dplyr) description_anova_results <- neurobiber_description_pca_df %>% @@ -20,27 +27,25 @@ discussion_anova_results <- neurobiber_subcomment_pca_df %>% discussion_anova_results # look at the representative comments for PC1 and PC2 -top5 <- neurobiber_subcomment_pca_df %>% - filter(source=="c2") |> - arrange(desc(PC2)) %>% - slice(15:30) %>% +top5 <- neurobiber_description_pca_df %>% + arrange(desc(PC1)) %>% + slice(500:510) %>% pull(text) -bottom5 <- neurobiber_subcomment_pca_df %>% - filter(source=="c2") |> - arrange(PC2) %>% - slice(15:30) %>% +bottom5 <- neurobiber_description_pca_df %>% + arrange(PC1) %>% + slice(500:510) %>% pull(text) -cat("Top 15:30 comment_text by score:\n") +cat("Top 10:20 comment_text by score:\n") print(top5) -cat("\nBottom 15:30 comment_text by score:\n") +cat("\nBottom 10:20 comment_text by score:\n") print(bottom5) aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |> - group_by(AuthorWMFAffil, week_index, source, priority) %>% + group_by(AuthorWMFAffil, week_index, source, priority, closed_relevance, phase) %>% summarise(mean_PC1 = median(PC1), mean_PC2 = median(PC2), mean_PC3 = median(PC3), @@ -62,13 +67,16 @@ long_df <- aggregated_neurobiber_description_pca_df %>% values_to = "PC_value" ) -ggplot(long_df, aes(x = week_index, y = PC_value, color = AuthorWMFAffil, group = AuthorWMFAffil)) + - geom_line(size = 1) + - facet_grid(PC ~ source, scales = "free_y") + - scale_color_manual(values = affiliationColors, name = "WMF Affiliation") + - scale_x_continuous(breaks = pretty_breaks()) + - scale_y_continuous(limits = c(-10, 10)) + - labs(x = "Week Index", y = "Mean PC Value", - title = "Weekly Median PC Values by Source and PC, Colored by WMF Affiliation") + - theme_minimal(base_size = 14) + - theme(legend.position = "top") \ No newline at end of file +unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True")) +unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ] +ggplot(unified_df, aes(x = PC3, y = PC4, fill = AuthorWMFAffil)) + + geom_point(shape = 21, alpha=0.3, size=2) + + facet_grid(source ~ phase) + + scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice + theme_minimal() + + labs( + title = "PCs for All Comments (Faceted by Source and Phase)", + x = "PC3", + y = "PC4", + fill = "Comment Type" + )