From 7f89fd1966628435e641e5e9c8d9d008768b9263 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Wed, 1 Oct 2025 20:58:55 -0700 Subject: [PATCH] updated PCA analysis, ready for rob tomorrow --- dsl/human_sampling.R | 23 ++++++++ ...ut => mgaughan-rstudio-server_29944433.out | 6 +- p2/quest/neurobiber_PCA_analysis.R | 56 ++++++++++--------- 3 files changed, 55 insertions(+), 30 deletions(-) create mode 100644 dsl/human_sampling.R rename mgaughan-rstudio-server_29920945.out => mgaughan-rstudio-server_29944433.out (77%) diff --git a/dsl/human_sampling.R b/dsl/human_sampling.R new file mode 100644 index 0000000..1d3b9e9 --- /dev/null +++ b/dsl/human_sampling.R @@ -0,0 +1,23 @@ +library(tidyverse) + +main_csv <-"~/analysis_data/092925_unified_phab.csv" +main_df <- read.csv(main_csv, header = TRUE) + +set.seed(123) # For reproducibility + +sampled_df <- main_df %>% + group_by(source) %>% + mutate(sampled_TaskPHID = TaskPHID %in% sample(unique(TaskPHID), 30)) %>% + ungroup() %>% + filter(sampled_TaskPHID) %>% + select(-sampled_TaskPHID) + +sentence_level_sample <- sampled_df |> + mutate(cleaned_sentences = str_extract_all(olmo_cleaned_sentences, "(?<=')[^']+(?=')")) |> + unnest(cleaned_sentences)|> + filter(cleaned_sentences != ", ") |> + select(-olmo_sentence_categories, -starts_with("normalized"), -starts_with("gerrit")) + +(nrow(sentence_level_sample) / 293) * 1.5 + +#write.csv(output_df, "100125_human_info_sample.csv", row.names = FALSE) \ No newline at end of file diff --git a/mgaughan-rstudio-server_29920945.out b/mgaughan-rstudio-server_29944433.out similarity index 77% rename from mgaughan-rstudio-server_29920945.out rename to mgaughan-rstudio-server_29944433.out index c0d31d1..69c10fd 100644 --- a/mgaughan-rstudio-server_29920945.out +++ b/mgaughan-rstudio-server_29944433.out @@ -1,17 +1,17 @@ 1. SSH tunnel from your workstation using the following command: - ssh -N -L 8787:n3439:53255 mjilg@klone.hyak.uw.edu + ssh -N -L 8787:n3441:52613 mjilg@klone.hyak.uw.edu and point your web browser to http://localhost:8787 2. log in to RStudio Server using the following credentials: user: mjilg - password: eSK3QbcwgGpUya1wJIvC + password: YBcIVAgxBCfkvg2tbQqI When done using RStudio Server, terminate the job by: 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) 2. Issue the following command on the login node: - scancel -f 29920945 + scancel -f 29944433 diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R index 56e14d4..e05c8da 100644 --- a/p2/quest/neurobiber_PCA_analysis.R +++ b/p2/quest/neurobiber_PCA_analysis.R @@ -1,9 +1,9 @@ library(tidyverse) -neurobiber_description_pca_csv <-"~/p2/quest/092325_description_PCA_df.csv" +neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv" neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) -neurobiber_subcomment_pca_csv <-"~/p2/quest/092325_subcomment_PCA_df.csv" +neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv" neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) @@ -38,6 +38,10 @@ neurobiber_subcomment_pca_df$comment_type <- "subcomment" neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment) neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment) +neurobiber_subcomment_pca_df <- neurobiber_subcomment_pca_df %>% + mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in% + paste(neurobiber_description_pca_df$AuthorPHID, + neurobiber_description_pca_df$TaskPHID))) # look at correlation between PC1, PC2, and different outcome variables library(dplyr) @@ -53,29 +57,22 @@ discussion_anova_results # look at the representative comments for PC1 and PC2 top5 <- neurobiber_subcomment_pca_df %>% - arrange(desc(PC6)) %>% + arrange(desc(PC2)) %>% slice(300:310) %>% pull(cleaned_comment) bottom5 <- neurobiber_subcomment_pca_df %>% - arrange(PC6) %>% + arrange(PC2) %>% slice(300:310) %>% pull(cleaned_comment) -cat("Top 300:310 comment_text by PC1 score:\n") +cat("Top 300:310 comment_text by PC2 score:\n") print(top5) cat("\nBottom 300:310 comment_text by PC1 score:\n") print(bottom5) -aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |> - group_by(AuthorWMFAffil, week_index, source, priority, closed_relevance, phase) %>% - summarise(mean_PC1 = median(PC1), - mean_PC2 = median(PC2), - mean_PC3 = median(PC3), - mean_PC4 = median(PC4), - mean_PC5 = median(PC5)) library(scales) library(ggplot2) @@ -85,23 +82,28 @@ affiliationColors <- ,c("False", "True")) -long_df <- aggregated_neurobiber_description_pca_df %>% - tidyr::pivot_longer( - cols = starts_with("mean_PC"), - names_to = "PC", - values_to = "PC_value" - ) +neurobiber_subcomment_pca_df_x <- neurobiber_subcomment_pca_df %>% + left_join( + neurobiber_description_pca_df %>% + select(TaskPHID, priority), + by = "TaskPHID" + ) |> + filter(priority.y %in% c("Lowest","Unbreak Now!")) -unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True")) -unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ] -ggplot(neurobiber_description_pca_df, aes(x = PC1, y = PC3, fill = closed_relevance)) + - geom_point(shape = 21, alpha=0.3, size=2) + - facet_grid(source ~ phase) + +neurobiber_description_pca_df <- neurobiber_description_pca_df |> + filter(priority %in% c("Lowest","Unbreak Now!")) +#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True")) +#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ] +ggplot(neurobiber_description_pca_df, aes(x = PC3, y = PC8, fill = priority)) + + geom_point(shape = 21, alpha=0.4, size=2) + + facet_grid(source ~ phase, scales="fixed") + + xlim(-10, 10) + + ylim(-10, 10) + scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice theme_minimal() + labs( - title = "PCs for Task Subcomments (Faceted by Source and Phase)", - x = "PC1", - y = "PC3", - fill = "(tentative affiliation)" + title = "PCs for Task Descriptions (Faceted by Source and Phase)", + x = "PC3", + y = "PC8", + fill = "author_same_as_task_creator?" )