some neurobiber PCA analysis

2025-09-05 14:59:07 -07:00 · 2025-09-05 14:59:07 -07:00 · 6de62f2447
commit 6de62f2447
parent a96fd6db2f
3 changed files with 92 additions and 147 deletions
--- a/mgaughan-rstudio-server_28911380.out
+++ b/mgaughan-rstudio-server_28911380.out
@ -0,0 +1,18 @@
+1. SSH tunnel from your workstation using the following command:
+
+   ssh -N -L 8787:n3441:47269 mjilg@klone.hyak.uw.edu
+
+   and point your web browser to http://localhost:8787
+
+2. log in to RStudio Server using the following credentials:
+
+   user: mjilg
+   password: 9Qgk9UkRdmKalTKyDmH4
+
+When done using RStudio Server, terminate the job by:
+
+1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
+2. Issue the following command on the login node:
+
+      scancel -f 28911380
+[2025-09-05T14:55:26.103] error: *** JOB 28911380 ON n3441 CANCELLED AT 2025-09-05T14:55:26 DUE TO TIME LIMIT ***
--- a/p2/quest/neurobiber_PCA_analysis.R
+++ b/p2/quest/neurobiber_PCA_analysis.R
@ -0,0 +1,74 @@
+library(tidyverse)
+
+# Paths of the two CSV files holding the PCA data frames.
+neurobiber_description_pca_csv <- "~/p2/quest/090425_description_PCA_df.csv"
+neurobiber_subcomment_pca_csv <- "~/p2/quest/090425_subcomment_PCA_df.csv"
+
+# Read both tables; the first line of each file is treated as the header.
+neurobiber_description_pca_df <- read.csv(
+  file = neurobiber_description_pca_csv,
+  header = TRUE
+)
+neurobiber_subcomment_pca_df <- read.csv(
+  file = neurobiber_subcomment_pca_csv,
+  header = TRUE
+)
+
+
+# One-way ANOVA of a principal component on `phase`, fitted separately for
+# each `source`. Only PC2 is tested below.
+library(dplyr)
+
+# Fit `aov(<response> ~ phase)` within every `source` group of `pca_df`.
+#
+# pca_df:   data frame with `source`, `phase` and the response column.
+# response: name of the column used as the ANOVA response (default "PC2").
+#
+# Returns a list of `summary(aov(...))` objects, one per `source` group in
+# `group_by()` order. The list is named by the source value so that each
+# printed table can be matched to the group it belongs to.
+run_phase_anova <- function(pca_df, response = "PC2") {
+  by_source <- group_by(pca_df, source)
+  model_formula <- reformulate("phase", response = response)
+  results <- group_map(
+    by_source,
+    function(group_df, group_key) {
+      summary(aov(model_formula, data = group_df))
+    },
+    .keep = TRUE
+  )
+  # group_keys() lists the groups in the same order group_map() visits them.
+  names(results) <- as.character(group_keys(by_source)$source)
+  results
+}
+
+description_anova_results <- run_phase_anova(neurobiber_description_pca_df)
+description_anova_results
+
+discussion_anova_results <- run_phase_anova(neurobiber_subcomment_pca_df)
+discussion_anova_results
+
+# Representative comments along PC2 for the "c2" source. Despite their names,
+# `top5` and `bottom5` each hold the texts ranked 15th through 30th, counted
+# from the high end and from the low end of PC2 respectively.
+c2_comments <- neurobiber_subcomment_pca_df |>
+  filter(source == "c2")
+
+# Highest PC2 scores first.
+top5 <- c2_comments |>
+  arrange(desc(PC2)) |>
+  slice(15:30) |>
+  pull(text)
+
+# Lowest PC2 scores first.
+bottom5 <- c2_comments |>
+  arrange(PC2) |>
+  slice(15:30) |>
+  pull(text)
+
+cat("Top 15:30 comment_text by score:\n")
+print(top5)
+
+cat("\nBottom 15:30 comment_text by score:\n")
+print(bottom5)
+
+
+# Weekly medians of PC1-PC5 from the description PCA table, per WMF
+# affiliation, source and priority, plotted as one panel per (PC, source).
+library(scales)
+library(ggplot2)
+
+# Columns are named `median_*` because the statistic is the median; the
+# grouping is dropped so later steps operate on an ungrouped table.
+aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
+  group_by(AuthorWMFAffil, week_index, source, priority) |>
+  summarise(
+    median_PC1 = median(PC1),
+    median_PC2 = median(PC2),
+    median_PC3 = median(PC3),
+    median_PC4 = median(PC4),
+    median_PC5 = median(PC5),
+    .groups = "drop"
+  )
+
+# Line colours keyed by the affiliation label.
+affiliationColors <-
+  setNames(c("#5da2d8", "#c7756a"), c("False", "True"))
+
+# One row per (group, PC). `affil_label` is a character copy of the
+# affiliation flag spelled "True"/"False" so it matches the colour names.
+# NOTE(review): read.csv() may parse a True/False column as logical, whose
+# values would render as TRUE/FALSE and miss the names in affiliationColors;
+# any other value is passed through unchanged. Confirm against the CSV.
+long_df <- aggregated_neurobiber_description_pca_df |>
+  tidyr::pivot_longer(
+    cols = starts_with("median_PC"),
+    names_to = "PC",
+    values_to = "PC_value"
+  ) |>
+  mutate(
+    affil_label = case_when(
+      as.character(AuthorWMFAffil) %in% c("TRUE", "True", "true") ~ "True",
+      as.character(AuthorWMFAffil) %in% c("FALSE", "False", "false") ~
+        "False",
+      TRUE ~ as.character(AuthorWMFAffil)
+    )
+  )
+
+# The table has one row per priority within each week/affiliation/source, so
+# lines are grouped by affiliation AND priority; grouping by affiliation
+# alone would connect several values at the same week into a zig-zag.
+ggplot(
+  long_df,
+  aes(
+    x = week_index,
+    y = PC_value,
+    color = affil_label,
+    group = interaction(affil_label, priority)
+  )
+) +
+  geom_line(linewidth = 1) +
+  facet_grid(PC ~ source) +
+  scale_color_manual(values = affiliationColors, name = "WMF Affiliation") +
+  scale_x_continuous(breaks = pretty_breaks()) +
+  # Zoom to [-10, 10] without discarding medians outside that window; scale
+  # limits would turn them into NA and break the lines. The fixed window
+  # applies to every panel, so no per-panel free y scale is requested.
+  coord_cartesian(ylim = c(-10, 10)) +
+  labs(
+    x = "Week Index",
+    y = "Median PC Value",
+    title = "Weekly Median PC Values by Source and PC, Colored by WMF Affiliation"
+  ) +
+  theme_minimal(base_size = 14) +
+  theme(legend.position = "top")
--- a/p2/quest/neurobiber_cosine.R
+++ b/p2/quest/neurobiber_cosine.R
@ -1,147 +0,0 @@
-library(tidyverse)
-
-neurobiber_csv <-"~/p2/quest/072525_pp_biberplus_labels.csv"
-neurobiber_df <- read.csv(neurobiber_csv , header = TRUE) 
-
-normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)
-
-neurobiber_df$normalized_features_vec <- lapply(
-  asplit(neurobiber_df[, normalized_cols], 1), as.numeric
-)
-library(dplyr)
-# duplicate, declined, invalid -> declined 
-# stalled, open, progress -> open
-# resolved -> resolved
-neurobiber_df <- neurobiber_df |>
-  filter(comment_type == "task_description") |>
-  mutate(
-    task_status = case_when(
-      status %in% c("duplicate", "declined", "invalid") ~ "declined",
-      status %in% c("stalled", "open", "progress") ~ "open",
-      status == "resolved" ~ "resolved",
-      TRUE ~ status  # fallback for unexpected values
-    ))
-
-X <- do.call(rbind, neurobiber_df$normalized_features_vec)
-
-library(coop)
-#cos_sim1 <- coop::cosine(t(X))
-
-
-register_means <- aggregate(
-  X,
-  by = list(
-    outcome= neurobiber_df$task_status,
-    source = neurobiber_df$source,
-    affiliation = neurobiber_df$AuthorWMFAffil
-  ),
-  FUN = mean
-)
-
-feature_mat <- as.matrix(register_means[, -(1:3)])  
-cos_sim_matrix <- coop::cosine(t(feature_mat))
-rownames(cos_sim_matrix) <- apply(register_means[, 1:3], 1, paste, collapse = "_")
-colnames(cos_sim_matrix) <- rownames(cos_sim_matrix)
-
-#finding the most dissimilar pairs 
-
-compare_feature_vectors <- function(
-    pair1, pair2, 
-    cos_sim_matrix, 
-    feature_mat, 
-    normalized_cols, 
-    top_n = 5
-) {
-  # Allow for both index and name input
-  if (is.character(pair1)) row_idx <- which(rownames(cos_sim_matrix) == pair1) else row_idx <- pair1
-  if (is.character(pair2)) col_idx <- which(colnames(cos_sim_matrix) == pair2) else col_idx <- pair2
-  
-  # Get feature vectors
-  vec1 <- feature_mat[row_idx, ]
-  vec2 <- feature_mat[col_idx, ]
-  
-  # Feature-wise absolute differences
-  feature_diff <- abs(vec1 - vec2)
-  top_features_idx <- order(feature_diff, decreasing = TRUE)[1:top_n]
-  top_features <- names(feature_diff)[top_features_idx]
-  top_diffs <- feature_diff[top_features_idx]
-  
-  # Map Vxx to normalized column names
-  feature_nums <- as.integer(sub("V", "", top_features))
-  feature_colnames <- normalized_cols[feature_nums]
-  
-  # Determine which vector is larger for each feature
-  larger_in <- ifelse(vec1[top_features_idx] > vec2[top_features_idx],
-                      rownames(cos_sim_matrix)[row_idx],
-                      colnames(cos_sim_matrix)[col_idx])
-  
-  # Assemble results
-  top_features_df <- data.frame(
-    feature = top_features,
-    normalized_colname = feature_colnames,
-    vec1_value = vec1[top_features_idx],
-    vec2_value = vec2[top_features_idx],
-    abs_difference = top_diffs,
-    larger_in = larger_in
-  )
-  
-  # Print pair and return
-  cat("Comparing:", rownames(cos_sim_matrix)[row_idx], "and", colnames(cos_sim_matrix)[col_idx], "\n")
-  print(top_features_df)
-  invisible(top_features_df)
-}
-
-compare_feature_vectors("resolved_c1_True", "resolved_c2_True", cos_sim_matrix, feature_mat, normalized_cols, top_n = 10)
-
-
-
-#plotting stuff beneath here
-annotation_row <- data.frame(
-  affiliation = register_means$affiliation,
-  source = register_means$source
-)
-rownames(annotation_row) <- rownames(cos_sim_matrix)
-
-annotation_col <- data.frame(
-  affiliation = register_means$affiliation,
-  source = register_means$source
-)
-rownames(annotation_col) <- colnames(cos_sim_matrix)
-
-annotation_row <- annotation_row |>
-  mutate(affil = case_when(
-       affiliation == "True" ~ "WMF",
-       affiliation == "False" ~ "non-WMF" 
-  )) |> select(-affiliation)
-
-annotation_col <- annotation_col |>
-  mutate(affil = case_when(
-    affiliation == "True" ~ "WMF",
-    affiliation == "False" ~ "non-WMF" 
-  )) |> select(-affiliation)
-  
-
-my_annotation_colors = list(
-  affil = c("WMF" = "green", "non-WMF" = "purple"),
-  source = c(c1 = "lightgrey", c2 = "grey", c3 = "black")
-)
-
-cos_sim_matrix[lower.tri(cos_sim_matrix)] <- NA
-#pheatmap(scaled_mat, symm = TRUE)
-#heatmap(cos_sim_matrix, col=heat.colors(256), breaks=seq(-1, 1, length.out=257))
-library(viridis)
-library(pheatmap)
-fossy_heatmap <- pheatmap(cos_sim_matrix,
-           cluster_rows = FALSE,    
-           cluster_cols = FALSE,
-           scale='none', 
-           annotation_row = annotation_row,
-           annotation_col = annotation_col,
-           annotation_colors = my_annotation_colors,
-           na_col = "white")         
-
-#ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800)
-
-#diag(cos_sim_matrix) <- NA
-#which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE)  # Most similar
-#which(cos_sim_matrix == min(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE)  # Least similar