updating similarity vectors

2025-07-29 13:38:50 -07:00 · 2025-07-29 13:38:50 -07:00 · c5966518ef
commit c5966518ef
parent 23ef7acd01
4 changed files with 71 additions and 0 deletions
--- a/mgaughan-rstudio-server_27815770.out
+++ b/mgaughan-rstudio-server_27815770.out
@@ -0,0 +1,17 @@
+1. SSH tunnel from your workstation using the following command:
+
+   ssh -N -L 8787:n3439:41317 mjilg@klone.hyak.uw.edu
+
+   and point your web browser to http://localhost:8787
+
+2. log in to RStudio Server using the following credentials:
+
+   user: mjilg
+   password: yo0riOVPbQWPzplKhedd
+
+When done using RStudio Server, terminate the job by:
+
+1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
+2. Issue the following command on the login node:
+
+      scancel -f 27815770
--- a/p2/authorship_breakdown_cosine_similarity.png
+++ b/p2/authorship_breakdown_cosine_similarity.png
--- a/p2/outcome_similarity_vector.png
+++ b/p2/outcome_similarity_vector.png
--- a/p2/quest/neurobiber_cosine.R
+++ b/p2/quest/neurobiber_cosine.R
@@ -0,0 +1,54 @@
+# Cosine similarity between the mean `normalized_*` feature vectors of
+# task-description comments, grouped by author affiliation and outcome.
+library(tidyverse)
+library(dplyr)
+library(coop)
+library(pheatmap)
+library(reshape2)
+library(ggplot2)
+
+neurobiber_csv <- "~/p2/quest/072525_pp_biberplus_labels.csv"
+neurobiber_df <- read.csv(neurobiber_csv, header = TRUE) |>
+  filter(comment_type == "task_description")
+
+# One row per retained comment, one column per `normalized_*` feature.
+normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)
+X <- as.matrix(neurobiber_df[, normalized_cols, drop = FALSE])
+
+# Mean feature vector for each (affiliation, outcome) combination present.
+register_means <- aggregate(
+  X,
+  by = list(
+    affiliation = neurobiber_df$AuthorWMFAffil,
+    outcome = neurobiber_df$status
+  ),
+  FUN = mean
+)
+
+# coop::cosine() compares columns, so transpose to compare the groups.
+feature_mat <- as.matrix(register_means[, -(1:2)])
+cos_sim_matrix <- coop::cosine(t(feature_mat))
+group_labels <- with(register_means, paste(affiliation, outcome, sep = "_"))
+dimnames(cos_sim_matrix) <- list(group_labels, group_labels)
+
+# Keep the groups in aggregate() order: no row/column clustering, no scaling.
+pheatmap(
+  cos_sim_matrix,
+  cluster_rows = FALSE,
+  cluster_cols = FALSE,
+  scale = "none"
+)
+
+sim_df <- melt(cos_sim_matrix, na.rm = TRUE)
+ggplot(sim_df, aes(Var1, Var2, fill = value)) +
+  geom_tile() +
+  scale_fill_gradient2(
+    low = "white", mid = "blue", high = "red",
+    midpoint = 0.5, limits = c(0, 1)
+  ) +
+  theme(axis.text.x = element_text(angle = 90, hjust = 1))
+
+# Mask the self-similarities on the diagonal before locating the extremes.
+diag(cos_sim_matrix) <- NA
+which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE)
+which(cos_sim_matrix == min(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE)