1
0
mw-lifecycle-analysis/p2/quest/neurobiber_cosine.R
2025-07-29 13:38:50 -07:00

55 lines
1.7 KiB
R

# ---- Load data and build the feature matrix ---------------------------------
# Reads the labelled comments, keeps only task descriptions, and produces
#   neurobiber_df : the filtered data frame
#   X             : numeric matrix, one row per remaining comment and one
#                   column per "normalized_*" feature
library(tidyverse)
library(dplyr) # already attached by tidyverse; kept so the load list is unchanged

neurobiber_csv <- "~/p2/quest/072525_pp_biberplus_labels.csv"

# Filter before any per-row work so the data frame and the feature matrix are
# built from exactly the same rows.
neurobiber_df <- read.csv(neurobiber_csv, header = TRUE) |>
  filter(comment_type == "task_description")

# Feature columns are selected by name prefix. This must happen before the
# list column below is added, because its name also starts with "normalized_".
normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)

# Fail early with a readable message instead of a cryptic error further down.
stopifnot(
  "no columns matching '^normalized_' found in the input CSV" =
    length(normalized_cols) > 0,
  "no rows with comment_type == 'task_description'" =
    nrow(neurobiber_df) > 0
)

# drop = FALSE keeps a matrix even when only a single feature column matches.
X <- as.matrix(neurobiber_df[, normalized_cols, drop = FALSE])
# Force numeric storage (integer or character columns are coerced here).
storage.mode(X) <- "double"

# Per-row feature vectors as a list column, retained for any code that still
# expects `normalized_features_vec` on the data frame.
neurobiber_df$normalized_features_vec <- lapply(
  seq_len(nrow(X)),
  function(i) unname(X[i, ])
)
# ---- Group-level cosine similarity ------------------------------------------
library(coop)
# Disabled: cosine similarity between individual rows of X.
#cos_sim1 <- coop::cosine(t(X))

# Mean of every feature column within each (affiliation, outcome) group.
# The first two columns of the result are the grouping keys.
register_means <- aggregate(
  X,
  by = list(
    affiliation = neurobiber_df$AuthorWMFAffil,
    outcome = neurobiber_df$status
  ),
  FUN = mean
)

# Strip the two grouping columns, leaving a groups x features numeric matrix.
feature_mat <- as.matrix(register_means[, -(1:2), drop = FALSE])

# coop::cosine() compares columns, so transpose to compare groups (rows).
cos_sim_matrix <- coop::cosine(t(feature_mat))

# Label rows/columns as "<affiliation>_<outcome>". paste() is applied to the
# columns directly: apply() over a data frame first converts it to a character
# matrix, which can pad numeric values with spaces and leak into the labels.
group_labels <- paste(
  register_means$affiliation,
  register_means$outcome,
  sep = "_"
)
rownames(cos_sim_matrix) <- group_labels
colnames(cos_sim_matrix) <- group_labels

# Column-standardised copy; only referenced by a commented-out heatmap call.
scaled_mat <- scale(cos_sim_matrix)
# ---- Heatmap of the similarity matrix (pheatmap) ----------------------------
# Disabled alternatives kept for reference:
#pheatmap(scaled_mat, symm = TRUE)
#heatmap(cos_sim_matrix, col=heat.colors(256), breaks=seq(-1, 1, length.out=257))
library(pheatmap)

# Draw the raw similarities in their existing row/column order.
# The switches are `cluster_rows` / `cluster_cols`; the previous
# `register_rows` / `register_cols` are not pheatmap arguments, so they were
# silently swallowed by `...` and clustering stayed at its default (TRUE).
pheatmap(
  cos_sim_matrix,
  cluster_rows = FALSE, # keep groups in their original row order
  cluster_cols = FALSE, # keep groups in their original column order
  scale = "none"        # plot the similarities as-is, no standardisation
)
# ---- Heatmap of the similarity matrix (ggplot2) -----------------------------
library(reshape2)
library(ggplot2)

# Long format: one row per (Var1, Var2) cell with its similarity in `value`.
sim_df <- melt(cos_sim_matrix, na.rm = TRUE)

similarity_plot <- ggplot(sim_df, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  # `limits` is spelled out in full rather than relying on partial matching
  # of `limit`.
  # NOTE(review): with limits = c(0, 1), any negative similarity falls outside
  # the scale and is drawn in the NA colour -- confirm values are in [0, 1].
  scale_fill_gradient2(
    low = "white", mid = "blue", high = "red",
    midpoint = 0.5, limits = c(0, 1)
  ) +
  # Rotate x-axis labels so long group names do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Print explicitly: a bare ggplot expression is not rendered when the script
# is run through source(), only when auto-printed at top level.
print(similarity_plot)
# ---- Most and least similar group pairs -------------------------------------
# Mask the diagonal so a group's similarity with itself cannot be picked as
# the maximum. This modifies cos_sim_matrix in place.
diag(cos_sim_matrix) <- NA

# `==` is safe here: each extreme is compared against values taken from the
# same matrix. The matrix is symmetric, so every pair is listed twice, once
# as (i, j) and once as (j, i).
most_similar <- which(
  cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE),
  arr.ind = TRUE
)
least_similar <- which(
  cos_sim_matrix == min(cos_sim_matrix, na.rm = TRUE),
  arr.ind = TRUE
)

# Print explicitly so the results also appear when the script is source()d.
print(most_similar)  # Most similar
print(least_similar) # Least similar