# Cosine similarity between group-level mean feature profiles.
#
# Reads a CSV of per-comment rows, keeps the "task_description" rows, averages
# every `normalized_*` column within each combination of
# priority x outcome x phase x source x affiliation, and compares the resulting
# group means with cosine similarity (heatmap + most/least similar pairs).

# Dependencies ----
library(tidyverse)
library(dplyr) # already attached by tidyverse; kept so the script's deps stay explicit
library(coop)
library(viridis)
library(pheatmap)

# Load data ----
neurobiber_csv <- "~/p2/quest/072525_pp_biberplus_labels.csv"
neurobiber_df <- read.csv(neurobiber_csv, header = TRUE)

# Every column whose name starts with "normalized_" is treated as a feature.
normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)
stopifnot(
  "no columns matching '^normalized_' found in the input CSV" =
    length(normalized_cols) > 0
)

# One numeric vector per row, stored as a list column.
# drop = FALSE keeps a data frame even when only one feature column matches;
# without it the subset collapses to a plain vector and asplit() fails.
neurobiber_df$normalized_features_vec <- lapply(
  asplit(neurobiber_df[, normalized_cols, drop = FALSE], 1),
  as.numeric
)

# Filter and recode status ----
# duplicate, declined, invalid -> declined
# stalled, open, progress      -> open
# resolved                     -> resolved
neurobiber_df <- neurobiber_df |>
  filter(comment_type == "task_description") |>
  mutate(
    task_status = case_when(
      status %in% c("duplicate", "declined", "invalid") ~ "declined",
      status %in% c("stalled", "open", "progress") ~ "open",
      status == "resolved" ~ "resolved",
      TRUE ~ status # fallback for unexpected values
    )
  )
stopifnot(
  "no rows with comment_type == 'task_description' in the input CSV" =
    nrow(neurobiber_df) > 0
)

# Feature matrix ----
# Rows = retained comments, columns = features (in `normalized_cols` order).
X <- do.call(rbind, neurobiber_df$normalized_features_vec)

# Row-level similarity (disabled):
# cos_sim1 <- coop::cosine(t(X))

# Group means ----
grouping <- list(
  priority = neurobiber_df$priority,
  outcome = neurobiber_df$task_status,
  phase = neurobiber_df$phase,
  source = neurobiber_df$source,
  affiliation = neurobiber_df$AuthorWMFAffil
)
register_means <- aggregate(X, by = grouping, FUN = mean)

# aggregate() puts the grouping columns first, followed by the feature means.
group_idx <- seq_along(grouping)
feature_mat <- as.matrix(register_means[, -group_idx])

# Cosine similarity between groups ----
# coop::cosine() compares columns, so transpose to compare groups (rows).
cos_sim_matrix <- coop::cosine(t(feature_mat))

# Label each group by its grouping values joined with "_".
# Paste the columns directly instead of apply()-ing over the data frame:
# apply() goes through as.matrix(), which format()s numeric columns to a
# common width and would leave padding spaces inside the labels.
group_labels <- do.call(
  paste,
  c(unname(as.list(register_means[, group_idx])), sep = "_")
)
rownames(cos_sim_matrix) <- group_labels
colnames(cos_sim_matrix) <- group_labels

# Column-standardised copy; in the visible script it is only used by the
# disabled heatmap call below.
scaled_mat <- scale(cos_sim_matrix)
# pheatmap(scaled_mat, symm = TRUE)
# heatmap(cos_sim_matrix, col = heat.colors(256),
#         breaks = seq(-1, 1, length.out = 257))

# Heatmap ----
# Rows and columns keep the aggregate() order (no clustering, no rescaling).
pheatmap(
  cos_sim_matrix,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  scale = "none",
  color = viridis(100)
)

# Extremes ----
# Blank the diagonal so a group's similarity with itself is never reported.
diag(cos_sim_matrix) <- NA

# The matrix is symmetric, so each pair is listed twice: (i, j) and (j, i).
# Most similar
which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE)
# Least similar
which(cos_sim_matrix == min(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE)