# Register-level cosine similarity of normalized biber feature vectors.
#
# Reads the labelled comment data, keeps task descriptions, averages the
# normalized_* features per (outcome, source, affiliation) group, computes the
# cosine similarity between the group means, and draws it as a heatmap.

library(tidyverse)
library(dplyr)
library(coop)
library(viridis)
library(pheatmap)

# Load data ----

neurobiber_csv <- "~/p2/quest/072525_pp_biberplus_labels.csv"
neurobiber_df <- read.csv(neurobiber_csv, header = TRUE)

normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)
if (length(normalized_cols) == 0) {
  stop("No columns matching '^normalized_' found in ", neurobiber_csv,
       call. = FALSE)
}

# One numeric vector per row, stored as a list column.
# drop = FALSE keeps two dimensions even when a single column matches, so
# asplit() can always split by row.
neurobiber_df$normalized_features_vec <- lapply(
  asplit(neurobiber_df[, normalized_cols, drop = FALSE], 1),
  as.numeric
)

# Collapse task status ----

# duplicate, declined, invalid -> declined
# stalled, open, progress -> open
# resolved -> resolved
neurobiber_df <- neurobiber_df |>
  filter(comment_type == "task_description") |>
  mutate(
    task_status = case_when(
      status %in% c("duplicate", "declined", "invalid") ~ "declined",
      status %in% c("stalled", "open", "progress") ~ "open",
      status == "resolved" ~ "resolved",
      TRUE ~ status # fallback for unexpected values
    )
  )

# Group means and cosine similarity ----

# Row-bind the per-row vectors into an (observations x features) matrix.
# The vectors are unnamed, so the matrix has no column names and aggregate()
# labels the feature columns V1, V2, ... in the order of normalized_cols.
X <- do.call(rbind, neurobiber_df$normalized_features_vec)

# cos_sim1 <- coop::cosine(t(X))
register_means <- aggregate(
  X,
  by = list(
    outcome = neurobiber_df$task_status,
    source = neurobiber_df$source,
    affiliation = neurobiber_df$AuthorWMFAffil
  ),
  FUN = mean
)

# The first three columns are the grouping keys; the rest are feature means.
feature_mat <- as.matrix(register_means[, -(1:3), drop = FALSE])

# coop::cosine() compares columns, so transpose to compare groups.
cos_sim_matrix <- coop::cosine(t(feature_mat))

# Label every group as "<outcome>_<source>_<affiliation>". A vectorised paste
# avoids pushing the data frame through as.matrix(), which apply() would do.
group_labels <- do.call(
  paste,
  c(unname(as.list(register_means[, 1:3])), sep = "_")
)
rownames(cos_sim_matrix) <- group_labels
colnames(cos_sim_matrix) <- group_labels

# finding the most dissimilar pairs

# Resolve a group given as a label or a position to a single valid row index.
resolve_group_index <- function(pair, labels, n_groups) {
  idx <- if (is.character(pair)) which(labels == pair) else pair
  if (length(idx) != 1L || is.na(idx) || idx < 1 || idx > n_groups) {
    stop(
      "Could not resolve '", paste(pair, collapse = ", "),
      "' to exactly one group.",
      call. = FALSE
    )
  }
  idx
}

#' Compare the mean feature vectors of two groups
#'
#' Finds the features with the largest absolute difference between two rows of
#' `feature_mat`, prints them, and returns them invisibly.
#'
#' @param pair1,pair2 Groups to compare, each given either as a row/column
#'   name of `cos_sim_matrix` or as an integer position.
#' @param cos_sim_matrix Similarity matrix; only its dimnames are used here,
#'   to look up and label the two groups.
#' @param feature_mat Numeric matrix with one row per group and one column per
#'   feature, in the same row order as `cos_sim_matrix`.
#' @param normalized_cols Character vector of feature column names; a feature
#'   named `Vk` is mapped to `normalized_cols[k]`.
#' @param top_n Maximum number of features to report.
#' @return Invisibly, a data frame with columns `feature`,
#'   `normalized_colname`, `vec1_value`, `vec2_value`, `abs_difference` and
#'   `larger_in`, ordered by decreasing absolute difference.
compare_feature_vectors <- function(
  pair1,
  pair2,
  cos_sim_matrix,
  feature_mat,
  normalized_cols,
  top_n = 5
) {
  # Allow for both index and name input; fail early on unknown groups.
  row_idx <- resolve_group_index(
    pair1, rownames(cos_sim_matrix), nrow(feature_mat)
  )
  col_idx <- resolve_group_index(
    pair2, colnames(cos_sim_matrix), nrow(feature_mat)
  )
  row_label <- rownames(cos_sim_matrix)[row_idx]
  col_label <- colnames(cos_sim_matrix)[col_idx]

  # Get feature vectors
  vec1 <- feature_mat[row_idx, ]
  vec2 <- feature_mat[col_idx, ]

  # Feature-wise absolute differences
  feature_diff <- abs(vec1 - vec2)
  feature_names <- names(feature_diff)
  if (is.null(feature_names)) {
    feature_names <- paste0("V", seq_along(feature_diff))
  }

  # head() caps the selection at the number of available features, so a large
  # top_n cannot introduce NA rows and top_n = 0 yields an empty result.
  top_features_idx <- head(order(feature_diff, decreasing = TRUE), top_n)
  top_features <- feature_names[top_features_idx]
  top_diffs <- feature_diff[top_features_idx]

  # Map Vxx to normalized column names; any other name is kept as it is.
  is_positional <- grepl("^V[0-9]+$", top_features)
  feature_colnames <- top_features
  feature_colnames[is_positional] <- normalized_cols[
    as.integer(sub("^V", "", top_features[is_positional]))
  ]

  # Determine which vector is larger for each feature (ties go to pair2).
  larger_in <- ifelse(
    vec1[top_features_idx] > vec2[top_features_idx],
    row_label,
    col_label
  )

  # Assemble results
  top_features_df <- data.frame(
    feature = top_features,
    normalized_colname = feature_colnames,
    vec1_value = vec1[top_features_idx],
    vec2_value = vec2[top_features_idx],
    abs_difference = top_diffs,
    larger_in = larger_in
  )

  # Print pair and return
  cat("Comparing:", row_label, "and", col_label, "\n")
  print(top_features_df)
  invisible(top_features_df)
}

compare_feature_vectors(
  "resolved_c1_True",
  "resolved_c2_True",
  cos_sim_matrix,
  feature_mat,
  normalized_cols,
  top_n = 10
)

# plotting stuff beneath here ----

# Rows and columns of the heatmap describe the same groups, so the annotation
# table is built once and reused for both.
# NOTE(review): the comparisons assume AuthorWMFAffil holds the strings
# "True"/"False"; read.csv() may parse such a column as logical, in which case
# `affil` would be NA and group labels would end in TRUE/FALSE - confirm.
group_annotation <- data.frame(
  affiliation = register_means$affiliation,
  source = register_means$source
) |>
  mutate(affil = case_when(
    affiliation == "True" ~ "WMF",
    affiliation == "False" ~ "non-WMF"
  )) |>
  select(-affiliation)

# Row names are assigned last so they match the matrix dimnames regardless of
# how the dplyr verbs above treat row names.
annotation_row <- group_annotation
rownames(annotation_row) <- rownames(cos_sim_matrix)

annotation_col <- group_annotation
rownames(annotation_col) <- colnames(cos_sim_matrix)

my_annotation_colors <- list(
  affil = c("WMF" = "green", "non-WMF" = "purple"),
  source = c(c1 = "lightgrey", c2 = "grey", c3 = "black")
)

# The matrix is symmetric; blank the lower triangle so each pair shows once.
cos_sim_matrix[lower.tri(cos_sim_matrix)] <- NA

# pheatmap(scaled_mat, symm = TRUE)
# heatmap(cos_sim_matrix, col=heat.colors(256), breaks=seq(-1, 1, length.out=257))
fossy_heatmap <- pheatmap(
  cos_sim_matrix,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  scale = "none",
  annotation_row = annotation_row,
  annotation_col = annotation_col,
  annotation_colors = my_annotation_colors,
  na_col = "white"
)
# ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800)

# diag(cos_sim_matrix) <- NA
# which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Most similar
# which(cos_sim_matrix == min(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Least similar