library(tidyverse)

# Load the labelled comment data. The path is specific to the author's machine.
neurobiber_csv <- "~/p2/quest/072525_pp_biberplus_labels.csv"
neurobiber_df <- read.csv(neurobiber_csv, header = TRUE)

# Feature columns are identified by their "normalized_" name prefix.
normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)
if (length(normalized_cols) == 0L) {
  stop(
    "No columns starting with 'normalized_' found in ", neurobiber_csv,
    call. = FALSE
  )
}

# Store each row's feature values as one numeric vector in a list column.
# drop = FALSE keeps a data frame even when only a single column matches, so
# asplit() always receives a two-dimensional object. as.numeric() strips the
# column names from each vector.
neurobiber_df$normalized_features_vec <- lapply(
  asplit(neurobiber_df[, normalized_cols, drop = FALSE], 1),
  as.numeric
)
library(dplyr)
# Keep only task descriptions, then collapse the raw status into three
# outcome classes:
#   duplicate / declined / invalid -> "declined"
#   stalled / open / progress      -> "open"
#   resolved                       -> "resolved"
# Any other status value is carried through unchanged.
neurobiber_df <- dplyr::filter(
  neurobiber_df,
  comment_type == "task_description"
)
neurobiber_df <- dplyr::mutate(
  neurobiber_df,
  task_status = dplyr::case_when(
    status %in% c("duplicate", "declined", "invalid") ~ "declined",
    status %in% c("stalled", "open", "progress") ~ "open",
    status == "resolved" ~ "resolved",
    TRUE ~ status
  )
)

# Stack the per-row feature vectors of the remaining rows into one matrix
# (one row per comment, one column per feature).
X <- do.call(rbind, neurobiber_df[["normalized_features_vec"]])

library(coop)
#cos_sim1 <- coop::cosine(t(X))


# Average every feature within each (outcome, source, affiliation) group.
# The three grouping columns come first in the result, followed by one column
# of means per feature.
register_means <- aggregate(
  X,
  by = list(
    outcome = neurobiber_df$task_status,
    source = neurobiber_df$source,
    affiliation = neurobiber_df$AuthorWMFAffil
  ),
  FUN = mean
)

# Drop the grouping columns to get a groups x features matrix; drop = FALSE
# keeps the column name even if there is only one feature column.
feature_mat <- as.matrix(register_means[, -(1:3), drop = FALSE])

# Transposed so the similarity matrix is indexed by group on both axes (the
# group labels are attached to its rows and columns below).
cos_sim_matrix <- coop::cosine(t(feature_mat))

# Label each group "<outcome>_<source>_<affiliation>". paste() is applied to
# the key columns directly instead of via apply(), which would first coerce
# the data frame with as.matrix() and can pad non-character values.
rownames(cos_sim_matrix) <- do.call(
  paste,
  c(unname(as.list(register_means[, 1:3])), sep = "_")
)
colnames(cos_sim_matrix) <- rownames(cos_sim_matrix)

# Inspect which features differ most between two groups

#' Show the features that differ most between two groups
#'
#' Takes one row of `feature_mat` per group, ranks the features by the
#' absolute difference between the two rows and reports the largest ones.
#' The comparison is printed and the table is returned invisibly.
#'
#' @param pair1,pair2 The two groups to compare. Each is either a single
#'   label (`pair1` is matched against `rownames(cos_sim_matrix)`, `pair2`
#'   against `colnames(cos_sim_matrix)`) or a single row index into
#'   `feature_mat`.
#' @param cos_sim_matrix Matrix whose dimnames label the rows of
#'   `feature_mat`. Only the dimnames are used here.
#' @param feature_mat Matrix with one row per group and one column per
#'   feature.
#' @param normalized_cols Character vector of descriptive feature names. A
#'   feature column named `V<k>` is mapped to `normalized_cols[k]`; any other
#'   (or missing) column name is mapped by column position.
#' @param top_n Maximum number of features to report. Capped at the number of
#'   features available.
#' @return Invisibly, a data frame with columns `feature`,
#'   `normalized_colname`, `vec1_value`, `vec2_value`, `abs_difference` and
#'   `larger_in` (label of the group with the larger value; ties are
#'   attributed to the second group).
compare_feature_vectors <- function(
    pair1, pair2,
    cos_sim_matrix,
    feature_mat,
    normalized_cols,
    top_n = 5
) {
  # Turn a label or an index into exactly one valid row position.
  resolve_group <- function(pair, labels, arg_name) {
    if (length(pair) != 1L) {
      stop("`", arg_name, "` must be a single label or index.", call. = FALSE)
    }
    idx <- if (is.character(pair)) which(labels == pair) else pair
    if (length(idx) != 1L || !is.numeric(idx) || is.na(idx) ||
        idx < 1 || idx > nrow(feature_mat)) {
      stop(
        "`", arg_name, "` does not identify exactly one group: ",
        format(pair),
        call. = FALSE
      )
    }
    idx
  }

  row_idx <- resolve_group(pair1, rownames(cos_sim_matrix), "pair1")
  col_idx <- resolve_group(pair2, colnames(cos_sim_matrix), "pair2")
  label1 <- rownames(cos_sim_matrix)[row_idx]
  label2 <- colnames(cos_sim_matrix)[col_idx]

  # Feature vectors of the two groups
  vec1 <- feature_mat[row_idx, ]
  vec2 <- feature_mat[col_idx, ]

  # Rank features by absolute difference. Never ask for more rows than there
  # are features, otherwise the result is padded with NA rows.
  feature_diff <- abs(vec1 - vec2)
  n_keep <- min(top_n, length(feature_diff))
  top_features_idx <- order(feature_diff, decreasing = TRUE)[seq_len(n_keep)]
  top_diffs <- feature_diff[top_features_idx]

  top_features <- names(feature_diff)[top_features_idx]
  if (is.null(top_features)) {
    # Unnamed feature matrix: label features by position, "V<position>".
    top_features <- paste0("V", top_features_idx)
  }

  # Map each feature to its descriptive name: "V<k>" names select entry k,
  # anything else falls back to the feature's column position.
  is_v_name <- grepl("^V[0-9]+$", top_features)
  feature_nums <- top_features_idx
  feature_nums[is_v_name] <- as.integer(sub("^V", "", top_features[is_v_name]))
  feature_colnames <- normalized_cols[feature_nums]

  # Which group has the larger value for each reported feature
  larger_in <- ifelse(
    vec1[top_features_idx] > vec2[top_features_idx],
    label1,
    label2
  )

  top_features_df <- data.frame(
    feature = top_features,
    normalized_colname = feature_colnames,
    vec1_value = vec1[top_features_idx],
    vec2_value = vec2[top_features_idx],
    abs_difference = top_diffs,
    larger_in = larger_in
  )

  # Print the comparison, return the table invisibly
  cat("Comparing:", label1, "and", label2, "\n")
  print(top_features_df)
  invisible(top_features_df)
}

# Example: the ten features that differ most between resolved tasks from
# sources c1 and c2, both with affiliation "True".
compare_feature_vectors(
  pair1 = "resolved_c1_True",
  pair2 = "resolved_c2_True",
  cos_sim_matrix = cos_sim_matrix,
  feature_mat = feature_mat,
  normalized_cols = normalized_cols,
  top_n = 10
)


#plotting stuff beneath here
# Annotation table for the heatmap rows: one row per group, labelled to match
# the similarity matrix. Affiliation strings "True"/"False" are shown as
# "WMF"/"non-WMF"; any other value becomes NA.
annotation_row <- data.frame(
  source = register_means$source,
  affil = case_when(
    register_means$affiliation == "True" ~ "WMF",
    register_means$affiliation == "False" ~ "non-WMF"
  )
)
rownames(annotation_row) <- rownames(cos_sim_matrix)

# The column annotations carry the same content, keyed by the column labels.
annotation_col <- annotation_row
rownames(annotation_col) <- colnames(cos_sim_matrix)

# Fixed colour for every annotation level.
my_annotation_colors <- list(
  affil = c("WMF" = "green", "non-WMF" = "purple"),
  source = c(c1 = "lightgrey", c2 = "grey", c3 = "black")
)

# Blank out everything below the diagonal so each pair of groups is drawn
# once. This overwrites cos_sim_matrix in place.
cos_sim_matrix[lower.tri(cos_sim_matrix, diag = FALSE)] <- NA
#pheatmap(scaled_mat, symm = TRUE)
#heatmap(cos_sim_matrix, col=heat.colors(256), breaks=seq(-1, 1, length.out=257))
library(viridis)
library(pheatmap)

# Draw the matrix in its existing group order (no clustering, no rescaling),
# with source/affiliation annotations on both axes and the masked cells white.
fossy_heatmap <- pheatmap::pheatmap(
  mat = cos_sim_matrix,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  scale = "none",
  annotation_row = annotation_row,
  annotation_col = annotation_col,
  annotation_colors = my_annotation_colors,
  na_col = "white"
)

#ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800)

#diag(cos_sim_matrix) <- NA
#which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE)  # Most similar
#which(cos_sim_matrix == min(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE)  # Least similar