adding in analysis of biberplus vectors

2025-07-23 14:22:20 -07:00 · 2025-07-23 14:22:20 -07:00 · a08a49d04e
commit a08a49d04e
parent b0584ec1be
1 changed files with 34 additions and 14 deletions
--- a/p2/quest/neurobiber_EDA.R
+++ b/p2/quest/neurobiber_EDA.R
@ -1,15 +1,16 @@
 library(tidyverse)

-neurobiber_csv <-"~/p2/quest/071525_neurobiber_labels.csv"
+neurobiber_csv <-"~/p2/quest/072325_biberplus_labels.csv"
 neurobiber_df <- read.csv(neurobiber_csv , header = TRUE) 

-neurobiber_df$features_vec  <- lapply(neurobiber_df$neurobiber_preds, function(x) {
-  x <- gsub("\\[|\\]", "", x)
-  x <- trimws(x)
-  as.numeric(unlist(strsplit(x, "\\s+")))
-})
+normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)
+
+neurobiber_df$normalized_features_vec <- lapply(
+  asplit(neurobiber_df[, normalized_cols], 1), as.numeric
+)
+
+X <- do.call(rbind, neurobiber_df$normalized_features_vec)

-X <- do.call(rbind, neurobiber_df$features_vec )

 set.seed(808)

@ -37,7 +38,7 @@ neurobiber_df$PC1 <- pca$x[,1]
 neurobiber_df$PC2 <- pca$x[,2]


-ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = dbcluster)) +
+ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = phase)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_minimal() +
  labs(title = "Across-case comment clusters (DBSCAN) by cross-case PCA",
@ -53,9 +54,27 @@ ggplot(neurobiber_df, aes(x = phase, y=dbcluster, fill=AuthorWMFAffil)) +
       y = "Neurobiber feature vector cluster (DBSCAN)") +
  facet_wrap(~ source)

-cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$dbcluster), FUN = mean)
-rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster)
-cluster_means <- cluster_means[,-1] # Remove cluster label column
+cluster_means <- aggregate(
+  X,
+  by = list(
+    WMFAffil = neurobiber_df$AuthorWMFAffil,
+    phase = neurobiber_df$phase,
+    comment_type = neurobiber_df$comment_type,
+    source= neurobiber_df$source
+  ),
+  FUN = mean
+)
+
+rownames(cluster_means) <- apply(
+  cluster_means[, c("WMFAffil", "phase", "comment_type", "source")], 1,
+  function(x) paste(x, collapse = "_")
+)
+
+cluster_means <- cluster_means[, !(names(cluster_means) %in% c("WMFAffil", "phase", "comment_type", "source"))]
+
+#cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$AuthorWMFAffil), FUN = mean)
+#rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster)
+#cluster_means <- cluster_means[,-1] # Remove cluster label column

 BIBER_FEATURES <- c(
  "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ",
@ -75,10 +94,11 @@ BIBER_FEATURES <- c(
  "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH",
  "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X"
 )
-colnames(cluster_means) <- BIBER_FEATURES
+BIBER_FEATURES_NO_BIN <- gsub("^BIN_", "", BIBER_FEATURES)
+colnames(cluster_means) <- BIBER_FEATURES_NO_BIN
 library(pheatmap)
 pheatmap(cluster_means, 
          cluster_rows = FALSE,    # Rows (group means) keep their given order; no row clustering
-         cluster_cols = FALSE,   # Clusters (columns) are not clustered
-         scale = "row")         # Standardize features
+         cluster_cols = FALSE,
+         scale='none')         # No row/column scaling; values are plotted as-is