updating with dbscan clustering etc.

2025-07-16 14:03:51 -07:00 · 2025-07-16 14:03:51 -07:00 · 2e0665488c
commit 2e0665488c
parent 90e69975d2
1 changed files with 11 additions and 7 deletions
--- a/p2/quest/neurobiber_EDA.R
+++ b/p2/quest/neurobiber_EDA.R
@ -24,32 +24,36 @@ table(neurobiber_df$source)
 #    as.factor(kmeans(X_sub, centers = 50)$cluster)
 #  }) %>%
 #  ungroup()
+library(dbscan)
+# Density-based clustering of the feature matrix X with fixed eps / minPts.
+dbscan_result <- dbscan(X, eps = 0.5, minPts = 97)
+# dbscan::dbscan() assigns cluster id 0 to noise points (not -1 as in
+# scikit-learn), so relabel 0 as "Noise"; other ids become factor levels.
+neurobiber_df$dbcluster <- as.factor(ifelse(dbscan_result$cluster == 0, "Noise", dbscan_result$cluster))
+
 kmeans_result <- kmeans(X, centers = 10)
 neurobiber_df$cluster <- as.factor(kmeans_result$cluster)
-table(neurobiber_df$cluster)
+table(neurobiber_df$dbcluster)

 pca <- prcomp(X, center = TRUE, scale. = TRUE)
 neurobiber_df$PC1 <- pca$x[,1]
 neurobiber_df$PC2 <- pca$x[,2]


-ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = cluster)) +
+ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = dbcluster)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_minimal() +
-  labs(title = "Within case comment clusters (kmeans) by cross-case PCA",
+  labs(title = "Across-case comment clusters (DBSCAN) by cross-case PCA",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  facet_wrap(~ source)

-ggplot(neurobiber_df, aes(x = phase, y=cluster, fill=AuthorWMFAffil)) +
+ggplot(neurobiber_df, aes(x = phase, y=dbcluster, fill=AuthorWMFAffil)) +
  geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
  theme_minimal() +
  labs(title = "Across-case comment clusters by feature deployment phase",
       x = "Feature deployment phase",
-       y = "Neurobiber feature vector cluster (kmeans)") +
+       y = "Neurobiber feature vector cluster (DBSCAN)") +
  facet_wrap(~ source)

-cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$cluster), FUN = mean)
+cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$dbcluster), FUN = mean)
 rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster)
 cluster_means <- cluster_means[,-1] # Remove cluster label column

@ -75,6 +79,6 @@ colnames(cluster_means) <- BIBER_FEATURES
 library(pheatmap)
 pheatmap(cluster_means, 
         cluster_rows = FALSE,    # FALSE: rows keep their given order (no row dendrogram)
-         cluster_cols = TRUE,   # Clusters (columns) are not clustered
+         cluster_cols = FALSE,   # FALSE: columns keep their given order (no column dendrogram)
         scale = "row")         # Standardizes values within each row; NOTE(review): confirm rows are the features