From 2e0665488cd36123522382a6cea2ad997c20d9fd Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Wed, 16 Jul 2025 14:03:51 -0700
Subject: [PATCH] updating with dbscan clustering etc.

---
Reviewer notes (text in this section is ignored by `git am`):

* What the patch does: runs dbscan(X, eps = 0.5, minPts = 97), stores the
  labels in neurobiber_df$dbcluster, and switches the table() call, the PCA
  scatter plot colour, the violin plot y aesthetic and the aggregate()
  grouping from the kmeans `cluster` column to `dbcluster`. The kmeans fit
  itself is kept. The heatmap's cluster_cols is set to FALSE.
* Fix applied in review: the noise test compared the DBSCAN label against
  -1. The R dbscan package reports noise points as cluster 0, so the
  "Noise" level was never created; the comparison is now `== 0`.
* Because one added line changed, the post-image blob id on the `index`
  line below (01ce1ff) no longer matches the resulting file.
* This patch text had lost its line breaks and indentation; both were
  reconstructed to agree with the hunk headers and the diffstat. If context
  lines do not match the target file exactly, apply with
  `git apply --ignore-whitespace`.
* NOTE(review): the untouched context comment on `cluster_rows = FALSE`
  says features are clustered, which disagrees with the value; it is left
  alone here because it is a context line.

 p2/quest/neurobiber_EDA.R | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/p2/quest/neurobiber_EDA.R b/p2/quest/neurobiber_EDA.R
index 92e7a98..01ce1ff 100644
--- a/p2/quest/neurobiber_EDA.R
+++ b/p2/quest/neurobiber_EDA.R
@@ -24,32 +24,36 @@ table(neurobiber_df$source)
 # as.factor(kmeans(X_sub, centers = 50)$cluster)
 # }) %>%
 # ungroup()
+library(dbscan)
+dbscan_result <- dbscan(X, eps = 0.5, minPts = 97)
+neurobiber_df$dbcluster <- as.factor(ifelse(dbscan_result$cluster == 0, "Noise", dbscan_result$cluster)) # dbscan() labels noise points 0
+
 kmeans_result <- kmeans(X, centers = 10)
 neurobiber_df$cluster <- as.factor(kmeans_result$cluster)
 
-table(neurobiber_df$cluster)
+table(neurobiber_df$dbcluster)
 
 pca <- prcomp(X, center = TRUE, scale. = TRUE)
 neurobiber_df$PC1 <- pca$x[,1]
 neurobiber_df$PC2 <- pca$x[,2]
 
-ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = cluster)) +
+ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = dbcluster)) +
   geom_point(size = 2, alpha = 0.7) +
   theme_minimal() +
-  labs(title = "Within case comment clusters (kmeans) by cross-case PCA",
+  labs(title = "Across-case comment clusters (DBSCAN) by cross-case PCA",
        x = "Principal Component 1",
        y = "Principal Component 2") +
   facet_wrap(~ source)
 
-ggplot(neurobiber_df, aes(x = phase, y=cluster, fill=AuthorWMFAffil)) +
+ggplot(neurobiber_df, aes(x = phase, y=dbcluster, fill=AuthorWMFAffil)) +
   geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
   theme_minimal() +
   labs(title = "Across-case comment clusters by feature deployment phase",
        x = "Feature deployment phase",
-       y = "Neurobiber feature vector cluster (kmeans)") +
+       y = "Neurobiber feature vector cluster (DBSCAN)") +
   facet_wrap(~ source)
 
-cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$cluster), FUN = mean)
+cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$dbcluster), FUN = mean)
 rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster)
 cluster_means <- cluster_means[,-1] # Remove cluster label column
 
@@ -75,6 +79,6 @@ colnames(cluster_means) <- BIBER_FEATURES
 library(pheatmap)
 pheatmap(cluster_means,
          cluster_rows = FALSE, # Now features are clustered (rows)
-         cluster_cols = TRUE, # Clusters (columns) are not clustered
+         cluster_cols = FALSE, # Clusters (columns) are not clustered
          scale = "row") # Standardize features
 