1
0

updating with dbscan clustering etc.

This commit is contained in:
Matthew Gaughan 2025-07-16 14:03:51 -07:00
parent 90e69975d2
commit 2e0665488c

View File

@ -24,32 +24,36 @@ table(neurobiber_df$source)
# as.factor(kmeans(X_sub, centers = 50)$cluster)
# }) %>%
# ungroup()
# Density-based clustering of the feature matrix X (X is defined outside this excerpt).
library(dbscan)
dbscan_result <- dbscan(X, eps = 0.5, minPts = 97)
# dbscan::dbscan() reports noise points with cluster id 0 (not -1, which is the
# scikit-learn convention), so compare against 0 when relabelling them as "Noise".
neurobiber_df$dbcluster <- as.factor(ifelse(dbscan_result$cluster == 0, "Noise", dbscan_result$cluster))
# k-means partition of X into 10 clusters; the labels are stored as a factor column.
# NOTE(review): no set.seed() call is visible in this excerpt; kmeans() uses random
# starting centers, so labels may differ between runs -- confirm a seed is set earlier.
kmeans_result <- kmeans(X, centers = 10)
neurobiber_df$cluster <- as.factor(kmeans_result$cluster)
# Cluster-size tables for the k-means labels and for the DBSCAN labels.
table(neurobiber_df$cluster)
table(neurobiber_df$dbcluster)
# PCA on centered, unit-variance-scaled X; keep the scores on the first two
# components as columns for plotting.
pca <- prcomp(X, center = TRUE, scale. = TRUE)
neurobiber_df$PC1 <- pca$x[,1]
neurobiber_df$PC2 <- pca$x[,2]
# Scatter plot of the first two principal components, colored by cluster label,
# with one facet per value of `source`.
# NOTE(review): this listing shows two consecutive variants of the ggplot() call
# and of the labs(title = ...) line (k-means `cluster` vs DBSCAN `dbcluster`);
# presumably these are the before/after lines of the change -- only one of each
# pair can be present in a runnable script.
ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = cluster)) +
ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = dbcluster)) +
geom_point(size = 2, alpha = 0.7) +
theme_minimal() +
labs(title = "Within case comment clusters (kmeans) by cross-case PCA",
labs(title = "Across-case comment clusters (DBSCAN) by cross-case PCA",
x = "Principal Component 1",
y = "Principal Component 2") +
facet_wrap(~ source)
# Violin plot of cluster label against `phase`, filled by `AuthorWMFAffil`,
# with one facet per value of `source`.
# NOTE(review): `cluster` and `dbcluster` are created with as.factor(), so y is a
# discrete variable here; geom_violin() draws a density estimate, which normally
# expects a continuous variable -- confirm this geometry is intended.
# NOTE(review): the two ggplot() lines and the two `y =` label lines appear to be
# the before/after (k-means vs DBSCAN) variants of the same statement.
ggplot(neurobiber_df, aes(x = phase, y=cluster, fill=AuthorWMFAffil)) +
ggplot(neurobiber_df, aes(x = phase, y=dbcluster, fill=AuthorWMFAffil)) +
geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
theme_minimal() +
labs(title = "Across-case comment clusters by feature deployment phase",
x = "Feature deployment phase",
y = "Neurobiber feature vector cluster (kmeans)") +
y = "Neurobiber feature vector cluster (DBSCAN)") +
facet_wrap(~ source)
# Mean of every column of X within each cluster label (one output row per label).
# NOTE(review): the two aggregate() lines differ only in the grouping column
# (`cluster` vs `dbcluster`); presumably the before/after variants of the change.
cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$cluster), FUN = mean)
cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$dbcluster), FUN = mean)
# Name each row "Cluster_<label>" so the label survives dropping its column below.
rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster)
cluster_means <- cluster_means[,-1] # Remove cluster label column
@ -75,6 +79,6 @@ colnames(cluster_means) <- BIBER_FEATURES
library(pheatmap)
# Heatmap of the per-cluster means.
# NOTE(review): earlier comments described rows as features and columns as
# clusters; the orientation of `cluster_means` at this point depends on lines not
# shown in this excerpt -- confirm before relying on row/column meaning.
pheatmap(cluster_means,
cluster_rows = FALSE, # Rows are not hierarchically clustered (original order kept)
cluster_cols = TRUE, # Columns are hierarchically clustered
cluster_cols = FALSE, # Columns are not hierarchically clustered (original order kept)
scale = "row") # Standardize values within each row