1
0

updating with dbscan clustering etc.

This commit is contained in:
Matthew Gaughan 2025-07-16 14:03:51 -07:00
parent 90e69975d2
commit 2e0665488c

View File

@ -24,32 +24,36 @@ table(neurobiber_df$source)
# as.factor(kmeans(X_sub, centers = 50)$cluster) # as.factor(kmeans(X_sub, centers = 50)$cluster)
# }) %>% # }) %>%
# ungroup() # ungroup()
library(dbscan)
# Density-based clustering of the feature matrix X.
# eps is the neighbourhood radius; minPts is the minimum number of points
# required in that neighbourhood for a point to count as a core point.
dbscan_result <- dbscan(X, eps = 0.5, minPts = 97)
# dbscan::dbscan() marks noise points with cluster id 0 (-1 is the
# scikit-learn convention and never occurs here), so 0 is the value that
# has to be relabelled as "Noise".
neurobiber_df$dbcluster <- as.factor(
  ifelse(dbscan_result$cluster == 0L, "Noise", dbscan_result$cluster)
)
kmeans_result <- kmeans(X, centers = 10) kmeans_result <- kmeans(X, centers = 10)
neurobiber_df$cluster <- as.factor(kmeans_result$cluster) neurobiber_df$cluster <- as.factor(kmeans_result$cluster)
table(neurobiber_df$cluster) table(neurobiber_df$dbcluster)
pca <- prcomp(X, center = TRUE, scale. = TRUE) pca <- prcomp(X, center = TRUE, scale. = TRUE)
neurobiber_df$PC1 <- pca$x[,1] neurobiber_df$PC1 <- pca$x[,1]
neurobiber_df$PC2 <- pca$x[,2] neurobiber_df$PC2 <- pca$x[,2]
ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = cluster)) + ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = dbcluster)) +
geom_point(size = 2, alpha = 0.7) + geom_point(size = 2, alpha = 0.7) +
theme_minimal() + theme_minimal() +
labs(title = "Within case comment clusters (kmeans) by cross-case PCA", labs(title = "Across-case comment clusters (DBSCAN) by cross-case PCA",
x = "Principal Component 1", x = "Principal Component 1",
y = "Principal Component 2") + y = "Principal Component 2") +
facet_wrap(~ source) facet_wrap(~ source)
ggplot(neurobiber_df, aes(x = phase, y=cluster, fill=AuthorWMFAffil)) + ggplot(neurobiber_df, aes(x = phase, y=dbcluster, fill=AuthorWMFAffil)) +
geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) + geom_violin(trim = FALSE, position = position_dodge(width = 0.8), alpha = 0.6) +
theme_minimal() + theme_minimal() +
labs(title = "Across-case comment clusters by feature deployment phase", labs(title = "Across-case comment clusters by feature deployment phase",
x = "Feature deployment phase", x = "Feature deployment phase",
y = "Neurobiber feature vector cluster (kmeans)") + y = "Neurobiber feature vector cluster (DBSCAN)") +
facet_wrap(~ source) facet_wrap(~ source)
cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$cluster), FUN = mean) cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$dbcluster), FUN = mean)
rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster) rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster)
cluster_means <- cluster_means[,-1] # Remove cluster label column cluster_means <- cluster_means[,-1] # Remove cluster label column
@ -75,6 +79,6 @@ colnames(cluster_means) <- BIBER_FEATURES
library(pheatmap) library(pheatmap)
pheatmap(cluster_means, pheatmap(cluster_means,
cluster_rows = FALSE, # Now features are clustered (rows) cluster_rows = FALSE, # Now features are clustered (rows)
cluster_cols = TRUE, # Clusters (columns) are not clustered cluster_cols = FALSE, # Clusters (columns) are not clustered
scale = "row") # Standardize features scale = "row") # Standardize features