# Cluster and visualise Neurobiber feature vectors of comments across cases.
#
# Steps performed by this script:
#   1. Read the labelled CSV and collect every `normalized_*` column into a
#      numeric matrix `X` (one row per comment).
#   2. Label each comment with a DBSCAN cluster (`dbcluster`) and a k-means
#      cluster (`cluster`).
#   3. Project `X` onto its first two principal components and plot them.
#   4. Plot DBSCAN cluster membership by deployment phase.
#   5. Draw a heatmap of mean feature values for each
#      (WMF affiliation, phase, comment type, source) group.

library(tidyverse)
library(dplyr)
library(purrr)
library(dbscan)
library(pheatmap)

# Load data ----
neurobiber_csv <- "~/p2/quest/072325_biberplus_labels.csv"
neurobiber_df <- read.csv(neurobiber_csv, header = TRUE)

# Feature matrix ----
normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)

# Keep a per-row numeric vector as a list column, then stack the vectors into
# the matrix used by every model below. `drop = FALSE` keeps a data frame even
# if only one `normalized_*` column is present.
neurobiber_df$normalized_features_vec <- lapply(
  asplit(neurobiber_df[, normalized_cols, drop = FALSE], 1),
  as.numeric
)
X <- do.call(rbind, neurobiber_df$normalized_features_vec)

# Clustering ----
set.seed(808) # fixes the random k-means initialisation below

table(neurobiber_df$source)

#neurobiber_df <- neurobiber_df %>%
#  group_by(source) %>%
#  mutate(cluster = {
#    X_sub <- do.call(rbind, features_vec)
#    as.factor(kmeans(X_sub, centers = 50)$cluster)
#  }) %>%
#  ungroup()

dbscan_result <- dbscan(X, eps = 0.5, minPts = 97)

# The dbscan package marks noise points with cluster id 0 (not -1, which is
# the scikit-learn convention), so 0 is what must be relabelled as "Noise".
neurobiber_df$dbcluster <- as.factor(
  ifelse(dbscan_result$cluster == 0, "Noise", dbscan_result$cluster)
)

kmeans_result <- kmeans(X, centers = 10)
neurobiber_df$cluster <- as.factor(kmeans_result$cluster)

table(neurobiber_df$dbcluster)

# PCA projection ----
pca <- prcomp(X, center = TRUE, scale. = TRUE)
neurobiber_df$PC1 <- pca$x[, 1]
neurobiber_df$PC2 <- pca$x[, 2]

# NOTE(review): the title mentions DBSCAN clusters but points are coloured by
# `phase` -- confirm whether `color = dbcluster` was intended.
ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = phase)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Across-case comment clusters (DBSCAN) by cross-case PCA",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  facet_wrap(~ source)

# DBSCAN cluster membership by phase ----
ggplot(neurobiber_df, aes(x = phase, y = dbcluster, fill = AuthorWMFAffil)) +
  geom_violin(
    trim = FALSE,
    position = position_dodge(width = 0.8),
    alpha = 0.6
  ) +
  theme_minimal() +
  labs(
    title = "Across-case comment clusters by feature deployment phase",
    x = "Feature deployment phase",
    y = "Neurobiber feature vector cluster (DBSCAN)"
  ) +
  facet_wrap(~ source)

# Mean feature values per group ----
group_cols <- c("WMFAffil", "phase", "comment_type", "source")

cluster_means <- aggregate(
  X,
  by = list(
    WMFAffil = neurobiber_df$AuthorWMFAffil,
    phase = neurobiber_df$phase,
    comment_type = neurobiber_df$comment_type,
    source = neurobiber_df$source
  ),
  FUN = mean
)

# Row labels are the grouping values joined by "_". A vectorised paste over
# the columns avoids apply(), which would push the data frame through
# as.matrix() first.
rownames(cluster_means) <- do.call(
  paste,
  c(unname(as.list(cluster_means[, group_cols, drop = FALSE])), sep = "_")
)

# Keep only the feature columns; `drop = FALSE` preserves the data frame shape.
cluster_means <- cluster_means[
  , !(names(cluster_means) %in% group_cols),
  drop = FALSE
]

#cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$AuthorWMFAffil), FUN = mean)
#rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster)
#cluster_means <- cluster_means[,-1]  # Remove cluster label column

BIBER_FEATURES <- c(
  "BIN_QUAN", "BIN_QUPR", "BIN_AMP", "BIN_PASS", "BIN_XX0", "BIN_JJ",
  "BIN_BEMA", "BIN_CAUS", "BIN_CONC", "BIN_COND", "BIN_CONJ", "BIN_CONT",
  "BIN_DPAR", "BIN_DWNT", "BIN_EX", "BIN_FPP1", "BIN_GER", "BIN_RB",
  "BIN_PIN", "BIN_INPR", "BIN_TO", "BIN_NEMD", "BIN_OSUB", "BIN_PASTP",
  "BIN_VBD", "BIN_PHC", "BIN_PIRE", "BIN_PLACE", "BIN_POMD", "BIN_PRMD",
  "BIN_WZPRES", "BIN_VPRT", "BIN_PRIV", "BIN_PIT", "BIN_PUBV", "BIN_SPP2",
  "BIN_SMP", "BIN_SERE", "BIN_STPR", "BIN_SUAV", "BIN_SYNE", "BIN_TPP3",
  "BIN_TIME", "BIN_NOMZ", "BIN_BYPA", "BIN_PRED", "BIN_TOBJ", "BIN_TSUB",
  "BIN_THVC", "BIN_NN", "BIN_DEMP", "BIN_DEMO", "BIN_WHQU", "BIN_EMPH",
  "BIN_HDG", "BIN_WZPAST", "BIN_THAC", "BIN_PEAS", "BIN_ANDC", "BIN_PRESP",
  "BIN_PROD", "BIN_SPAU", "BIN_SPIN", "BIN_THATD", "BIN_WHOBJ", "BIN_WHSUB",
  "BIN_WHCL", "BIN_ART", "BIN_AUXB", "BIN_CAP", "BIN_SCONJ", "BIN_CCONJ",
  "BIN_DET", "BIN_EMOJ", "BIN_EMOT", "BIN_EXCL", "BIN_HASH", "BIN_INF",
  "BIN_UH", "BIN_NUM", "BIN_LAUGH", "BIN_PRP", "BIN_PREP", "BIN_NNP",
  "BIN_QUES", "BIN_QUOT", "BIN_AT", "BIN_SBJP", "BIN_URL", "BIN_WH",
  "BIN_INDA", "BIN_ACCU", "BIN_PGAS", "BIN_CMADJ", "BIN_SPADJ", "BIN_X"
)
BIBER_FEATURES_NO_BIN <- gsub("^BIN_", "", BIBER_FEATURES)

# Assigning a names vector of the wrong length to a data frame does not
# reliably fail, so check explicitly before relabelling the feature columns.
# NOTE(review): this also assumes the `normalized_*` columns appear in the CSV
# in the same order as BIBER_FEATURES -- confirm against the file that wrote it.
if (ncol(cluster_means) != length(BIBER_FEATURES_NO_BIN)) {
  stop(
    "Expected ", length(BIBER_FEATURES_NO_BIN), " feature columns but found ",
    ncol(cluster_means), "; cannot label heatmap columns.",
    call. = FALSE
  )
}
colnames(cluster_means) <- BIBER_FEATURES_NO_BIN

# Heatmap ----
pheatmap(
  cluster_means,
  cluster_rows = FALSE, # keep groups (rows) in aggregate() order
  cluster_cols = FALSE, # keep features (columns) in BIBER_FEATURES order
  scale = "none" # plot raw group means; no row/column standardisation
)