diff --git a/p2/quest/neurobiber_EDA.R b/p2/quest/neurobiber_EDA.R index 01ce1ff..c445de3 100644 --- a/p2/quest/neurobiber_EDA.R +++ b/p2/quest/neurobiber_EDA.R @@ -1,15 +1,16 @@ library(tidyverse) -neurobiber_csv <-"~/p2/quest/071525_neurobiber_labels.csv" +neurobiber_csv <-"~/p2/quest/072325_biberplus_labels.csv" neurobiber_df <- read.csv(neurobiber_csv , header = TRUE) -neurobiber_df$features_vec <- lapply(neurobiber_df$neurobiber_preds, function(x) { - x <- gsub("\\[|\\]", "", x) - x <- trimws(x) - as.numeric(unlist(strsplit(x, "\\s+"))) -}) +normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE) + +neurobiber_df$normalized_features_vec <- lapply( + asplit(neurobiber_df[, normalized_cols], 1), as.numeric +) + +X <- do.call(rbind, neurobiber_df$normalized_features_vec) -X <- do.call(rbind, neurobiber_df$features_vec ) set.seed(808) @@ -37,7 +38,7 @@ neurobiber_df$PC1 <- pca$x[,1] neurobiber_df$PC2 <- pca$x[,2] -ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = dbcluster)) + +ggplot(neurobiber_df, aes(x = PC1, y = PC2, color = phase)) + geom_point(size = 2, alpha = 0.7) + theme_minimal() + labs(title = "Across-case comment clusters (DBSCAN) by cross-case PCA", @@ -53,9 +54,27 @@ ggplot(neurobiber_df, aes(x = phase, y=dbcluster, fill=AuthorWMFAffil)) + y = "Neurobiber feature vector cluster (DBSCAN)") + facet_wrap(~ source) -cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$dbcluster), FUN = mean) -rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster) -cluster_means <- cluster_means[,-1] # Remove cluster label column +cluster_means <- aggregate( + X, + by = list( + WMFAffil = neurobiber_df$AuthorWMFAffil, + phase = neurobiber_df$phase, + comment_type = neurobiber_df$comment_type, + source= neurobiber_df$source + ), + FUN = mean +) + +rownames(cluster_means) <- apply( + cluster_means[, c("WMFAffil", "phase", "comment_type", "source")], 1, + function(x) paste(x, collapse = "_") +) + +cluster_means <- cluster_means[, !(names(cluster_means) %in% c("WMFAffil", "phase", "comment_type", "source"))] + +#cluster_means <- aggregate(X, by = list(Cluster = neurobiber_df$AuthorWMFAffil), FUN = mean) +#rownames(cluster_means) <- paste0("Cluster_", cluster_means$Cluster) +#cluster_means <- cluster_means[,-1] # Remove cluster label column BIBER_FEATURES <- c( "BIN_QUAN","BIN_QUPR","BIN_AMP","BIN_PASS","BIN_XX0","BIN_JJ", @@ -75,10 +94,11 @@ BIBER_FEATURES <- c( "BIN_QUES","BIN_QUOT","BIN_AT","BIN_SBJP","BIN_URL","BIN_WH", "BIN_INDA","BIN_ACCU","BIN_PGAS","BIN_CMADJ","BIN_SPADJ","BIN_X" ) -colnames(cluster_means) <- BIBER_FEATURES +BIBER_FEATURES_NO_BIN <- gsub("^BIN_", "", BIBER_FEATURES) +colnames(cluster_means) <- BIBER_FEATURES_NO_BIN library(pheatmap) pheatmap(cluster_means, cluster_rows = FALSE, # Now features are clustered (rows) - cluster_cols = FALSE, # Clusters (columns) are not clustered - scale = "row") # Standardize features + cluster_cols = FALSE, + scale='none') # Standardize features