preliminary EDA around neurobiber
This commit is contained in:
		
							parent
							
								
									43fb346318
								
							
						
					
					
						commit
						90e69975d2
					
				
							
								
								
									
										80
									
								
								p2/quest/neurobiber_EDA.R
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								p2/quest/neurobiber_EDA.R
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,80 @@ | |||||||
|  | library(tidyverse) | ||||||
|  | 
 | ||||||
# Location of the labelled comments export; the first row holds column names.
neurobiber_csv <- "~/p2/quest/071525_neurobiber_labels.csv"
neurobiber_df <- read.csv(file = neurobiber_csv, header = TRUE)
|  | 
 | ||||||
# Parse each row's `neurobiber_preds` string into a numeric vector: strip the
# square brackets, trim surrounding whitespace, then split on whitespace runs.
neurobiber_df$features_vec <- lapply(
  neurobiber_df$neurobiber_preds,
  function(x) {
    x <- gsub("\\[|\\]", "", x)
    x <- trimws(x)
    as.numeric(unlist(strsplit(x, "\\s+")))
  }
)

# Fail early with a clear message. rbind() silently drops zero-length vectors
# and recycles ragged ones, which would leave X misaligned with the rows of
# neurobiber_df, and NA values would only surface later inside kmeans().
feature_lengths <- lengths(neurobiber_df$features_vec)
if (length(feature_lengths) == 0L ||
      any(feature_lengths == 0L) ||
      length(unique(feature_lengths)) != 1L) {
  stop(
    "neurobiber_preds must parse to non-empty vectors of equal length",
    call. = FALSE
  )
}
if (anyNA(unlist(neurobiber_df$features_vec))) {
  stop("neurobiber_preds contains non-numeric or missing values", call. = FALSE)
}

# One row per data-frame row, one column per parsed value.
X <- do.call(rbind, neurobiber_df$features_vec)
|  | 
 | ||||||
# Fix the RNG state so the random k-means initialisation is reproducible.
set.seed(808)

library(dplyr)
library(purrr)

# Row counts per `source` value.
table(neurobiber_df$source)

# Disabled alternative: fit a separate 50-centre k-means within each source.
# neurobiber_df <- neurobiber_df %>%
#   group_by(source) %>%
#   mutate(cluster = {
#     X_sub <- do.call(rbind, features_vec)
#     as.factor(kmeans(X_sub, centers = 50)$cluster)
#   }) %>%
#   ungroup()

# Active version: a single 10-centre k-means over every row of X.
kmeans_result <- kmeans(x = X, centers = 10)
neurobiber_df$cluster <- as.factor(kmeans_result$cluster)

# Cluster sizes.
table(neurobiber_df$cluster)
|  | 
 | ||||||
# PCA on the centred, unit-variance feature matrix; keep the scores on the
# first two components as plotting coordinates.
pca <- prcomp(x = X, center = TRUE, scale. = TRUE)
neurobiber_df$PC1 <- pca$x[, "PC1"]
neurobiber_df$PC2 <- pca$x[, "PC2"]
|  | 
 | ||||||
|  | 
 | ||||||
# Scatter of the first two principal components, coloured by k-means cluster,
# with one panel per `source`.
# NOTE(review): the title says "Within case", but `cluster` comes from a single
# k-means fit over all rows -- confirm the wording is intended.
ggplot(
  data = neurobiber_df,
  mapping = aes(x = PC1, y = PC2, colour = cluster)
) +
  geom_point(alpha = 0.7, size = 2) +
  facet_wrap(~ source) +
  labs(
    title = "Within case comment clusters (kmeans) by cross-case PCA",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal()
|  | 
 | ||||||
# Violin plot of cluster against `phase`, filled by `AuthorWMFAffil`, with one
# panel per `source`.
# NOTE(review): `cluster` is a factor of k-means ids; a violin treats the y
# axis as continuous -- confirm this is the intended view of the data.
ggplot(
  data = neurobiber_df,
  mapping = aes(x = phase, y = cluster, fill = AuthorWMFAffil)
) +
  geom_violin(
    alpha = 0.6,
    trim = FALSE,
    position = position_dodge(width = 0.8)
  ) +
  facet_wrap(~ source) +
  labs(
    title = "Across-case comment clusters by feature deployment phase",
    x = "Feature deployment phase",
    y = "Neurobiber feature vector cluster (kmeans)"
  ) +
  theme_minimal()
|  | 
 | ||||||
# Mean of every column of X within each k-means cluster (one row per cluster).
cluster_means <- aggregate(
  x = X,
  by = list(Cluster = neurobiber_df$cluster),
  FUN = mean
)

# Move the cluster id into the row names, then drop the id column so that only
# the per-column means remain.
row.names(cluster_means) <- paste0("Cluster_", cluster_means$Cluster)
cluster_means <- cluster_means[, -1]
|  | 
 | ||||||
# Labels applied, in order, to the columns of `cluster_means`.
BIBER_FEATURES <- c(
  "BIN_QUAN", "BIN_QUPR", "BIN_AMP", "BIN_PASS", "BIN_XX0", "BIN_JJ",
  "BIN_BEMA", "BIN_CAUS", "BIN_CONC", "BIN_COND", "BIN_CONJ", "BIN_CONT",
  "BIN_DPAR", "BIN_DWNT", "BIN_EX", "BIN_FPP1", "BIN_GER", "BIN_RB",
  "BIN_PIN", "BIN_INPR", "BIN_TO", "BIN_NEMD", "BIN_OSUB", "BIN_PASTP",
  "BIN_VBD", "BIN_PHC", "BIN_PIRE", "BIN_PLACE", "BIN_POMD", "BIN_PRMD",
  "BIN_WZPRES", "BIN_VPRT", "BIN_PRIV", "BIN_PIT", "BIN_PUBV", "BIN_SPP2",
  "BIN_SMP", "BIN_SERE", "BIN_STPR", "BIN_SUAV", "BIN_SYNE", "BIN_TPP3",
  "BIN_TIME", "BIN_NOMZ", "BIN_BYPA", "BIN_PRED", "BIN_TOBJ", "BIN_TSUB",
  "BIN_THVC", "BIN_NN", "BIN_DEMP", "BIN_DEMO", "BIN_WHQU", "BIN_EMPH",
  "BIN_HDG", "BIN_WZPAST", "BIN_THAC", "BIN_PEAS", "BIN_ANDC", "BIN_PRESP",
  "BIN_PROD", "BIN_SPAU", "BIN_SPIN", "BIN_THATD", "BIN_WHOBJ", "BIN_WHSUB",
  "BIN_WHCL", "BIN_ART", "BIN_AUXB", "BIN_CAP", "BIN_SCONJ", "BIN_CCONJ",
  "BIN_DET", "BIN_EMOJ", "BIN_EMOT", "BIN_EXCL", "BIN_HASH", "BIN_INF",
  "BIN_UH", "BIN_NUM", "BIN_LAUGH", "BIN_PRP", "BIN_PREP", "BIN_NNP",
  "BIN_QUES", "BIN_QUOT", "BIN_AT", "BIN_SBJP", "BIN_URL", "BIN_WH",
  "BIN_INDA", "BIN_ACCU", "BIN_PGAS", "BIN_CMADJ", "BIN_SPADJ", "BIN_X"
)

# Guard against a column-count mismatch: assigning names of the wrong length to
# a data frame is not reliably an error, so features could be mislabelled
# without any visible failure.
if (ncol(cluster_means) != length(BIBER_FEATURES)) {
  stop(
    "cluster_means has ", ncol(cluster_means), " columns but BIBER_FEATURES ",
    "has ", length(BIBER_FEATURES), " names",
    call. = FALSE
  )
}
colnames(cluster_means) <- BIBER_FEATURES
library(pheatmap)

# `cluster_means` has one row per k-means cluster and one column per feature.
# Standardising features therefore means scaling by column; scaling by row
# would z-score each cluster across its features instead.
# A feature whose mean is identical in every cluster has zero spread, becomes
# NaN under column scaling and breaks the column dendrogram, so drop those.
has_spread <- vapply(
  cluster_means,
  function(feature_mean) isTRUE(stats::sd(feature_mean) > 0),
  logical(1)
)
if (!all(has_spread)) {
  message(
    "Dropping features with no variation across clusters: ",
    paste(names(cluster_means)[!has_spread], collapse = ", ")
  )
}

pheatmap(
  cluster_means[, has_spread, drop = FALSE],
  cluster_rows = FALSE, # rows are k-means clusters: keep their given order
  cluster_cols = TRUE,  # columns are features: order them by a dendrogram
  scale = "column"      # standardise each feature across the clusters
)
|  | 
 | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user