# Exploratory analysis of the PCA scores for task descriptions and
# subcomments: per-source ANOVAs of PC2 by phase, a look at representative
# description texts along PC1, grouped medians of PC1-PC5, and a faceted
# PC3-vs-PC4 scatter plot of all comments.

# Packages ----
# Load order is kept as in the original script so function masking between
# packages is unchanged.
library(tidyverse)
library(dplyr)
library(scales)
library(ggplot2)

# Load data ----
neurobiber_description_pca_csv <- "~/p2/quest/092325_description_PCA_df.csv"
neurobiber_description_pca_df <- read.csv(
  neurobiber_description_pca_csv,
  header = TRUE
)

neurobiber_subcomment_pca_csv <- "~/p2/quest/092325_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <- read.csv(
  neurobiber_subcomment_pca_csv,
  header = TRUE
)

# Tag every row with the kind of comment it came from, then stack both tables.
# rbind() requires the two data frames to have identical column names.
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"
unified_df <- rbind(
  neurobiber_description_pca_df,
  neurobiber_subcomment_pca_df
)

# ANOVA of PC2 by phase, per source ----

#' Run a one-way ANOVA of PC2 on phase separately for each source
#'
#' @param pca_df A data frame with columns `source`, `phase` and `PC2`.
#' @return A list of `summary(aov(...))` objects, one per `source` group,
#'   named by the group's `source` value so printed results are identifiable.
run_phase_anova_by_source <- function(pca_df) {
  by_source <- dplyr::group_by(pca_df, source)
  anova_summaries <- dplyr::group_map(
    by_source,
    \(group_df, group_key) summary(aov(PC2 ~ phase, data = group_df)),
    .keep = TRUE
  )
  # group_keys() lists the groups in the same order group_map() visits them.
  setNames(
    anova_summaries,
    as.character(dplyr::group_keys(by_source)$source)
  )
}

description_anova_results <- run_phase_anova_by_source(
  neurobiber_description_pca_df
)
description_anova_results

discussion_anova_results <- run_phase_anova_by_source(
  neurobiber_subcomment_pca_df
)
discussion_anova_results

# Representative description texts along PC1 ----
# One rank window shared by both ends of the PC1 ordering, so the printed
# headers always describe the rows that were actually selected.
representative_ranks <- 500:510
rank_label <- sprintf(
  "%d-%d",
  min(representative_ranks),
  max(representative_ranks)
)

# NOTE: the names `top5` / `bottom5` are historical; each holds one text per
# rank in `representative_ranks`, not five.
top5 <- neurobiber_description_pca_df |>
  arrange(desc(PC1)) |>
  slice(representative_ranks) |>
  pull(text)

bottom5 <- neurobiber_description_pca_df |>
  arrange(PC1) |>
  slice(representative_ranks) |>
  pull(text)

cat(sprintf("Texts ranked %s by descending PC1:\n", rank_label))
print(top5)
cat(sprintf("\nTexts ranked %s by ascending PC1:\n", rank_label))
print(bottom5)

# Grouped summary of PC1-PC5 ----
# NOTE: the summary columns are named `mean_PC*` but hold the group MEDIAN.
# The names are kept because `starts_with("mean_PC")` below (and possibly
# later code) depends on them.
aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
  group_by(
    AuthorWMFAffil, week_index, source, priority, closed_relevance, phase
  ) |>
  summarise(
    mean_PC1 = median(PC1),
    mean_PC2 = median(PC2),
    mean_PC3 = median(PC3),
    mean_PC4 = median(PC4),
    mean_PC5 = median(PC5),
    # Return an ungrouped table so later verbs do not silently act per group.
    .groups = "drop"
  )

# Palette keyed by affiliation level. The plot below uses viridis instead;
# pass this to scale_fill_manual(values = affiliationColors) to apply it.
affiliationColors <- setNames(
  c("#5da2d8", "#c7756a"),
  c("False", "True")
)

# Long format: one row per (group, principal component).
long_df <- aggregated_neurobiber_description_pca_df |>
  tidyr::pivot_longer(
    cols = starts_with("mean_PC"),
    names_to = "PC",
    values_to = "PC_value"
  )

# Scatter plot of PC3 vs PC4 for all comments ----
# read.csv() converts a column containing only True/False (and NA) to
# logical; as.character() of that yields "TRUE"/"FALSE", which would match
# neither factor level and turn every value into NA. Map logicals back to the
# "False"/"True" labels first. Character input passes through untouched.
author_affil <- unified_df$AuthorWMFAffil
if (is.logical(author_affil)) {
  author_affil <- dplyr::if_else(author_affil, "True", "False")
}
unified_df$AuthorWMFAffil <- factor(author_affil, levels = c("False", "True"))

# Sort so "True" rows come last and are therefore drawn on top of "False".
unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]

ggplot(unified_df, aes(x = PC3, y = PC4, fill = AuthorWMFAffil)) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(source ~ phase) +
  scale_fill_viridis_d() + # or scale_fill_brewer / a palette of your choice
  theme_minimal() +
  labs(
    title = "PCs for All Comments (Faceted by Source and Phase)",
    x = "PC3",
    y = "PC4",
    # The fill aesthetic is AuthorWMFAffil, not comment_type.
    fill = "Author WMF Affiliation"
  )