83 lines
2.5 KiB
R
83 lines
2.5 KiB
R
library(tidyverse)
|
|
|
|
# Input locations for the description-level and subcomment-level PCA tables.
neurobiber_description_pca_csv <- "~/p2/quest/092325_description_PCA_df.csv"
neurobiber_subcomment_pca_csv <- "~/p2/quest/092325_subcomment_PCA_df.csv"

# Load both tables, treating the first row of each file as the header.
neurobiber_description_pca_df <- read.csv(
  file = neurobiber_description_pca_csv,
  header = TRUE
)
neurobiber_subcomment_pca_df <- read.csv(
  file = neurobiber_subcomment_pca_csv,
  header = TRUE
)

# Tag every row with the table it came from so the origin survives stacking.
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"

# Stack the two tables row-wise; rbind() requires matching column names.
unified_df <- rbind(
  neurobiber_description_pca_df,
  neurobiber_subcomment_pca_df
)
|
|
|
|
# one-way ANOVA of PC2 by phase, fitted separately within each source
|
|
library(dplyr)
|
|
# Fit a one-way ANOVA of PC2 on phase separately for each `source` group.
#
# @param pca_df Data frame with `source`, `phase` and `PC2` columns.
# @return A list of `summary(aov(...))` objects, one per `source` group,
#   named by the group's `source` value so printed output is identifiable.
anova_pc2_by_source <- function(pca_df) {
  by_source <- group_by(pca_df, source)
  fits <- group_map(
    by_source,
    ~ summary(aov(PC2 ~ phase, data = .x)),
    .keep = TRUE
  )
  # group_map() returns an unnamed list in group_keys() order; label each
  # element with its source so results are not just [[1]], [[2]], ...
  names(fits) <- as.character(group_keys(by_source)$source)
  fits
}

# Task descriptions: PC2 ~ phase within each source.
description_anova_results <- anova_pc2_by_source(neurobiber_description_pca_df)
description_anova_results

# Subcomments: PC2 ~ phase within each source.
discussion_anova_results <- anova_pc2_by_source(neurobiber_subcomment_pca_df)
discussion_anova_results
|
|
|
|
# Look at representative description texts along PC1.
# A single rank window drives both the slices and the printed headings, so the
# headings always describe the rows that are actually shown.
# NOTE(review): `top5`/`bottom5` keep their original names for compatibility,
# but each holds up to one text per rank in the window, not five.
pc1_rank_window <- 500:510
pc1_rank_label <- paste0(min(pc1_rank_window), ":", max(pc1_rank_window))

# Ranks counted from the highest PC1 value downwards.
top5 <- neurobiber_description_pca_df %>%
  arrange(desc(PC1)) %>%
  slice(pc1_rank_window) %>%
  pull(text)

# The same ranks counted from the lowest PC1 value upwards.
bottom5 <- neurobiber_description_pca_df %>%
  arrange(PC1) %>%
  slice(pc1_rank_window) %>%
  pull(text)

cat("Top ", pc1_rank_label, " comment_text by score:\n", sep = "")
print(top5)

cat("\nBottom ", pc1_rank_label, " comment_text by score:\n", sep = "")
print(bottom5)
|
|
|
|
|
|
# Collapse the description-level table to one row per combination of
# AuthorWMFAffil, week_index, source, priority, closed_relevance and phase,
# summarising PC1..PC5 within each combination.
# NOTE(review): the output columns are named mean_PC* but hold medians. The
# names are kept because the later pivot_longer() step selects columns with
# starts_with("mean_PC"); confirm whether mean or median is intended.
aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df %>%
  group_by(
    AuthorWMFAffil, week_index, source, priority, closed_relevance, phase
  ) %>%
  summarise(
    mean_PC1 = median(PC1),
    mean_PC2 = median(PC2),
    mean_PC3 = median(PC3),
    mean_PC4 = median(PC4),
    mean_PC5 = median(PC5),
    # Return an ungrouped table. The default would leave the result grouped by
    # every key except `phase` and emit a regrouping message.
    .groups = "drop"
  )
|
|
library(scales)
|
|
library(ggplot2)
|
|
|
|
|
|
# Named colour lookup keyed by the labels "False" and "True", which are the
# same labels used as factor levels for AuthorWMFAffil further down.
# The plot below uses scale_fill_viridis_d() rather than this palette.
affiliationColors <- c(False = "#5da2d8", True = "#c7756a")
|
|
|
|
|
|
# Reshape the aggregated table from wide to long: every column whose name
# starts with "mean_PC" becomes a (PC, PC_value) pair, where PC holds the
# original column name and PC_value holds its value.
long_df <- tidyr::pivot_longer(
  aggregated_neurobiber_description_pca_df,
  cols = starts_with("mean_PC"),
  names_to = "PC",
  values_to = "PC_value"
)
|
|
|
|
# Scatter plot of PC3 against PC4 for every row of unified_df, filled by
# AuthorWMFAffil and faceted by source (rows) and phase (columns).

# read.csv() parses fields written as True/False into logical values. factor()
# would then compare "TRUE"/"FALSE" against the levels below, match nothing,
# and turn the whole column into NA. Convert logicals to the expected labels
# first; character input is left as it is.
if (is.logical(unified_df$AuthorWMFAffil)) {
  unified_df$AuthorWMFAffil <- ifelse(
    unified_df$AuthorWMFAffil, "True", "False"
  )
}
unified_df$AuthorWMFAffil <- factor(
  unified_df$AuthorWMFAffil,
  levels = c("False", "True")
)

# Sort rows by affiliation level ("False" first, then "True").
# NOTE(review): presumably so "True" points are drawn on top; confirm intent.
unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]

ggplot(unified_df, aes(x = PC3, y = PC4, fill = AuthorWMFAffil)) +
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(source ~ phase) +
  scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
  theme_minimal() +
  labs(
    title = "PCs for All Comments (Faceted by Source and Phase)",
    x = "PC3",
    y = "PC4",
    # The fill aesthetic is AuthorWMFAffil, not comment_type.
    fill = "Author WMF affiliation"
  )
|