1
0

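Update the PCA analysis scripts: clean comments in `total_joined`, and switch the representative-comment lookup and the plots from PC1/PC2 to PC3/PC4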

This commit is contained in:
Matthew Gaughan 2025-10-20 11:09:04 -07:00
parent f146016eac
commit b198781aa0

View File

@ -59,6 +59,7 @@ neurobiber_subcomment_pca_df$comment_type <- "subcomment"
#clean the messages #clean the messages
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment) neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment) neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
total_joined$cleaned_comment <- sapply(total_joined$text, preprocess_comment)
subcomment_joined <- subcomment_joined %>% subcomment_joined <- subcomment_joined %>%
mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in% mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
@ -77,13 +78,13 @@ discussion_anova_results <- neurobiber_subcomment_pca_df %>%
discussion_anova_results discussion_anova_results
# look at the representative comments for PC1 and PC2 # look at the representative comments for PC1 and PC2
top5 <- neurobiber_description_pca_df %>% top5 <- total_joined %>%
arrange(desc(PC2)) %>% arrange(desc(PC4)) %>%
slice(300:310) %>% slice(300:310) %>%
pull(cleaned_comment) pull(cleaned_comment)
bottom5 <- neurobiber_description_pca_df %>% bottom5 <- total_joined %>%
arrange(PC2) %>% arrange(PC4) %>%
slice(300:310) %>% slice(300:310) %>%
pull(cleaned_comment) pull(cleaned_comment)
@ -130,18 +131,17 @@ description_sampled_authors <- description_joined %>%
description_sub_sample <- description_joined %>% description_sub_sample <- description_joined %>%
filter(AuthorPHID %in% description_sampled_authors) filter(AuthorPHID %in% description_sampled_authors)
ggplot(description_sub_sample, aes(x = PC2, y = PC1, fill = AuthorPHID)) + ggplot(total_joined, aes(x = PC4, y = PC3, fill = comment_type)) +
facet_grid(source~phase, scales="fixed") + facet_grid(source~phase, scales="fixed") +
geom_point(shape = 21, alpha=0.3, size=2) + geom_point(shape = 21, alpha=0.3, size=2) +
xlim(-30, 30) + xlim(-30, 30) +
ylim(-30, 30) + ylim(-30, 30) +
scale_fill_brewer(palette = "Set1") + scale_fill_viridis_d() +
theme_minimal() + theme_minimal() +
guides(fill = "none") +
labs( labs(
title = "PCs for Task Comments (Faceted by source and phase)", title = "PCs for Task Comments (Faceted by source and phase)",
x = "PC2", x = "PC4",
y = "PC1", y = "PC3",
) )
priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage") priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")
@ -153,18 +153,18 @@ description_joined <- description_joined %>%
mutate(priority = factor(priority.y, levels = priority_order)) mutate(priority = factor(priority.y, levels = priority_order))
ggplot(total_joined, aes( ggplot(total_joined, aes(
x = PC1, # x-axis grouping x = as.factor(comment_type), # x-axis grouping
y = PC2, y = PC3,
fill = comment_type fill = isAuthorWMF
)) + )) +
ylim(-20, 20) + ylim(-30, 30) +
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) + geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
scale_fill_viridis_d() + scale_fill_viridis_d() +
theme_minimal() + theme_minimal() +
labs( labs(
title = "Boxplot of PC2 for Task Descriptions", title = "Boxplot of PC4",
x = "Task priority", x = "Comment_type",
y = "PC2", y = "PC4",
fill = "isAuthorWMF?" fill = "isAuthorWMF?"
) )