preliminary EDA on the PCA analysis
This commit is contained in:
parent
b21ecb02c3
commit
acd8964e73
17
mgaughan-rstudio-server_29836350.out
Normal file
17
mgaughan-rstudio-server_29836350.out
Normal file
@ -0,0 +1,17 @@
|
||||
1. SSH tunnel from your workstation using the following command:
|
||||
|
||||
ssh -N -L 8787:n3439:44313 mjilg@klone.hyak.uw.edu
|
||||
|
||||
and point your web browser to http://localhost:8787
|
||||
|
||||
2. log in to RStudio Server using the following credentials:
|
||||
|
||||
user: mjilg
|
||||
password: inZLmycTjFs4aqmFIMs1
|
||||
|
||||
When done using RStudio Server, terminate the job by:
|
||||
|
||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||
2. Issue the following command on the login node:
|
||||
|
||||
scancel -f 29836350
|
||||
@ -1,12 +1,19 @@
|
||||
# Load the tidyverse; its core packages (dplyr, ggplot2, tidyr, ...) are used
# throughout this script.
library(tidyverse)

# Read the PCA exports for task descriptions and for subcomments.
# Only the 092325 exports are loaded: the 090425 paths used to be assigned
# first and were overwritten on the very next statement, so they never took
# effect.
neurobiber_description_pca_csv <- "~/p2/quest/092325_description_PCA_df.csv"
neurobiber_description_pca_df <- read.csv(
  neurobiber_description_pca_csv,
  header = TRUE
)

neurobiber_subcomment_pca_csv <- "~/p2/quest/092325_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <- read.csv(
  neurobiber_subcomment_pca_csv,
  header = TRUE
)

# Add comment_type column to each df
neurobiber_description_pca_df$comment_type <- "task_description"
neurobiber_subcomment_pca_df$comment_type <- "subcomment"

# Combine them
# NOTE(review): rbind() on data frames needs both inputs to have the same
# column names -- confirm the two exports share a schema.
unified_df <- rbind(neurobiber_description_pca_df, neurobiber_subcomment_pca_df)

# look at correlation between PC1, PC2, and different outcome variables
library(dplyr)
|
||||
description_anova_results <- neurobiber_description_pca_df %>%
|
||||
@ -20,27 +27,25 @@ discussion_anova_results <- neurobiber_subcomment_pca_df %>%
|
||||
discussion_anova_results
|
||||
|
||||
# look at the representative comments for PC1 and PC2
|
||||
# Pull example texts from the task-description data frame, ranked by PC1.
# Rows 500-510 of each ordering are taken rather than the very first rows.
# The names top5/bottom5 are kept as they were, although each holds the texts
# of an 11-row slice.
top5 <- neurobiber_description_pca_df %>%
  arrange(desc(PC1)) %>%
  slice(500:510) %>%
  pull(text)

bottom5 <- neurobiber_description_pca_df %>%
  arrange(PC1) %>%
  slice(500:510) %>%
  pull(text)

# The printed headers state the same row range that slice() selects above.
cat("Top 500:510 comment_text by score:\n")
print(top5)

cat("\nBottom 500:510 comment_text by score:\n")
print(bottom5)
|
||||
|
||||
|
||||
aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
|
||||
group_by(AuthorWMFAffil, week_index, source, priority) %>%
|
||||
group_by(AuthorWMFAffil, week_index, source, priority, closed_relevance, phase) %>%
|
||||
summarise(mean_PC1 = median(PC1),
|
||||
mean_PC2 = median(PC2),
|
||||
mean_PC3 = median(PC3),
|
||||
@ -62,13 +67,16 @@ long_df <- aggregated_neurobiber_description_pca_df %>%
|
||||
values_to = "PC_value"
|
||||
)
|
||||
|
||||
# Line plot of PC_value over week_index, coloured and grouped by
# AuthorWMFAffil, with one panel per (PC, source) combination.
# The aggregation step computes these values with median(), so the y axis is
# labelled as a median to agree with the plot title.
# NOTE(review): affiliationColors and pretty_breaks() are not defined in the
# visible part of this script -- presumably a palette defined elsewhere and
# scales::pretty_breaks(); confirm both are in scope.
ggplot(
  long_df,
  aes(
    x = week_index,
    y = PC_value,
    color = AuthorWMFAffil,
    group = AuthorWMFAffil
  )
) +
  geom_line(size = 1) +
  # Each PC row gets its own y range.
  facet_grid(PC ~ source, scales = "free_y") +
  scale_color_manual(values = affiliationColors, name = "WMF Affiliation") +
  scale_x_continuous(breaks = pretty_breaks()) +
  # Values outside [-10, 10] are removed by the scale limits.
  scale_y_continuous(limits = c(-10, 10)) +
  labs(
    x = "Week Index",
    y = "Median PC Value",
    title = "Weekly Median PC Values by Source and PC, Colored by WMF Affiliation"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")
|
||||
# Scatter of PC3 against PC4 for every row of unified_df, one panel per
# (source, phase) combination, filled by the author's WMF affiliation.

# read.csv() converts a column that holds only True/False into a logical
# vector; factor(<logical>, levels = c("False", "True")) would then match
# nothing and produce all NA. Going through lower-cased text handles both the
# logical and the character case and still yields the levels "False", "True".
unified_df$AuthorWMFAffil <- factor(
  tolower(as.character(unified_df$AuthorWMFAffil)),
  levels = c("false", "true"),
  labels = c("False", "True")
)

# Sort by affiliation so the "True" rows come after the "False" rows;
# presumably so that they are drawn on top of the denser "False" points.
unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]

ggplot(unified_df, aes(x = PC3, y = PC4, fill = AuthorWMFAffil)) +
  # Shape 21 is a filled circle, so the fill aesthetic colours the points.
  geom_point(shape = 21, alpha = 0.3, size = 2) +
  facet_grid(source ~ phase) +
  scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
  theme_minimal() +
  labs(
    title = "PCs for All Comments (Faceted by Source and Phase)",
    x = "PC3",
    y = "PC4",
    # The legend describes the fill variable, which is AuthorWMFAffil.
    fill = "WMF Affiliation"
  )
|
||||
|
||||
Loading…
Reference in New Issue
Block a user