1
0

updated PCA analysis, ready for rob tomorrow

This commit is contained in:
Matthew Gaughan 2025-10-01 20:58:55 -07:00
parent f636969541
commit 7f89fd1966
3 changed files with 55 additions and 30 deletions

23
dsl/human_sampling.R Normal file
View File

@ -0,0 +1,23 @@
library(tidyverse)
# Draw a per-source sample of tasks from the CSV at `main_csv`, then expand
# the sampled rows to one row per extracted sentence.
main_csv <- "~/analysis_data/092925_unified_phab.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Maximum number of distinct TaskPHIDs to draw within each `source` group.
tasks_per_source <- 30

# Return up to `n` distinct values drawn without replacement from `task_ids`.
#
# Sampling is done by index via sample.int(), which is what sample() does
# internally for vectors of length > 1, so the seeded draw is unchanged for
# groups holding at least `n` distinct ids. Capping the size at the number of
# distinct ids avoids the "cannot take a sample larger than the population"
# error for small groups, and indexing avoids sample()'s 1:x special case when
# a group contains a single numeric id.
sample_task_ids <- function(task_ids, n) {
  distinct_ids <- unique(task_ids)
  n_draw <- min(n, length(distinct_ids))
  distinct_ids[sample.int(length(distinct_ids), n_draw)]
}

set.seed(123) # For reproducibility
sampled_df <- main_df |>
  group_by(source) |>
  mutate(
    # Flag every row whose task was drawn for its source group.
    sampled_TaskPHID = TaskPHID %in%
      sample_task_ids(TaskPHID, tasks_per_source)
  ) |>
  ungroup() |>
  filter(sampled_TaskPHID) |>
  select(-sampled_TaskPHID)

# One row per single-quoted chunk found in `olmo_cleaned_sentences`.
# NOTE(review): presumably the column holds a stringified list such as
# "['a', 'b']" -- confirm. The pattern also matches the ", " separator that
# sits between two quoted items, which is why those matches are filtered out.
# Any sentence that itself contains an apostrophe will be split by `[^']+`.
sentence_level_sample <- sampled_df |>
  mutate(
    cleaned_sentences = str_extract_all(
      olmo_cleaned_sentences, "(?<=')[^']+(?=')"
    )
  ) |>
  unnest(cleaned_sentences) |>
  filter(cleaned_sentences != ", ") |>
  select(
    -olmo_sentence_categories,
    -starts_with("normalized"),
    -starts_with("gerrit")
  )

# NOTE(review): 293 and 1.5 are unexplained constants -- document what this
# ratio estimates before relying on the printed value.
(nrow(sentence_level_sample) / 293) * 1.5

# NOTE(review): the earlier draft of this line wrote `output_df`, which is not
# defined in this script; `sentence_level_sample` is presumably the table
# meant to be exported.
# write.csv(sentence_level_sample, "100125_human_info_sample.csv", row.names = FALSE)

View File

@ -1,17 +1,17 @@
1. SSH tunnel from your workstation using the following command: 1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3439:53255 mjilg@klone.hyak.uw.edu ssh -N -L 8787:n3441:52613 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787 and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials: 2. log in to RStudio Server using the following credentials:
user: mjilg user: mjilg
password: eSK3QbcwgGpUya1wJIvC password: YBcIVAgxBCfkvg2tbQqI
When done using RStudio Server, terminate the job by: When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) 1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node: 2. Issue the following command on the login node:
scancel -f 29920945 scancel -f 29944433

View File

@ -1,9 +1,9 @@
library(tidyverse) library(tidyverse)
neurobiber_description_pca_csv <-"~/p2/quest/092325_description_PCA_df.csv" neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv"
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE)
neurobiber_subcomment_pca_csv <-"~/p2/quest/092325_subcomment_PCA_df.csv" neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE)
@ -38,6 +38,10 @@ neurobiber_subcomment_pca_df$comment_type <- "subcomment"
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment) neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment) neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
neurobiber_subcomment_pca_df <- neurobiber_subcomment_pca_df %>%
mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
paste(neurobiber_description_pca_df$AuthorPHID,
neurobiber_description_pca_df$TaskPHID)))
# look at correlation between PC1, PC2, and different outcome variables # look at correlation between PC1, PC2, and different outcome variables
library(dplyr) library(dplyr)
@ -53,29 +57,22 @@ discussion_anova_results
# look at the representative comments for PC1 and PC2 # look at the representative comments for PC1 and PC2
top5 <- neurobiber_subcomment_pca_df %>% top5 <- neurobiber_subcomment_pca_df %>%
arrange(desc(PC6)) %>% arrange(desc(PC2)) %>%
slice(300:310) %>% slice(300:310) %>%
pull(cleaned_comment) pull(cleaned_comment)
bottom5 <- neurobiber_subcomment_pca_df %>% bottom5 <- neurobiber_subcomment_pca_df %>%
arrange(PC6) %>% arrange(PC2) %>%
slice(300:310) %>% slice(300:310) %>%
pull(cleaned_comment) pull(cleaned_comment)
cat("Top 300:310 comment_text by PC1 score:\n") cat("Top 300:310 comment_text by PC2 score:\n")
print(top5) print(top5)
cat("\nBottom 300:310 comment_text by PC1 score:\n") cat("\nBottom 300:310 comment_text by PC1 score:\n")
print(bottom5) print(bottom5)
aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
group_by(AuthorWMFAffil, week_index, source, priority, closed_relevance, phase) %>%
summarise(mean_PC1 = median(PC1),
mean_PC2 = median(PC2),
mean_PC3 = median(PC3),
mean_PC4 = median(PC4),
mean_PC5 = median(PC5))
library(scales) library(scales)
library(ggplot2) library(ggplot2)
@ -85,23 +82,28 @@ affiliationColors <-
,c("False", "True")) ,c("False", "True"))
long_df <- aggregated_neurobiber_description_pca_df %>% neurobiber_subcomment_pca_df_x <- neurobiber_subcomment_pca_df %>%
tidyr::pivot_longer( left_join(
cols = starts_with("mean_PC"), neurobiber_description_pca_df %>%
names_to = "PC", select(TaskPHID, priority),
values_to = "PC_value" by = "TaskPHID"
) ) |>
filter(priority.y %in% c("Lowest","Unbreak Now!"))
unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True")) neurobiber_description_pca_df <- neurobiber_description_pca_df |>
unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ] filter(priority %in% c("Lowest","Unbreak Now!"))
ggplot(neurobiber_description_pca_df, aes(x = PC1, y = PC3, fill = closed_relevance)) + #unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
geom_point(shape = 21, alpha=0.3, size=2) + #unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
facet_grid(source ~ phase) + ggplot(neurobiber_description_pca_df, aes(x = PC3, y = PC8, fill = priority)) +
geom_point(shape = 21, alpha=0.4, size=2) +
facet_grid(source ~ phase, scales="fixed") +
xlim(-10, 10) +
ylim(-10, 10) +
scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
theme_minimal() + theme_minimal() +
labs( labs(
title = "PCs for Task Subcomments (Faceted by Source and Phase)", title = "PCs for Task Descriptions (Faceted by Source and Phase)",
x = "PC1", x = "PC3",
y = "PC3", y = "PC8",
fill = "(tentative affiliation)" fill = "author_same_as_task_creator?"
) )