updated PCA analysis, ready for rob tomorrow
This commit is contained in:
parent
f636969541
commit
7f89fd1966
23
dsl/human_sampling.R
Normal file
23
dsl/human_sampling.R
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
# Location of the input CSV (presumably the unified Phabricator export,
# judging by the file name — confirm).
main_csv <- "~/analysis_data/092925_unified_phab.csv"

# Load the full dataset; the first row of the file holds the column names.
main_df <- read.csv(file = main_csv, header = TRUE)

# Fix the RNG seed so the random task sample below is reproducible.
set.seed(123)
|
||||||
|
|
||||||
|
# Draw up to `n` distinct ids from `ids`, without replacement.
#
# Sampling by index with sample.int() avoids two pitfalls of
# sample(unique(ids), n): it no longer errors when fewer than `n` distinct
# ids exist (the draw is capped at what is available), and a single numeric
# id is not reinterpreted as the range 1:id. When at least `n` non-scalar
# ids exist, the draw is the same one sample() would make for a given seed.
sample_task_ids <- function(ids, n = 30L) {
  distinct_ids <- unique(ids)
  n_keep <- min(n, length(distinct_ids))
  distinct_ids[sample.int(length(distinct_ids), n_keep)]
}

# Keep every row that belongs to one of (at most) 30 randomly chosen tasks
# within each `source` group; grouping is dropped again afterwards.
sampled_df <- main_df %>%
  group_by(source) %>%
  filter(TaskPHID %in% sample_task_ids(TaskPHID, 30L)) %>%
  ungroup()
|
||||||
|
|
||||||
|
# Expand the sampled tasks to one row per extracted sentence.
sentence_level_sample <- sampled_df |>
  # Drop columns that are not needed in the sentence-level output.
  select(
    -olmo_sentence_categories,
    -starts_with("normalized"),
    -starts_with("gerrit")
  ) |>
  # Pull out every run of non-quote characters that sits between two single
  # quotes; each row gets a character vector of matches.
  mutate(
    cleaned_sentences = str_extract_all(
      olmo_cleaned_sentences,
      "(?<=')[^']+(?=')"
    )
  ) |>
  # One row per match.
  unnest(cleaned_sentences) |>
  # The pattern also captures the ", " separators that sit between quoted
  # items, so discard those rows.
  filter(cleaned_sentences != ", ")
|
||||||
|
|
||||||
|
# Print a quick estimate: number of sentence rows divided by 293, times 1.5.
# NOTE(review): 293 and 1.5 are unexplained constants — confirm what they
# represent and consider naming them.
(nrow(sentence_level_sample) / 293) * 1.5

# Export is disabled. The earlier version of this line referenced `output_df`,
# which is never defined in this script; `sentence_level_sample` is presumably
# the intended object — confirm before re-enabling.
# write.csv(sentence_level_sample, "100125_human_info_sample.csv", row.names = FALSE)
|
||||||
@ -1,17 +1,17 @@
|
|||||||
1. SSH tunnel from your workstation using the following command:
|
1. SSH tunnel from your workstation using the following command:
|
||||||
|
|
||||||
ssh -N -L 8787:n3439:53255 mjilg@klone.hyak.uw.edu
|
ssh -N -L 8787:n3441:52613 mjilg@klone.hyak.uw.edu
|
||||||
|
|
||||||
and point your web browser to http://localhost:8787
|
and point your web browser to http://localhost:8787
|
||||||
|
|
||||||
2. log in to RStudio Server using the following credentials:
|
2. log in to RStudio Server using the following credentials:
|
||||||
|
|
||||||
user: mjilg
|
user: mjilg
|
||||||
password: eSK3QbcwgGpUya1wJIvC
|
password: YBcIVAgxBCfkvg2tbQqI
|
||||||
|
|
||||||
When done using RStudio Server, terminate the job by:
|
When done using RStudio Server, terminate the job by:
|
||||||
|
|
||||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||||
2. Issue the following command on the login node:
|
2. Issue the following command on the login node:
|
||||||
|
|
||||||
scancel -f 29920945
|
scancel -f 29944433
|
||||||
@ -1,9 +1,9 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
|
|
||||||
neurobiber_description_pca_csv <-"~/p2/quest/092325_description_PCA_df.csv"
|
neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv"
|
||||||
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE)
|
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE)
|
||||||
|
|
||||||
neurobiber_subcomment_pca_csv <-"~/p2/quest/092325_subcomment_PCA_df.csv"
|
neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv"
|
||||||
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE)
|
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE)
|
||||||
|
|
||||||
|
|
||||||
@ -38,6 +38,10 @@ neurobiber_subcomment_pca_df$comment_type <- "subcomment"
|
|||||||
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
|
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
|
||||||
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
|
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
|
||||||
|
|
||||||
|
# Flag each subcomment whose (AuthorPHID, TaskPHID) combination also occurs
# among the task descriptions. Both sides are compared as space-joined
# "AuthorPHID TaskPHID" strings.
neurobiber_subcomment_pca_df <- neurobiber_subcomment_pca_df %>%
  mutate(
    pair_in_description = paste(AuthorPHID, TaskPHID) %in%
      with(neurobiber_description_pca_df, paste(AuthorPHID, TaskPHID))
  )
|
||||||
|
|
||||||
# look at correlation between PC1, PC2, and different outcome variables
|
# look at correlation between PC1, PC2, and different outcome variables
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
@ -53,29 +57,22 @@ discussion_anova_results
|
|||||||
|
|
||||||
# look at the representative comments for PC1 and PC2
|
# look at the representative comments for PC1 and PC2
|
||||||
top5 <- neurobiber_subcomment_pca_df %>%
|
top5 <- neurobiber_subcomment_pca_df %>%
|
||||||
arrange(desc(PC6)) %>%
|
arrange(desc(PC2)) %>%
|
||||||
slice(300:310) %>%
|
slice(300:310) %>%
|
||||||
pull(cleaned_comment)
|
pull(cleaned_comment)
|
||||||
|
|
||||||
bottom5 <- neurobiber_subcomment_pca_df %>%
|
bottom5 <- neurobiber_subcomment_pca_df %>%
|
||||||
arrange(PC6) %>%
|
arrange(PC2) %>%
|
||||||
slice(300:310) %>%
|
slice(300:310) %>%
|
||||||
pull(cleaned_comment)
|
pull(cleaned_comment)
|
||||||
|
|
||||||
cat("Top 300:310 comment_text by PC1 score:\n")
|
cat("Top 300:310 comment_text by PC2 score:\n")
|
||||||
print(top5)
|
print(top5)
|
||||||
|
|
||||||
cat("\nBottom 300:310 comment_text by PC1 score:\n")
|
# bottom5 is ordered by PC2, so the printed label names PC2 as well.
cat("\nBottom 300:310 comment_text by PC2 score:\n")
|
||||||
print(bottom5)
|
print(bottom5)
|
||||||
|
|
||||||
|
|
||||||
aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
|
|
||||||
group_by(AuthorWMFAffil, week_index, source, priority, closed_relevance, phase) %>%
|
|
||||||
summarise(mean_PC1 = median(PC1),
|
|
||||||
mean_PC2 = median(PC2),
|
|
||||||
mean_PC3 = median(PC3),
|
|
||||||
mean_PC4 = median(PC4),
|
|
||||||
mean_PC5 = median(PC5))
|
|
||||||
library(scales)
|
library(scales)
|
||||||
library(ggplot2)
|
library(ggplot2)
|
||||||
|
|
||||||
@ -85,23 +82,28 @@ affiliationColors <-
|
|||||||
,c("False", "True"))
|
,c("False", "True"))
|
||||||
|
|
||||||
|
|
||||||
long_df <- aggregated_neurobiber_description_pca_df %>%
|
neurobiber_subcomment_pca_df_x <- neurobiber_subcomment_pca_df %>%
|
||||||
tidyr::pivot_longer(
|
left_join(
|
||||||
cols = starts_with("mean_PC"),
|
neurobiber_description_pca_df %>%
|
||||||
names_to = "PC",
|
select(TaskPHID, priority),
|
||||||
values_to = "PC_value"
|
by = "TaskPHID"
|
||||||
)
|
) |>
|
||||||
|
filter(priority.y %in% c("Lowest","Unbreak Now!"))
|
||||||
|
|
||||||
unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
|
neurobiber_description_pca_df <- neurobiber_description_pca_df |>
|
||||||
unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
|
filter(priority %in% c("Lowest","Unbreak Now!"))
|
||||||
ggplot(neurobiber_description_pca_df, aes(x = PC1, y = PC3, fill = closed_relevance)) +
|
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
|
||||||
geom_point(shape = 21, alpha=0.3, size=2) +
|
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
|
||||||
facet_grid(source ~ phase) +
|
ggplot(neurobiber_description_pca_df, aes(x = PC3, y = PC8, fill = priority)) +
|
||||||
|
geom_point(shape = 21, alpha=0.4, size=2) +
|
||||||
|
facet_grid(source ~ phase, scales="fixed") +
|
||||||
|
xlim(-10, 10) +
|
||||||
|
ylim(-10, 10) +
|
||||||
scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
|
scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
|
||||||
theme_minimal() +
|
theme_minimal() +
|
||||||
labs(
|
labs(
|
||||||
title = "PCs for Task Subcomments (Faceted by Source and Phase)",
|
title = "PCs for Task Descriptions (Faceted by Source and Phase)",
|
||||||
x = "PC1",
|
x = "PC3",
|
||||||
y = "PC3",
|
y = "PC8",
|
||||||
fill = "(tentative affiliation)"
|
fill = "author_same_as_task_creator?"
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user