diff --git a/mgaughan-rstudio-server_30110461.out b/mgaughan-rstudio-server_30110461.out new file mode 100644 index 0000000..3617899 --- /dev/null +++ b/mgaughan-rstudio-server_30110461.out @@ -0,0 +1,17 @@ +1. SSH tunnel from your workstation using the following command: + + ssh -N -L 8787:n3439:51247 mjilg@klone.hyak.uw.edu + + and point your web browser to http://localhost:8787 + +2. log in to RStudio Server using the following credentials: + + user: mjilg + password: z93icQDhumWD6WUbUC34 + +When done using RStudio Server, terminate the job by: + +1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) +2. Issue the following command on the login node: + + scancel -f 30110461 diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R index 8daf6ef..ff4f7c1 100644 --- a/p2/quest/neurobiber_PCA_analysis.R +++ b/p2/quest/neurobiber_PCA_analysis.R @@ -1,17 +1,16 @@ library(tidyverse) - -neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv" +library(dplyr) +neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv" neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text) -neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv" +neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv" neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text) main_csv <- "~/analysis_data/100625_unified_w_affil.csv" main_df <- read.csv(main_csv , header = TRUE) main_df <- main_df |> - select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title) - + select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority) # Join main_df to neurobiber_description_pca_df description_joined <- main_df |> right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |> @@ -59,7 +58,6 @@ subcomment_joined <- subcomment_joined %>% neurobiber_description_pca_df$TaskPHID))) # look at correlation between PC1, PC2, and different outcome variables -library(dplyr) description_anova_results <- neurobiber_description_pca_df %>% group_by(source) %>% group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE) @@ -71,20 +69,20 @@ discussion_anova_results <- neurobiber_subcomment_pca_df %>% discussion_anova_results # look at the representative comments for PC1 and PC2 -top5 <- neurobiber_subcomment_pca_df %>% - arrange(desc(PC6)) %>% +top5 <- neurobiber_description_pca_df %>% + arrange(desc(PC2)) %>% slice(300:310) %>% pull(cleaned_comment) -bottom5 <- neurobiber_subcomment_pca_df %>% - arrange(PC6) %>% +bottom5 <- neurobiber_description_pca_df %>% + arrange(PC2) %>% slice(300:310) %>% pull(cleaned_comment) cat("Top 300:310 comment_text by PC2 score:\n") print(top5) -cat("\nBottom 300:310 comment_text by PC1 score:\n") +cat("\nBottom 300:310 comment_text by PC2 score:\n") print(bottom5) @@ -97,23 +95,68 @@ affiliationColors <- ,c("False", "True")) subcomment_joined_no_gerrit <- subcomment_joined |> - filter(isGerritBot != "TRUE") + filter(isGerritBot != "TRUE") |> + left_join(neurobiber_description_pca_df |> select(TaskPHID, priority), by = "TaskPHID") #unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True")) #unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ] # geom_point(shape = 21, alpha=0.4, size=2) + # geom_bin_2d() + -ggplot(subcomment_joined_no_gerrit, aes(x = PC2, y = PC1, fill = isAuthorWMF)) + - facet_grid(source ~ pair_in_description, scales="fixed") + + +sampled_authors <- subcomment_joined_no_gerrit %>% + distinct(AuthorPHID) %>% + sample_n(100) %>% + pull(AuthorPHID) + +# 2. Filter original data to just those authors +sub_sample <- subcomment_joined_no_gerrit %>% + filter(AuthorPHID %in% sampled_authors) + +description_sampled_authors <- description_joined %>% + distinct(AuthorPHID) %>% + sample_n(8) %>% + pull(AuthorPHID) + +# 2. Filter original data to just those authors +description_sub_sample <- description_joined %>% + filter(AuthorPHID %in% description_sampled_authors) + +ggplot(description_sub_sample, aes(x = PC2, y = PC1, fill = AuthorPHID)) + + facet_grid(source~phase, scales="fixed") + geom_point(shape = 21, alpha=0.3, size=2) + - xlim(-15, 15) + - ylim(-15, 15) + - scale_fill_viridis_d() + + xlim(-30, 30) + + ylim(-30, 30) + + scale_fill_brewer(palette = "Set1") + theme_minimal() + + guides(fill = "none") + labs( - title = "PCs for Task Comments (Faceted by source and pair_in_description)", + title = "PCs for Task Comments (Faceted by source and phase)", x = "PC2", y = "PC1", + ) + +priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage") + +subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>% + mutate(priority = factor(priority, levels = priority_order)) + +description_joined <- description_joined %>% + mutate(priority = factor(priority.y, levels = priority_order)) + +ggplot(description_joined, aes( + x = as.factor(priority), # x-axis grouping + y = PC2, + fill = AuthorPHID +)) + + ylim(-20, 20) + + geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) + + facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed + scale_fill_viridis_d() + + theme_minimal() + + labs( + title = "Boxplot of PC2 for Task Descriptions", + x = "Task priority", + y = "PC2", fill = "isAuthorWMF?" )