updated PCA analysis
This commit is contained in:
parent
0843685707
commit
d86233abca
17
mgaughan-rstudio-server_30110461.out
Normal file
17
mgaughan-rstudio-server_30110461.out
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
1. SSH tunnel from your workstation using the following command:
|
||||||
|
|
||||||
|
ssh -N -L 8787:n3439:51247 mjilg@klone.hyak.uw.edu
|
||||||
|
|
||||||
|
and point your web browser to http://localhost:8787
|
||||||
|
|
||||||
|
2. Log in to RStudio Server using the following credentials:
|
||||||
|
|
||||||
|
user: mjilg
|
||||||
|
password: z93icQDhumWD6WUbUC34
|
||||||
|
|
||||||
|
When done using RStudio Server, terminate the job by:
|
||||||
|
|
||||||
|
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||||
|
2. Issue the following command on the login node:
|
||||||
|
|
||||||
|
scancel -f 30110461
|
||||||
@ -1,17 +1,16 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
|
library(dplyr)
|
||||||
neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv"
|
neurobiber_description_pca_csv <-"~/p2/quest/101325_description_PCA_df.csv"
|
||||||
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||||
|
|
||||||
neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv"
|
neurobiber_subcomment_pca_csv <-"~/p2/quest/101325_subcomment_PCA_df.csv"
|
||||||
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||||
|
|
||||||
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
|
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
|
||||||
main_df <- read.csv(main_csv , header = TRUE)
|
main_df <- read.csv(main_csv , header = TRUE)
|
||||||
|
|
||||||
main_df <- main_df |>
|
main_df <- main_df |>
|
||||||
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title)
|
select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title, priority)
|
||||||
|
|
||||||
# Join main_df to neurobiber_description_pca_df
|
# Join main_df to neurobiber_description_pca_df
|
||||||
description_joined <- main_df |>
|
description_joined <- main_df |>
|
||||||
right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
|
right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
|
||||||
@ -59,7 +58,6 @@ subcomment_joined <- subcomment_joined %>%
|
|||||||
neurobiber_description_pca_df$TaskPHID)))
|
neurobiber_description_pca_df$TaskPHID)))
|
||||||
|
|
||||||
# look at correlation between PC1, PC2, and different outcome variables
|
# look at correlation between PC1, PC2, and different outcome variables
|
||||||
library(dplyr)
|
|
||||||
description_anova_results <- neurobiber_description_pca_df %>%
|
description_anova_results <- neurobiber_description_pca_df %>%
|
||||||
group_by(source) %>%
|
group_by(source) %>%
|
||||||
group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
|
group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE)
|
||||||
@ -71,20 +69,20 @@ discussion_anova_results <- neurobiber_subcomment_pca_df %>%
|
|||||||
discussion_anova_results
|
discussion_anova_results
|
||||||
|
|
||||||
# look at the representative comments for PC1 and PC2
|
# look at the representative comments for PC1 and PC2
|
||||||
top5 <- neurobiber_subcomment_pca_df %>%
|
top5 <- neurobiber_description_pca_df %>%
|
||||||
arrange(desc(PC6)) %>%
|
arrange(desc(PC2)) %>%
|
||||||
slice(300:310) %>%
|
slice(300:310) %>%
|
||||||
pull(cleaned_comment)
|
pull(cleaned_comment)
|
||||||
|
|
||||||
bottom5 <- neurobiber_subcomment_pca_df %>%
|
bottom5 <- neurobiber_description_pca_df %>%
|
||||||
arrange(PC6) %>%
|
arrange(PC2) %>%
|
||||||
slice(300:310) %>%
|
slice(300:310) %>%
|
||||||
pull(cleaned_comment)
|
pull(cleaned_comment)
|
||||||
|
|
||||||
cat("Top 300:310 comment_text by PC2 score:\n")
|
cat("Top 300:310 comment_text by PC2 score:\n")
|
||||||
print(top5)
|
print(top5)
|
||||||
|
|
||||||
cat("\nBottom 300:310 comment_text by PC1 score:\n")
|
cat("\nBottom 300:310 comment_text by PC2 score:\n")
|
||||||
print(bottom5)
|
print(bottom5)
|
||||||
|
|
||||||
|
|
||||||
@ -97,23 +95,68 @@ affiliationColors <-
|
|||||||
,c("False", "True"))
|
,c("False", "True"))
|
||||||
|
|
||||||
subcomment_joined_no_gerrit <- subcomment_joined |>
|
subcomment_joined_no_gerrit <- subcomment_joined |>
|
||||||
filter(isGerritBot != "TRUE")
|
filter(isGerritBot != "TRUE") |>
|
||||||
|
left_join(neurobiber_description_pca_df |> select(TaskPHID, priority), by = "TaskPHID")
|
||||||
|
|
||||||
|
|
||||||
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
|
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
|
||||||
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
|
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
|
||||||
# geom_point(shape = 21, alpha=0.4, size=2) +
|
# geom_point(shape = 21, alpha=0.4, size=2) +
|
||||||
# geom_bin_2d() +
|
# geom_bin_2d() +
|
||||||
ggplot(subcomment_joined_no_gerrit, aes(x = PC2, y = PC1, fill = isAuthorWMF)) +
|
|
||||||
facet_grid(source ~ pair_in_description, scales="fixed") +
|
sampled_authors <- subcomment_joined_no_gerrit %>%
|
||||||
|
distinct(AuthorPHID) %>%
|
||||||
|
sample_n(100) %>%
|
||||||
|
pull(AuthorPHID)
|
||||||
|
|
||||||
|
# Keep only the comment rows written by the sampled authors
|
||||||
|
sub_sample <- subcomment_joined_no_gerrit %>%
|
||||||
|
filter(AuthorPHID %in% sampled_authors)
|
||||||
|
|
||||||
|
description_sampled_authors <- description_joined %>%
|
||||||
|
distinct(AuthorPHID) %>%
|
||||||
|
sample_n(8) %>%
|
||||||
|
pull(AuthorPHID)
|
||||||
|
|
||||||
|
# Keep only the description rows written by the sampled authors
|
||||||
|
description_sub_sample <- description_joined %>%
|
||||||
|
filter(AuthorPHID %in% description_sampled_authors)
|
||||||
|
|
||||||
|
ggplot(description_sub_sample, aes(x = PC2, y = PC1, fill = AuthorPHID)) +
|
||||||
|
facet_grid(source~phase, scales="fixed") +
|
||||||
geom_point(shape = 21, alpha=0.3, size=2) +
|
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||||
xlim(-15, 15) +
|
xlim(-30, 30) +
|
||||||
ylim(-15, 15) +
|
ylim(-30, 30) +
|
||||||
scale_fill_viridis_d() +
|
scale_fill_brewer(palette = "Set1") +
|
||||||
theme_minimal() +
|
theme_minimal() +
|
||||||
|
guides(fill = "none") +
|
||||||
labs(
|
labs(
|
||||||
title = "PCs for Task Comments (Faceted by source and pair_in_description)",
|
title = "PCs for Task Comments (Faceted by source and phase)",
|
||||||
x = "PC2",
|
x = "PC2",
|
||||||
y = "PC1",
|
y = "PC1",
|
||||||
|
)
|
||||||
|
|
||||||
|
priority_order <- c("Unbreak Now!", "High", "Medium", "Low", "Lowest", "Needs Triage")
|
||||||
|
|
||||||
|
subcomment_joined_no_gerrit <- subcomment_joined_no_gerrit %>%
|
||||||
|
mutate(priority = factor(priority, levels = priority_order))
|
||||||
|
|
||||||
|
description_joined <- description_joined %>%
|
||||||
|
mutate(priority = factor(priority.y, levels = priority_order))
|
||||||
|
|
||||||
|
ggplot(description_joined, aes(
|
||||||
|
x = as.factor(priority), # x-axis grouping
|
||||||
|
y = PC2,
|
||||||
|
fill = AuthorPHID
|
||||||
|
)) +
|
||||||
|
ylim(-20, 20) +
|
||||||
|
geom_boxplot(alpha = 0.7, position = position_dodge(width = 0.9)) +
|
||||||
|
facet_grid(. ~ source, scales = "fixed") + # Facet by source; adjust as needed
|
||||||
|
scale_fill_viridis_d() +
|
||||||
|
theme_minimal() +
|
||||||
|
labs(
|
||||||
|
title = "Boxplot of PC2 for Task Descriptions",
|
||||||
|
x = "Task priority",
|
||||||
|
y = "PC2",
|
||||||
fill = "isAuthorWMF?"
|
fill = "isAuthorWMF?"
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user