some neurobiber PCA analysis
This commit is contained in:
parent
a96fd6db2f
commit
6de62f2447
18
mgaughan-rstudio-server_28911380.out
Normal file
18
mgaughan-rstudio-server_28911380.out
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
1. SSH tunnel from your workstation using the following command:
|
||||||
|
|
||||||
|
ssh -N -L 8787:n3441:47269 mjilg@klone.hyak.uw.edu
|
||||||
|
|
||||||
|
and point your web browser to http://localhost:8787
|
||||||
|
|
||||||
|
2. log in to RStudio Server using the following credentials:
|
||||||
|
|
||||||
|
user: mjilg
|
||||||
|
password: 9Qgk9UkRdmKalTKyDmH4
|
||||||
|
|
||||||
|
When done using RStudio Server, terminate the job by:
|
||||||
|
|
||||||
|
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
||||||
|
2. Issue the following command on the login node:
|
||||||
|
|
||||||
|
scancel -f 28911380
|
||||||
|
[2025-09-05T14:55:26.103] error: *** JOB 28911380 ON n3441 CANCELLED AT 2025-09-05T14:55:26 DUE TO TIME LIMIT ***
|
74
p2/quest/neurobiber_PCA_analysis.R
Normal file
74
p2/quest/neurobiber_PCA_analysis.R
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
# Read the two PCA result tables: one for task descriptions, one for
# sub-comments. Columns used later in this script include `source`, `phase`,
# `text`, `week_index`, `priority`, `AuthorWMFAffil`, and `PC1`..`PC5`.
neurobiber_description_pca_csv <- "~/p2/quest/090425_description_PCA_df.csv"
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv, header = TRUE)

neurobiber_subcomment_pca_csv <- "~/p2/quest/090425_subcomment_PCA_df.csv"
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv, header = TRUE)

# Relationship between PC2 and project phase, tested separately per source.
library(dplyr)

# Fit a one-way ANOVA of PC2 on phase within each `source` group.
#
# pca_df: data frame with `source`, `phase`, and `PC2` columns.
# Returns a list of `summary(aov(...))` objects, one per source, named by the
# source value so each printed table can be traced back to its group (the
# bare `group_map()` result is an unnamed list).
anova_pc2_by_source <- function(pca_df) {
  grouped <- group_by(pca_df, source)
  summaries <- group_map(
    grouped,
    ~ summary(aov(PC2 ~ phase, data = .x)),
    .keep = TRUE
  )
  # group_keys() lists groups in the same order group_map() visits them.
  setNames(summaries, as.character(group_keys(grouped)$source))
}

description_anova_results <- anova_pc2_by_source(neurobiber_description_pca_df)
description_anova_results

discussion_anova_results <- anova_pc2_by_source(neurobiber_subcomment_pca_df)
discussion_anova_results
|
||||||
|
|
||||||
|
# Pull example sub-comments from source "c2" at both ends of PC2.
# Note: despite the variable names, each vector holds the comments ranked
# 15 through 30 (16 texts), not the first five.
c2_subcomments <- neurobiber_subcomment_pca_df %>%
  filter(source == "c2")

# Ranks 15-30 when sorted from highest PC2 downward.
top5 <- c2_subcomments %>%
  arrange(desc(PC2)) %>%
  slice(15:30) %>%
  pull(text)

# Ranks 15-30 when sorted from lowest PC2 upward.
bottom5 <- c2_subcomments %>%
  arrange(PC2) %>%
  slice(15:30) %>%
  pull(text)

cat("Top 15:30 comment_text by score:\n")
print(top5)

cat("\nBottom 15:30 comment_text by score:\n")
print(bottom5)
|
||||||
|
|
||||||
|
|
||||||
|
library(scales)
library(ggplot2)

# Weekly median of PC1-PC5 for every affiliation / source / priority cell.
# The output columns carry a `median_` prefix so that the names (and the facet
# strip labels derived from them) match the statistic actually computed.
# `.groups = "drop"` returns an ungrouped frame and silences the regrouping
# message summarise() otherwise prints.
aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |>
  group_by(AuthorWMFAffil, week_index, source, priority) |>
  summarise(
    across(all_of(paste0("PC", 1:5)), median, .names = "median_{.col}"),
    .groups = "drop"
  )

# Line colours keyed by the AuthorWMFAffil values "False" / "True".
affiliationColors <- setNames(
  c("#5da2d8", "#c7756a"),
  c("False", "True")
)

# One row per (cell, principal component) so the PCs can be faceted.
long_df <- aggregated_neurobiber_description_pca_df |>
  tidyr::pivot_longer(
    cols = starts_with("median_PC"),
    names_to = "PC",
    values_to = "PC_value"
  )

# NOTE(review): the summary above keeps one row per priority level, but the
# lines below are grouped only by affiliation, so a single week can contribute
# several points to the same line. Confirm whether priority should be dropped
# from the grouping or mapped to an aesthetic/facet.
ggplot(
  long_df,
  aes(x = week_index, y = PC_value, color = AuthorWMFAffil, group = AuthorWMFAffil)
) +
  # `linewidth` replaces the deprecated `size` argument for lines.
  geom_line(linewidth = 1) +
  facet_grid(PC ~ source, scales = "free_y") +
  scale_color_manual(values = affiliationColors, name = "WMF Affiliation") +
  scale_x_continuous(breaks = pretty_breaks()) +
  # NOTE(review): these fixed limits apply to every panel, so `free_y` above
  # has no visible effect, and values outside [-10, 10] are removed rather
  # than clipped. Confirm this is intended.
  scale_y_continuous(limits = c(-10, 10)) +
  labs(
    x = "Week Index",
    y = "Median PC Value",
    title = "Weekly Median PC Values by Source and PC, Colored by WMF Affiliation"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")
|
@ -1,147 +0,0 @@
|
|||||||
library(tidyverse)
|
|
||||||
|
|
||||||
# Read the labelled feature table.
neurobiber_csv <- "~/p2/quest/072525_pp_biberplus_labels.csv"
neurobiber_df <- read.csv(neurobiber_csv, header = TRUE)

# Feature columns are those whose name begins with "normalized_".
normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE)

# Keep each row's feature values as a single numeric vector in a list column,
# so the vectors follow their rows through the filter below.
neurobiber_df$normalized_features_vec <- neurobiber_df[, normalized_cols] |>
  asplit(1) |>
  lapply(as.numeric)

library(dplyr)

# Restrict to task descriptions, then collapse raw statuses into three
# outcomes:
#   duplicate, declined, invalid -> declined
#   stalled, open, progress      -> open
#   resolved                     -> resolved
# Any other status value is passed through unchanged.
neurobiber_df <- filter(neurobiber_df, comment_type == "task_description")
neurobiber_df <- mutate(
  neurobiber_df,
  task_status = case_when(
    status %in% c("duplicate", "declined", "invalid") ~ "declined",
    status %in% c("stalled", "open", "progress") ~ "open",
    status == "resolved" ~ "resolved",
    TRUE ~ status
  )
)

# Stack the per-row vectors into a matrix (rows = documents). The matrix has
# no column names, so aggregate() below labels the features V1, V2, ...
X <- do.call(rbind, neurobiber_df$normalized_features_vec)

library(coop)
#cos_sim1 <- coop::cosine(t(X))

# Mean feature vector for every outcome x source x affiliation combination.
register_means <- aggregate(
  x = X,
  by = list(
    outcome = neurobiber_df$task_status,
    source = neurobiber_df$source,
    affiliation = neurobiber_df$AuthorWMFAffil
  ),
  FUN = mean
)

# The first three columns are the grouping keys; the rest are features.
feature_mat <- as.matrix(register_means[, -(1:3)])

# Pairwise cosine similarity between group mean vectors, labelled
# "<outcome>_<source>_<affiliation>" on both axes.
cos_sim_matrix <- coop::cosine(t(feature_mat))
rownames(cos_sim_matrix) <- apply(register_means[, 1:3], 1, paste, collapse = "_")
colnames(cos_sim_matrix) <- rownames(cos_sim_matrix)
|
|
||||||
|
|
||||||
#finding the most dissimilar pairs
|
|
||||||
|
|
||||||
#' Report the features on which two group mean vectors differ most
#'
#' Selects two rows of `feature_mat`, takes their element-wise absolute
#' difference, and tabulates the `top_n` largest gaps. The table is printed
#' and also returned invisibly.
#'
#' @param pair1,pair2 Either a label found in the row (`pair1`) / column
#'   (`pair2`) names of `cos_sim_matrix`, or a row index into `feature_mat`.
#' @param cos_sim_matrix Matrix whose dimnames label the rows of
#'   `feature_mat`; only its dimnames are read.
#' @param feature_mat Numeric matrix with one row per group. Columns are
#'   expected to be named "V1", "V2", ...; unnamed columns are labelled that
#'   way by position.
#' @param normalized_cols Character vector where element k is the descriptive
#'   name of feature "Vk".
#' @param top_n Maximum number of features to report. Values larger than the
#'   number of features return every feature.
#' @return Invisibly, a data frame with columns `feature`,
#'   `normalized_colname`, `vec1_value`, `vec2_value`, `abs_difference`, and
#'   `larger_in` (label of the group with the larger value), ordered by
#'   decreasing `abs_difference`.
compare_feature_vectors <- function(
    pair1, pair2,
    cos_sim_matrix,
    feature_mat,
    normalized_cols,
    top_n = 5
) {
  # Turn a label into its position, failing loudly when the label is missing
  # or ambiguous instead of silently selecting zero or several rows.
  resolve_index <- function(pair, labels, arg_name) {
    if (!is.character(pair)) {
      return(pair)
    }
    idx <- which(labels == pair)
    if (length(idx) != 1L) {
      stop(
        "`", arg_name, "` = \"", pair, "\" matches ", length(idx),
        " labels in `cos_sim_matrix`; expected exactly one.",
        call. = FALSE
      )
    }
    idx
  }

  row_idx <- resolve_index(pair1, rownames(cos_sim_matrix), "pair1")
  col_idx <- resolve_index(pair2, colnames(cos_sim_matrix), "pair2")

  vec1 <- feature_mat[row_idx, ]
  vec2 <- feature_mat[col_idx, ]

  # Feature-wise absolute differences.
  feature_diff <- abs(vec1 - vec2)
  if (is.null(names(feature_diff))) {
    # Unnamed columns: fall back to positional "V<k>" labels so the mapping
    # to `normalized_cols` below still works.
    names(feature_diff) <- paste0("V", seq_along(feature_diff))
  }

  # seq_len() avoids the NA rows `1:top_n` yields when top_n exceeds the
  # number of features, and the c(1, 0) surprise when top_n is 0.
  n_keep <- max(0, min(top_n, length(feature_diff)))
  top_features_idx <- order(feature_diff, decreasing = TRUE)[seq_len(n_keep)]
  top_features <- names(feature_diff)[top_features_idx]
  top_diffs <- feature_diff[top_features_idx]

  # Map "V<k>" back to the k-th descriptive column name.
  feature_nums <- as.integer(sub("V", "", top_features))
  feature_colnames <- normalized_cols[feature_nums]

  # Which of the two groups has the larger value for each reported feature.
  larger_in <- ifelse(
    vec1[top_features_idx] > vec2[top_features_idx],
    rownames(cos_sim_matrix)[row_idx],
    colnames(cos_sim_matrix)[col_idx]
  )

  top_features_df <- data.frame(
    feature = top_features,
    normalized_colname = feature_colnames,
    vec1_value = vec1[top_features_idx],
    vec2_value = vec2[top_features_idx],
    abs_difference = top_diffs,
    larger_in = larger_in
  )

  cat(
    "Comparing:", rownames(cos_sim_matrix)[row_idx],
    "and", colnames(cos_sim_matrix)[col_idx], "\n"
  )
  print(top_features_df)
  invisible(top_features_df)
}
|
|
||||||
|
|
||||||
# Ten largest feature gaps between the two named groups.
compare_feature_vectors(
  "resolved_c1_True", "resolved_c2_True",
  cos_sim_matrix, feature_mat, normalized_cols,
  top_n = 10
)

# Heatmap of the group-level cosine similarities ----

# Build the annotation frame pheatmap uses for one axis: one row per group,
# labelled with `labels`, carrying the group's source and a readable
# affiliation tag ("WMF" / "non-WMF") in place of the raw "True" / "False".
build_annotation <- function(labels) {
  annotation <- data.frame(
    affiliation = register_means$affiliation,
    source = register_means$source
  )
  rownames(annotation) <- labels
  annotation |>
    mutate(affil = case_when(
      affiliation == "True" ~ "WMF",
      affiliation == "False" ~ "non-WMF"
    )) |>
    select(-affiliation)
}

annotation_row <- build_annotation(rownames(cos_sim_matrix))
annotation_col <- build_annotation(colnames(cos_sim_matrix))

# Colours for the two annotation tracks.
my_annotation_colors <- list(
  affil = c("WMF" = "green", "non-WMF" = "purple"),
  source = c(c1 = "lightgrey", c2 = "grey", c3 = "black")
)

# Blank the lower triangle so each pair is drawn only once.
cos_sim_matrix[lower.tri(cos_sim_matrix)] <- NA
#pheatmap(scaled_mat, symm = TRUE)
#heatmap(cos_sim_matrix, col=heat.colors(256), breaks=seq(-1, 1, length.out=257))
library(viridis)
library(pheatmap)

# Rows and columns keep their existing order (no clustering); NA cells from
# the blanked triangle are drawn white.
fossy_heatmap <- pheatmap(
  cos_sim_matrix,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  scale = "none",
  annotation_row = annotation_row,
  annotation_col = annotation_col,
  annotation_colors = my_annotation_colors,
  na_col = "white"
)

#ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800)

#diag(cos_sim_matrix) <- NA
#which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Most similar
#which(cos_sim_matrix == min(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Least similar
|
|
Loading…
Reference in New Issue
Block a user