diff --git a/mgaughan-rstudio-server_28911380.out b/mgaughan-rstudio-server_28911380.out new file mode 100644 index 0000000..c7b1110 --- /dev/null +++ b/mgaughan-rstudio-server_28911380.out @@ -0,0 +1,18 @@ +1. SSH tunnel from your workstation using the following command: + + ssh -N -L 8787:n3441:47269 mjilg@klone.hyak.uw.edu + + and point your web browser to http://localhost:8787 + +2. log in to RStudio Server using the following credentials: + + user: mjilg + password: 9Qgk9UkRdmKalTKyDmH4 + +When done using RStudio Server, terminate the job by: + +1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) +2. Issue the following command on the login node: + + scancel -f 28911380 +[2025-09-05T14:55:26.103] error: *** JOB 28911380 ON n3441 CANCELLED AT 2025-09-05T14:55:26 DUE TO TIME LIMIT *** diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R new file mode 100644 index 0000000..b547d17 --- /dev/null +++ b/p2/quest/neurobiber_PCA_analysis.R @@ -0,0 +1,74 @@ +library(tidyverse) + +neurobiber_description_pca_csv <-"~/p2/quest/090425_description_PCA_df.csv" +neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) + +neurobiber_subcomment_pca_csv <-"~/p2/quest/090425_subcomment_PCA_df.csv" +neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) + + +# look at correlation between PC1, PC2, and different outcome variables +library(dplyr) +description_anova_results <- neurobiber_description_pca_df %>% + group_by(source) %>% + group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE) +description_anova_results + +discussion_anova_results <- neurobiber_subcomment_pca_df %>% + group_by(source) %>% + group_map(~ summary(aov(PC2 ~ phase, data = .x)), .keep = TRUE) +discussion_anova_results + +# look at the representative comments for PC1 and PC2 +top5 <- neurobiber_subcomment_pca_df %>% + filter(source=="c2") |> + arrange(desc(PC2)) %>% + slice(15:30) %>% + pull(text) + +bottom5 <- neurobiber_subcomment_pca_df %>% + filter(source=="c2") |> + arrange(PC2) %>% + slice(15:30) %>% + pull(text) + +cat("Top 15:30 comment_text by score:\n") +print(top5) + +cat("\nBottom 15:30 comment_text by score:\n") +print(bottom5) + + +aggregated_neurobiber_description_pca_df <- neurobiber_description_pca_df |> + group_by(AuthorWMFAffil, week_index, source, priority) %>% + summarise(mean_PC1 = median(PC1), + mean_PC2 = median(PC2), + mean_PC3 = median(PC3), + mean_PC4 = median(PC4), + mean_PC5 = median(PC5)) +library(scales) +library(ggplot2) + + +affiliationColors <- + setNames( c('#5da2d8', '#c7756a') + ,c("False", "True")) + + +long_df <- aggregated_neurobiber_description_pca_df %>% + tidyr::pivot_longer( + cols = starts_with("mean_PC"), + names_to = "PC", + values_to = "PC_value" + ) + +ggplot(long_df, aes(x = week_index, y = PC_value, color = AuthorWMFAffil, group = AuthorWMFAffil)) + + geom_line(size = 1) + + facet_grid(PC ~ source, scales = "free_y") + + scale_color_manual(values = affiliationColors, name = "WMF Affiliation") + + scale_x_continuous(breaks = pretty_breaks()) + + scale_y_continuous(limits = c(-10, 10)) + + labs(x = "Week Index", y = "Mean PC Value", + title = "Weekly Median PC Values by Source and PC, Colored by WMF Affiliation") + + theme_minimal(base_size = 14) + + theme(legend.position = "top") \ No newline at end of file diff --git a/p2/quest/neurobiber_cosine.R b/p2/quest/neurobiber_cosine.R deleted file mode 100644 index ff7badf..0000000 --- a/p2/quest/neurobiber_cosine.R +++ /dev/null @@ -1,147 +0,0 @@ -library(tidyverse) - -neurobiber_csv <-"~/p2/quest/072525_pp_biberplus_labels.csv" -neurobiber_df <- read.csv(neurobiber_csv , header = TRUE) - -normalized_cols <- grep("^normalized_", names(neurobiber_df), value = TRUE) - -neurobiber_df$normalized_features_vec <- lapply( - asplit(neurobiber_df[, normalized_cols], 1), as.numeric -) -library(dplyr) -# duplicate, declined, invalid -> declined -# stalled, open, progress -> open -# resolved -> resolved -neurobiber_df <- neurobiber_df |> - filter(comment_type == "task_description") |> - mutate( - task_status = case_when( - status %in% c("duplicate", "declined", "invalid") ~ "declined", - status %in% c("stalled", "open", "progress") ~ "open", - status == "resolved" ~ "resolved", - TRUE ~ status # fallback for unexpected values - )) - -X <- do.call(rbind, neurobiber_df$normalized_features_vec) - -library(coop) -#cos_sim1 <- coop::cosine(t(X)) - - -register_means <- aggregate( - X, - by = list( - outcome= neurobiber_df$task_status, - source = neurobiber_df$source, - affiliation = neurobiber_df$AuthorWMFAffil - ), - FUN = mean -) - -feature_mat <- as.matrix(register_means[, -(1:3)]) -cos_sim_matrix <- coop::cosine(t(feature_mat)) -rownames(cos_sim_matrix) <- apply(register_means[, 1:3], 1, paste, collapse = "_") -colnames(cos_sim_matrix) <- rownames(cos_sim_matrix) - -#finding the most dissimilar pairs - -compare_feature_vectors <- function( - pair1, pair2, - cos_sim_matrix, - feature_mat, - normalized_cols, - top_n = 5 -) { - # Allow for both index and name input - if (is.character(pair1)) row_idx <- which(rownames(cos_sim_matrix) == pair1) else row_idx <- pair1 - if (is.character(pair2)) col_idx <- which(colnames(cos_sim_matrix) == pair2) else col_idx <- pair2 - - # Get feature vectors - vec1 <- feature_mat[row_idx, ] - vec2 <- feature_mat[col_idx, ] - - # Feature-wise absolute differences - feature_diff <- abs(vec1 - vec2) - top_features_idx <- order(feature_diff, decreasing = TRUE)[1:top_n] - top_features <- names(feature_diff)[top_features_idx] - top_diffs <- feature_diff[top_features_idx] - - # Map Vxx to normalized column names - feature_nums <- as.integer(sub("V", "", top_features)) - feature_colnames <- normalized_cols[feature_nums] - - # Determine which vector is larger for each feature - larger_in <- ifelse(vec1[top_features_idx] > vec2[top_features_idx], - rownames(cos_sim_matrix)[row_idx], - colnames(cos_sim_matrix)[col_idx]) - - # Assemble results - top_features_df <- data.frame( - feature = top_features, - normalized_colname = feature_colnames, - vec1_value = vec1[top_features_idx], - vec2_value = vec2[top_features_idx], - abs_difference = top_diffs, - larger_in = larger_in - ) - - # Print pair and return - cat("Comparing:", rownames(cos_sim_matrix)[row_idx], "and", colnames(cos_sim_matrix)[col_idx], "\n") - print(top_features_df) - invisible(top_features_df) -} - -compare_feature_vectors("resolved_c1_True", "resolved_c2_True", cos_sim_matrix, feature_mat, normalized_cols, top_n = 10) - - - -#plotting stuff beneath here -annotation_row <- data.frame( - affiliation = register_means$affiliation, - source = register_means$source -) -rownames(annotation_row) <- rownames(cos_sim_matrix) - -annotation_col <- data.frame( - affiliation = register_means$affiliation, - source = register_means$source -) -rownames(annotation_col) <- colnames(cos_sim_matrix) - -annotation_row <- annotation_row |> - mutate(affil = case_when( - affiliation == "True" ~ "WMF", - affiliation == "False" ~ "non-WMF" - )) |> select(-affiliation) - -annotation_col <- annotation_col |> - mutate(affil = case_when( - affiliation == "True" ~ "WMF", - affiliation == "False" ~ "non-WMF" - )) |> select(-affiliation) - - -my_annotation_colors = list( - affil = c("WMF" = "green", "non-WMF" = "purple"), - source = c(c1 = "lightgrey", c2 = "grey", c3 = "black") -) - -cos_sim_matrix[lower.tri(cos_sim_matrix)] <- NA -#pheatmap(scaled_mat, symm = TRUE) -#heatmap(cos_sim_matrix, col=heat.colors(256), breaks=seq(-1, 1, length.out=257)) -library(viridis) -library(pheatmap) -fossy_heatmap <- pheatmap(cos_sim_matrix, - cluster_rows = FALSE, - cluster_cols = FALSE, - scale='none', - annotation_row = annotation_row, - annotation_col = annotation_col, - annotation_colors = my_annotation_colors, - na_col = "white") - -#ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800) - -#diag(cos_sim_matrix) <- NA -#which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Most similar -#which(cos_sim_matrix == min(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Least similar