diff --git a/073125-fossy-tasks-resolved.png b/artifact-figures/073125-fossy-tasks-resolved.png similarity index 100% rename from 073125-fossy-tasks-resolved.png rename to artifact-figures/073125-fossy-tasks-resolved.png diff --git a/073125_FOSSY_comm_heatmap.png b/artifact-figures/073125_FOSSY_comm_heatmap.png similarity index 100% rename from 073125_FOSSY_comm_heatmap.png rename to artifact-figures/073125_FOSSY_comm_heatmap.png diff --git a/affiliation_heatmap_fossy_plot.png b/artifact-figures/affiliation_heatmap_fossy_plot.png similarity index 100% rename from affiliation_heatmap_fossy_plot.png rename to artifact-figures/affiliation_heatmap_fossy_plot.png diff --git a/mgaughan-rstudio-server_27851458.out b/mgaughan-rstudio-server_27851458.out deleted file mode 100644 index 79aae66..0000000 --- a/mgaughan-rstudio-server_27851458.out +++ /dev/null @@ -1,17 +0,0 @@ -1. SSH tunnel from your workstation using the following command: - - ssh -N -L 8787:n3441:59491 mjilg@klone.hyak.uw.edu - - and point your web browser to http://localhost:8787 - -2. log in to RStudio Server using the following credentials: - - user: mjilg - password: QSTMw7+SdHBKq8hU9/1q - -When done using RStudio Server, terminate the job by: - -1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) -2. 
Issue the following command on the login node: - - scancel -f 27851458 diff --git a/p2/gerrit_script.R b/p2/gerrit_script.R new file mode 100644 index 0000000..4d19aa2 --- /dev/null +++ b/p2/gerrit_script.R @@ -0,0 +1,14 @@ +library(tidyverse) + +gerrit_csv <-"/gscratch/comdata/users/mjilg/mw-repo-lifecycles/080425_gerrit_filled_df.csv" +gerrit_df <- read.csv(gerrit_csv , header = TRUE) + +library(dplyr) +messages_with_urls <- gerrit_df |> + dplyr::filter(`gerrit_change_urls` != "[]") +library(stringr) +incomplete_data <- messages_with_urls |> + dplyr::filter(str_detect(gerrit_full_results, "'full_result': None\\}")) + +maybe_incomplete_data <- messages_with_urls |> + dplyr::filter(str_detect(gerrit_full_results, "None")) diff --git a/p2/p2_EDA/080425_population_EDA.R b/p2/p2_EDA/080425_population_EDA.R new file mode 100644 index 0000000..be5925a --- /dev/null +++ b/p2/p2_EDA/080425_population_EDA.R @@ -0,0 +1,12 @@ +library(tidyverse) + +neurobiber_csv <-"~/p2/071425_master_discussion_data.csv" +neurobiber_df <- read.csv(neurobiber_csv , header = TRUE) + +unique_authors <- unique(neurobiber_df$AuthorPHID) +unique_authors_df <- neurobiber_df[!duplicated(neurobiber_df$AuthorPHID), ] +table(unique_authors_df$AuthorWMFAffil) + +task_description_unique_authors <- unique_authors_df |> + filter(comment_type == "task_description") +table(task_description_unique_authors$AuthorWMFAffil) diff --git a/p2/quest/neurobiber_cosine.R b/p2/quest/neurobiber_cosine.R index 15cb004..ff7badf 100644 --- a/p2/quest/neurobiber_cosine.R +++ b/p2/quest/neurobiber_cosine.R @@ -43,7 +43,92 @@ cos_sim_matrix <- coop::cosine(t(feature_mat)) rownames(cos_sim_matrix) <- apply(register_means[, 1:3], 1, paste, collapse = "_") colnames(cos_sim_matrix) <- rownames(cos_sim_matrix)
+#finding the most dissimilar pairs
+#' Compare two rows of the feature matrix, feature by feature
+#'
+#' Ranks features by the absolute difference between two rows of
+#' `feature_mat` and prints/returns the `top_n` largest ones.
+#'
+#' @param pair1,pair2 A row (`pair1`) / column (`pair2`) name of
+#'   `cos_sim_matrix`, or a numeric index that is used as-is.
+#' @param cos_sim_matrix Matrix whose dimnames resolve names and label output.
+#' @param feature_mat Matrix-like object indexed as `feature_mat[idx, ]`; its
+#'   column names must look like `V<k>`.
+#'   NOTE(review): assumes rows are ordered like `cos_sim_matrix` -- confirm.
+#' @param normalized_cols Vector indexed by the `<k>` parsed from `V<k>`.
+#' @param top_n Maximum number of features to report.
+#' @return Invisibly, a data frame with one row per reported feature.
+compare_feature_vectors <- function(
+  pair1, pair2,
+  cos_sim_matrix,
+  feature_mat,
+  normalized_cols,
+  top_n = 5
+) {
+  # Resolve a name to its position; unknown names now fail loudly instead of
+  # passing an empty index on to the subsetting below.
+  resolve_idx <- function(pair, labels, what) {
+    if (!is.character(pair)) {
+      return(pair)
+    }
+    idx <- match(pair, labels)
+    if (length(idx) != 1L || is.na(idx)) {
+      stop("`", pair, "` is not a ", what, " name of `cos_sim_matrix`",
+           call. = FALSE)
+    }
+    idx
+  }
+  row_idx <- resolve_idx(pair1, rownames(cos_sim_matrix), "row")
+  col_idx <- resolve_idx(pair2, colnames(cos_sim_matrix), "column")
+  label1 <- rownames(cos_sim_matrix)[row_idx]
+  label2 <- colnames(cos_sim_matrix)[col_idx]
+
+  # Get feature vectors
+  vec1 <- feature_mat[row_idx, ]
+  vec2 <- feature_mat[col_idx, ]
+
+  # Feature-wise absolute differences
+  feature_diff <- abs(vec1 - vec2)
+  if (is.null(names(feature_diff))) {
+    stop("`feature_mat` needs column names of the form V<k>", call. = FALSE)
+  }
+  # seq_len() keeps this valid when top_n is 0 or exceeds the feature count;
+  # 1:top_n would give c(1, 0) or NA indices there.
+  n_top <- max(0, min(top_n, length(feature_diff)))
+  top_features_idx <- order(feature_diff, decreasing = TRUE)[seq_len(n_top)]
+  top_features <- names(feature_diff)[top_features_idx]
+  top_diffs <- feature_diff[top_features_idx]
+
+  # Map Vxx to normalized column names (strip only a leading V)
+  feature_nums <- as.integer(sub("^V", "", top_features))
+  feature_colnames <- normalized_cols[feature_nums]
+
+  # Determine which vector is larger for each feature (ties go to pair2)
+  larger_in <- ifelse(vec1[top_features_idx] > vec2[top_features_idx],
+                      label1, label2)
+
+  # Assemble results
+  top_features_df <- data.frame(
+    feature = top_features,
+    normalized_colname = feature_colnames,
+    vec1_value = vec1[top_features_idx],
+    vec2_value = vec2[top_features_idx],
+    abs_difference = top_diffs,
+    larger_in = larger_in
+  )
+
+  # Print pair and return
+  cat("Comparing:", label1, "and", label2, "\n")
+  print(top_features_df)
+  invisible(top_features_df)
+}
+ +compare_feature_vectors("resolved_c1_True", "resolved_c2_True", cos_sim_matrix, feature_mat, normalized_cols, top_n = 10) + + + +#plotting stuff beneath here annotation_row <- data.frame( affiliation = register_means$affiliation, source = register_means$source @@ -88,7 +173,7 @@ fossy_heatmap <- pheatmap(cos_sim_matrix, annotation_colors = my_annotation_colors, na_col = "white") -ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800) +#ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800) #diag(cos_sim_matrix) <- NA #which(cos_sim_matrix == 
max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Most similar