1
0

updating with gerrit information now

This commit is contained in:
Matthew Gaughan 2025-08-07 19:03:20 -07:00
parent 41de0cbc7a
commit 1c709f9a69
7 changed files with 79 additions and 18 deletions

View File

Before

Width:  |  Height:  |  Size: 875 KiB

After

Width:  |  Height:  |  Size: 875 KiB

View File

Before

Width:  |  Height:  |  Size: 750 KiB

After

Width:  |  Height:  |  Size: 750 KiB

View File

Before

Width:  |  Height:  |  Size: 65 KiB

After

Width:  |  Height:  |  Size: 65 KiB

View File

@ -1,17 +0,0 @@
1. SSH tunnel from your workstation using the following command:
ssh -N -L 8787:n3441:59491 mjilg@klone.hyak.uw.edu
and point your web browser to http://localhost:8787
2. log in to RStudio Server using the following credentials:
user: mjilg
password: QSTMw7+SdHBKq8hU9/1q
When done using RStudio Server, terminate the job by:
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
2. Issue the following command on the login node:
scancel -f 27851458

14
p2/gerrit_script.R Normal file
View File

@ -0,0 +1,14 @@
# Load the Gerrit-enriched export and pick out rows whose Gerrit lookup
# results look incomplete.
library(tidyverse)
library(dplyr)
library(stringr)

gerrit_csv <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/080425_gerrit_filled_df.csv"
gerrit_df <- read.csv(gerrit_csv, header = TRUE)

# Keep rows whose gerrit_change_urls field is anything other than the literal
# "[]" (presumably an empty list serialized as text -- confirm upstream).
messages_with_urls <- dplyr::filter(gerrit_df, gerrit_change_urls != "[]")

# Strict check: the serialized results contain a 'full_result' entry that is
# None immediately before a closing brace.
incomplete_data <- dplyr::filter(
  messages_with_urls,
  stringr::str_detect(gerrit_full_results, "'full_result': None\\}")
)

# Loose check: "None" appears anywhere in the serialized results.
maybe_incomplete_data <- dplyr::filter(
  messages_with_urls,
  stringr::str_detect(gerrit_full_results, "None")
)

View File

@ -0,0 +1,12 @@
# Tabulate AuthorWMFAffil over distinct authors in the discussion data.
library(tidyverse)

neurobiber_csv <- "~/p2/071425_master_discussion_data.csv"
neurobiber_df <- read.csv(neurobiber_csv, header = TRUE)

# Distinct author identifiers.
unique_authors <- unique(neurobiber_df$AuthorPHID)

# One row per author: the first row in which each AuthorPHID appears.
unique_authors_df <- neurobiber_df[
  !duplicated(neurobiber_df$AuthorPHID),
]
table(unique_authors_df$AuthorWMFAffil)

# Same tabulation, limited to authors whose retained row is a task description.
task_description_unique_authors <- dplyr::filter(
  unique_authors_df,
  comment_type == "task_description"
)
table(task_description_unique_authors$AuthorWMFAffil)

View File

@ -43,7 +43,59 @@ cos_sim_matrix <- coop::cosine(t(feature_mat))
rownames(cos_sim_matrix) <- apply(register_means[, 1:3], 1, paste, collapse = "_") rownames(cos_sim_matrix) <- apply(register_means[, 1:3], 1, paste, collapse = "_")
colnames(cos_sim_matrix) <- rownames(cos_sim_matrix) colnames(cos_sim_matrix) <- rownames(cos_sim_matrix)
#finding the most dissimilar pairs
#' Show the features on which two feature vectors differ the most
#'
#' Looks up two rows of `feature_mat`, ranks the features by absolute
#' difference, prints the `top_n` largest and returns them invisibly.
#'
#' @param pair1,pair2 Either a single label (matched against the row names /
#'   column names of `cos_sim_matrix`, respectively) or a numeric index.
#' @param cos_sim_matrix Matrix whose row and column names label the vectors.
#'   The resolved positions are also used to index rows of `feature_mat`, so
#'   both objects must share the same ordering.
#' @param feature_mat Matrix with one feature vector per row. Column names are
#'   expected to look like "V<number>".
#' @param normalized_cols Vector of readable feature names, indexed by the
#'   number that follows "V" in the `feature_mat` column names.
#' @param top_n Maximum number of features to report. Values larger than the
#'   number of available features are clamped.
#' @return Invisibly, a data frame with columns `feature`,
#'   `normalized_colname`, `vec1_value`, `vec2_value`, `abs_difference` and
#'   `larger_in`, ordered by decreasing absolute difference.
compare_feature_vectors <- function(
    pair1, pair2,
    cos_sim_matrix,
    feature_mat,
    normalized_cols,
    top_n = 5
) {
  # Translate a label into its position; non-character input is used as-is.
  # Fails loudly when the label is missing or ambiguous instead of letting an
  # empty or multi-row selection propagate.
  resolve_index <- function(pair, labels, arg_name) {
    if (!is.character(pair)) {
      return(pair)
    }
    idx <- which(labels %in% pair)
    if (length(pair) != 1L || length(idx) != 1L) {
      stop(
        sprintf(
          "%s = \"%s\" matches %d entries of cos_sim_matrix; expected exactly 1",
          arg_name, paste(pair, collapse = ", "), length(idx)
        ),
        call. = FALSE
      )
    }
    idx
  }

  row_idx <- resolve_index(pair1, rownames(cos_sim_matrix), "pair1")
  col_idx <- resolve_index(pair2, colnames(cos_sim_matrix), "pair2")

  # Get feature vectors
  vec1 <- feature_mat[row_idx, ]
  vec2 <- feature_mat[col_idx, ]

  # Feature-wise absolute differences, largest first. The count is clamped so
  # that asking for more features than exist does not produce NA rows, and
  # seq_len() keeps top_n = 0 from selecting c(1, 0).
  feature_diff <- abs(vec1 - vec2)
  n_keep <- max(0L, min(as.integer(top_n), length(feature_diff)))
  top_features_idx <- order(feature_diff, decreasing = TRUE)[seq_len(n_keep)]
  top_features <- names(feature_diff)[top_features_idx]
  top_diffs <- feature_diff[top_features_idx]

  # Map Vxx to normalized column names
  feature_nums <- as.integer(sub("^V", "", top_features))
  feature_colnames <- normalized_cols[feature_nums]

  # Label of whichever vector holds the larger value for each feature
  # (ties are attributed to the second vector).
  larger_in <- ifelse(
    vec1[top_features_idx] > vec2[top_features_idx],
    rownames(cos_sim_matrix)[row_idx],
    colnames(cos_sim_matrix)[col_idx]
  )

  # Assemble results
  top_features_df <- data.frame(
    feature = top_features,
    normalized_colname = feature_colnames,
    vec1_value = vec1[top_features_idx],
    vec2_value = vec2[top_features_idx],
    abs_difference = top_diffs,
    larger_in = larger_in
  )

  # Print pair and return
  cat(
    "Comparing:", rownames(cos_sim_matrix)[row_idx],
    "and", colnames(cos_sim_matrix)[col_idx], "\n"
  )
  print(top_features_df)
  invisible(top_features_df)
}
# Report the ten largest feature differences between the vectors labelled
# "resolved_c1_True" and "resolved_c2_True".
compare_feature_vectors(
  pair1 = "resolved_c1_True",
  pair2 = "resolved_c2_True",
  cos_sim_matrix = cos_sim_matrix,
  feature_mat = feature_mat,
  normalized_cols = normalized_cols,
  top_n = 10
)
#plotting stuff beneath here
annotation_row <- data.frame( annotation_row <- data.frame(
affiliation = register_means$affiliation, affiliation = register_means$affiliation,
source = register_means$source source = register_means$source
@ -88,7 +140,7 @@ fossy_heatmap <- pheatmap(cos_sim_matrix,
annotation_colors = my_annotation_colors, annotation_colors = my_annotation_colors,
na_col = "white") na_col = "white")
ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800) #ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800)
#diag(cos_sim_matrix) <- NA #diag(cos_sim_matrix) <- NA
#which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Most similar #which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Most similar