updating with gerrit information now
This commit is contained in:
parent
41de0cbc7a
commit
1c709f9a69
Before Width: | Height: | Size: 875 KiB After Width: | Height: | Size: 875 KiB |
Before Width: | Height: | Size: 750 KiB After Width: | Height: | Size: 750 KiB |
Before Width: | Height: | Size: 65 KiB After Width: | Height: | Size: 65 KiB |
@ -1,17 +0,0 @@
|
|||||||
1. SSH tunnel from your workstation using the following command:
|
|
||||||
|
|
||||||
ssh -N -L 8787:n3441:59491 mjilg@klone.hyak.uw.edu
|
|
||||||
|
|
||||||
and point your web browser to http://localhost:8787
|
|
||||||
|
|
||||||
2. log in to RStudio Server using the following credentials:
|
|
||||||
|
|
||||||
user: mjilg
|
|
||||||
password: QSTMw7+SdHBKq8hU9/1q
|
|
||||||
|
|
||||||
When done using RStudio Server, terminate the job by:
|
|
||||||
|
|
||||||
1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
|
|
||||||
2. Issue the following command on the login node:
|
|
||||||
|
|
||||||
scancel -f 27851458
|
|
14
p2/gerrit_script.R
Normal file
14
p2/gerrit_script.R
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Flag Gerrit-linked messages whose fetched results look incomplete.
library(tidyverse)
library(dplyr)
library(stringr)

# Load the filled Gerrit data frame from its CSV export.
gerrit_csv <- "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/080425_gerrit_filled_df.csv"
gerrit_df <- read.csv(file = gerrit_csv, header = TRUE)

# Keep rows whose `gerrit_change_urls` value is not the literal string "[]".
messages_with_urls <- dplyr::filter(gerrit_df, gerrit_change_urls != "[]")

# Rows where `gerrit_full_results` contains a `'full_result': None}` entry.
incomplete_data <- dplyr::filter(
  messages_with_urls,
  str_detect(gerrit_full_results, "'full_result': None\\}")
)

# Looser check: rows where `gerrit_full_results` mentions "None" anywhere.
maybe_incomplete_data <- dplyr::filter(
  messages_with_urls,
  str_detect(gerrit_full_results, "None")
)
|
12
p2/p2_EDA/080425_population_EDA.R
Normal file
12
p2/p2_EDA/080425_population_EDA.R
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# Population EDA: tabulate author affiliation over distinct authors.
library(tidyverse)

# Load the master discussion data.
neurobiber_csv <- "~/p2/071425_master_discussion_data.csv"
neurobiber_df <- read.csv(file = neurobiber_csv, header = TRUE)

# Distinct author identifiers, and the first row seen for each of them.
unique_authors <- unique(neurobiber_df$AuthorPHID)
unique_authors_df <- neurobiber_df[
  which(!duplicated(neurobiber_df$AuthorPHID)),
]

# Affiliation counts across all distinct authors.
table(unique_authors_df$AuthorWMFAffil)

# Same tabulation, restricted to authors whose retained row is a
# task description.
task_description_unique_authors <- dplyr::filter(
  unique_authors_df,
  comment_type == "task_description"
)
table(task_description_unique_authors$AuthorWMFAffil)
|
@ -43,7 +43,59 @@ cos_sim_matrix <- coop::cosine(t(feature_mat))
|
|||||||
rownames(cos_sim_matrix) <- apply(register_means[, 1:3], 1, paste, collapse = "_")
|
rownames(cos_sim_matrix) <- apply(register_means[, 1:3], 1, paste, collapse = "_")
|
||||||
colnames(cos_sim_matrix) <- rownames(cos_sim_matrix)
|
colnames(cos_sim_matrix) <- rownames(cos_sim_matrix)
|
||||||
|
|
||||||
|
# Inspect which features differ most between a pair of groups
|
||||||
|
|
||||||
|
#' Report the features that differ most between two rows of a feature matrix
#'
#' Looks up two groups (by label or by index), takes the absolute
#' feature-wise difference of their rows in `feature_mat`, and reports the
#' `top_n` features with the largest difference.
#'
#' @param pair1 Row label (character) or row index of the first group.
#'   Labels are matched against `rownames(cos_sim_matrix)`.
#' @param pair2 Column label (character) or column index of the second
#'   group. Labels are matched against `colnames(cos_sim_matrix)`.
#' @param cos_sim_matrix Matrix whose dimnames label the groups. Only its
#'   row and column names are used here.
#' @param feature_mat Matrix or data frame of features with one row per
#'   group. NOTE(review): rows are assumed to be in the same order as the
#'   rows/columns of `cos_sim_matrix` -- confirm against the caller.
#' @param normalized_cols Character vector of readable feature names; a
#'   feature called `V<k>` is mapped to `normalized_cols[k]`.
#' @param top_n Number of features to report. Values larger than the number
#'   of features are capped; values below one yield an empty result.
#' @return Invisibly, a data frame with columns `feature`,
#'   `normalized_colname`, `vec1_value`, `vec2_value`, `abs_difference` and
#'   `larger_in`. The comparison is also printed.
compare_feature_vectors <- function(
  pair1, pair2,
  cos_sim_matrix,
  feature_mat,
  normalized_cols,
  top_n = 5
) {
  # Turn a label into an index; anything that is not character is treated as
  # an index already. Fail loudly instead of carrying an empty or ambiguous
  # match into the arithmetic below.
  resolve_index <- function(pair, labels, what) {
    if (!is.character(pair)) {
      return(pair)
    }
    idx <- which(labels == pair)
    if (length(idx) == 0L) {
      stop(what, " '", paste(pair, collapse = ", "), "' not found in ",
           "cos_sim_matrix", call. = FALSE)
    }
    if (length(idx) > 1L) {
      stop(what, " '", paste(pair, collapse = ", "), "' matches more than ",
           "one entry of cos_sim_matrix", call. = FALSE)
    }
    idx
  }

  # Extract one row as a plain named vector, whether `feature_mat` is a
  # matrix or a data frame (a data-frame row would otherwise stay a list).
  row_vector <- function(i) {
    one_row <- feature_mat[i, , drop = FALSE]
    stats::setNames(
      as.vector(unlist(one_row, use.names = FALSE)),
      colnames(feature_mat)
    )
  }

  row_idx <- resolve_index(pair1, rownames(cos_sim_matrix), "pair1")
  col_idx <- resolve_index(pair2, colnames(cos_sim_matrix), "pair2")

  # Get feature vectors
  vec1 <- row_vector(row_idx)
  vec2 <- row_vector(col_idx)

  # Feature-wise absolute differences, largest first. `seq_len()` on a capped
  # count avoids the NA padding of `[1:top_n]` when top_n exceeds the number
  # of features, and the `1:0` trap when top_n is zero.
  feature_diff <- abs(vec1 - vec2)
  n_top <- max(0, min(top_n, length(feature_diff)))
  top_features_idx <- order(feature_diff, decreasing = TRUE)[seq_len(n_top)]
  top_features <- names(feature_diff)[top_features_idx]
  top_diffs <- feature_diff[top_features_idx]

  # Map Vxx to normalized column names
  feature_nums <- as.integer(sub("V", "", top_features))
  feature_colnames <- normalized_cols[feature_nums]

  # Determine which group holds the larger value for each feature; ties are
  # attributed to the second group, as the comparison is strict.
  larger_in <- ifelse(
    vec1[top_features_idx] > vec2[top_features_idx],
    rownames(cos_sim_matrix)[row_idx],
    colnames(cos_sim_matrix)[col_idx]
  )

  # Assemble results
  top_features_df <- data.frame(
    feature = top_features,
    normalized_colname = feature_colnames,
    vec1_value = vec1[top_features_idx],
    vec2_value = vec2[top_features_idx],
    abs_difference = top_diffs,
    larger_in = larger_in
  )

  # Print pair and return
  cat("Comparing:", rownames(cos_sim_matrix)[row_idx], "and", colnames(cos_sim_matrix)[col_idx], "\n")
  print(top_features_df)
  invisible(top_features_df)
}
|
||||||
|
|
||||||
|
# Print the ten features that differ most between the two named groups.
compare_feature_vectors(
  pair1 = "resolved_c1_True",
  pair2 = "resolved_c2_True",
  cos_sim_matrix = cos_sim_matrix,
  feature_mat = feature_mat,
  normalized_cols = normalized_cols,
  top_n = 10
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Plotting code below
|
||||||
annotation_row <- data.frame(
|
annotation_row <- data.frame(
|
||||||
affiliation = register_means$affiliation,
|
affiliation = register_means$affiliation,
|
||||||
source = register_means$source
|
source = register_means$source
|
||||||
@ -88,7 +140,7 @@ fossy_heatmap <- pheatmap(cos_sim_matrix,
|
|||||||
annotation_colors = my_annotation_colors,
|
annotation_colors = my_annotation_colors,
|
||||||
na_col = "white")
|
na_col = "white")
|
||||||
|
|
||||||
ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800)
|
#ggsave(filename = "073125_FOSSY_comm_heatmap.png", plot = fossy_heatmap, width = 9, height = 9, dpi = 800)
|
||||||
|
|
||||||
#diag(cos_sim_matrix) <- NA
|
#diag(cos_sim_matrix) <- NA
|
||||||
#which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Most similar
|
#which(cos_sim_matrix == max(cos_sim_matrix, na.rm = TRUE), arr.ind = TRUE) # Most similar
|
||||||
|
Loading…
Reference in New Issue
Block a user