# Patch summary (commentary only; this text sits before the first
# `diff --git` header and is not part of any hunk).
#
# NOTE(review): in this copy the patch's line breaks appear to have been
# collapsed -- file headers, hunk markers and +/- lines share three physical
# lines, while e.g. the first hunk header (`@@ -0,0 +1,13 @@`) declares 13
# added lines. Presumably it will not apply as-is; restore the line breaks
# from the original commit before use -- TODO confirm.
#
# File 1: analysis_data/data_verification_2.R (new file, one hunk)
#   - Loads tidyverse, stringr, tidyr, dplyr and purrr.
#   - Reads "~/analysis_data/100625_unified_w_affil.csv" into `main_df`.
#   - `duplicates`: every row of `main_df` whose (comment_text, TaskPHID,
#     AuthorPHID) combination occurs more than once; the forward and
#     `fromLast = TRUE` duplicated() masks are OR-ed so that all copies,
#     including the first occurrence, are kept.
#   - `pulling`: rows of `main_df` whose `id` equals "24695" or "24696".
#
# File 2: p2/quest/neurobiber_PCA_analysis.R (modified, three hunks)
#   Hunk @@ -1,11 +1,26 @@
#   - Both PCA data frames now get a `comment_text` column copied from `text`.
#   - Reads the same unified CSV into `main_df` and keeps eight columns:
#     TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF,
#     isGerritBot, resolution_outcome, task_title.
#   - `description_joined` / `subcomment_joined`: `main_df` right-joined onto
#     the description / subcomment PCA frame by TaskPHID, AuthorPHID,
#     date_created and comment_text, then filtered to comment_text != "nan"
#     (the source itself marks this filter as a TODO).
#     NOTE(review): the "nan" literal presumably comes from how the CSVs were
#     exported -- confirm.
#   Hunk @@ -38,7 +53,7 @@
#   - The `pair_in_description` mutate (is the "AuthorPHID TaskPHID" pair also
#     present in neurobiber_description_pca_df?) now reads from and writes to
#     `subcomment_joined` instead of `neurobiber_subcomment_pca_df`.
#     NOTE(review): the joined frames are created in the first hunk, before
#     the `comment_type` / `cleaned_comment` assignments shown as context
#     here, so they presumably lack those columns -- confirm that nothing
#     downstream needs them.
#   Hunk @@ -81,31 +96,24 @@
#   - Adds `subcomment_joined_no_gerrit` (rows where isGerritBot != "TRUE").
#   - Removes `neurobiber_subcomment_pca_df_x` (left join of `priority` by
#     TaskPHID) and `neurobiber_description_pca_df_x` (priority filter).
#   - Replaces the plot: the data becomes `subcomment_joined_no_gerrit`
#     (the removed call named `neurobiber_descriptions_pca_df`, a spelling
#     not used elsewhere in this patch); x moves from PC4 to PC2; fill moves
#     from pair_in_description to isAuthorWMF; facets move from
#     source ~ phase to source ~ pair_in_description; point alpha goes from
#     0.1 to 0.3; geom_smooth() is dropped; axis limits widen from +/-5 to
#     +/-15; title, x label and fill label are updated to match.
diff --git a/analysis_data/data_verification_2.R b/analysis_data/data_verification_2.R new file mode 100644 index 0000000..c8eb5ac --- /dev/null +++ b/analysis_data/data_verification_2.R @@ -0,0 +1,13 @@ +library(tidyverse) +library(stringr) +library(tidyr) +library(dplyr) +library(purrr) + +main_csv <- "~/analysis_data/100625_unified_w_affil.csv" +main_df <- read.csv(main_csv, header = TRUE) + +duplicates <- main_df[duplicated(main_df[, c("comment_text", "TaskPHID", "AuthorPHID")]) | + duplicated(main_df[, c("comment_text", "TaskPHID", "AuthorPHID")], fromLast = TRUE), ] +pulling <- main_df |> + filter(id == "24695" | id == "24696") diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R index df7a7ec..8daf6ef 100644 --- a/p2/quest/neurobiber_PCA_analysis.R +++ b/p2/quest/neurobiber_PCA_analysis.R @@ -1,11 +1,26 @@ library(tidyverse) neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv" -neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) +neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text) neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv" -neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) +neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text) +main_csv <- "~/analysis_data/100625_unified_w_affil.csv" +main_df <- read.csv(main_csv , header = TRUE) + +main_df <- main_df |> + select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title) + +# Join main_df to neurobiber_description_pca_df +description_joined <- main_df |> + right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |> + filter(comment_text != "nan") #TODO: look at this more in depth + +# Join main_df to 
neurobiber_subcomment_pca_df +subcomment_joined <- main_df |> + right_join(neurobiber_subcomment_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |> + filter(comment_text != "nan") #TODO: look at this more in depth preprocess_comment <- function(message) { library(stringr) @@ -38,7 +53,7 @@ neurobiber_subcomment_pca_df$comment_type <- "subcomment" neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment) neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment) -neurobiber_subcomment_pca_df <- neurobiber_subcomment_pca_df %>% +subcomment_joined <- subcomment_joined %>% mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in% paste(neurobiber_description_pca_df$AuthorPHID, neurobiber_description_pca_df$TaskPHID))) @@ -81,31 +96,24 @@ affiliationColors <- setNames( c('#5da2d8', '#c7756a') ,c("False", "True")) +subcomment_joined_no_gerrit <- subcomment_joined |> + filter(isGerritBot != "TRUE") -neurobiber_subcomment_pca_df_x <- neurobiber_subcomment_pca_df %>% - left_join( - neurobiber_description_pca_df %>% - select(TaskPHID, priority), - by = "TaskPHID" - ) -neurobiber_description_pca_df_x <- neurobiber_description_pca_df |> - filter(priority %in% c("Lowest","Unbreak Now!")) #unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True")) #unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ] # geom_point(shape = 21, alpha=0.4, size=2) + # geom_bin_2d() + -ggplot(neurobiber_descriptions_pca_df, aes(x = PC4, y = PC1, fill = pair_in_description)) + - facet_grid(source ~ phase, scales="fixed") + - geom_point(shape = 21, alpha=0.1, size=2) + - geom_smooth() + - xlim(-5, 5) + - ylim(-5, 5) + - scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice +ggplot(subcomment_joined_no_gerrit, aes(x = PC2, y = PC1, fill = isAuthorWMF)) + + facet_grid(source ~ pair_in_description, scales="fixed") 
+ + geom_point(shape = 21, alpha=0.3, size=2) + + xlim(-15, 15) + + ylim(-15, 15) + + scale_fill_viridis_d() + theme_minimal() + labs( - title = "PCs for Task Comments (Faceted by Source and Phase)", - x = "PC4", + title = "PCs for Task Comments (Faceted by source and pair_in_description)", + x = "PC2", y = "PC1", - fill = "author_same_as_task_creator?" + fill = "isAuthorWMF?" )