backing up renewed PCA analysis

2025-10-08 14:55:31 -07:00 · 2025-10-08 14:55:31 -07:00 · 186a26f261
commit 186a26f261
parent 840b32a2e4
2 changed files with 42 additions and 21 deletions
--- a/analysis_data/data_verification_2.R
+++ b/analysis_data/data_verification_2.R
@@ -0,0 +1,13 @@
+library(tidyverse)
+library(stringr)
+library(tidyr)
+library(dplyr)
+library(purrr)
+
+main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+main_df <- read.csv(main_csv, header = TRUE) 
+
+duplicates <- main_df[duplicated(main_df[, c("comment_text", "TaskPHID", "AuthorPHID")]) |
+                   duplicated(main_df[, c("comment_text", "TaskPHID", "AuthorPHID")], fromLast = TRUE), ]
+pulling <- main_df |>
+  filter(id == "24695" | id == "24696")
--- a/p2/quest/neurobiber_PCA_analysis.R
+++ b/p2/quest/neurobiber_PCA_analysis.R
@@ -1,11 +1,26 @@
 library(tidyverse)

 neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv"
-neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE) 
+neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv  , header = TRUE)  |> mutate(comment_text = text)

 neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv"
-neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE)
+neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv  , header = TRUE) |> mutate(comment_text = text)

+main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
+main_df <- read.csv(main_csv  , header = TRUE)
+
+main_df <- main_df |>
+  select(TaskPHID, AuthorPHID, date_created, comment_text, isAuthorWMF, isGerritBot, resolution_outcome, task_title)
+
+# Join main_df to neurobiber_description_pca_df
+description_joined <- main_df |>
+  right_join(neurobiber_description_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
+  filter(comment_text != "nan") #TODO: look at this more in depth
+
+# Join main_df to neurobiber_subcomment_pca_df
+subcomment_joined <- main_df |>
+  right_join(neurobiber_subcomment_pca_df, by = c("TaskPHID", "AuthorPHID", "date_created", "comment_text")) |>
+  filter(comment_text != "nan") #TODO: look at this more in depth

 preprocess_comment <- function(message) {
  library(stringr)
@@ -38,7 +53,7 @@ neurobiber_subcomment_pca_df$comment_type <- "subcomment"
 neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
 neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)

-neurobiber_subcomment_pca_df <- neurobiber_subcomment_pca_df %>%
+subcomment_joined <- subcomment_joined %>%
  mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
                                  paste(neurobiber_description_pca_df$AuthorPHID,
                                        neurobiber_description_pca_df$TaskPHID)))
@@ -81,31 +96,24 @@ affiliationColors <-
  setNames( c('#5da2d8', '#c7756a')
            ,c("False", "True"))

+subcomment_joined_no_gerrit <- subcomment_joined |>
+  filter(isGerritBot != "TRUE")

-neurobiber_subcomment_pca_df_x <- neurobiber_subcomment_pca_df %>%
-  left_join(
-    neurobiber_description_pca_df %>%
-      select(TaskPHID, priority),
-    by = "TaskPHID"
-  ) 

-neurobiber_description_pca_df_x <- neurobiber_description_pca_df |>
-  filter(priority %in% c("Lowest","Unbreak Now!"))
 #unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
 #unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
 # geom_point(shape = 21, alpha=0.4, size=2) +
 # geom_bin_2d() +
-ggplot(neurobiber_descriptions_pca_df, aes(x = PC4, y = PC1, fill = pair_in_description)) +
-  facet_grid(source ~ phase, scales="fixed") +
-  geom_point(shape = 21, alpha=0.1, size=2) +
-  geom_smooth() +
-  xlim(-5, 5) + 
-  ylim(-5, 5) +
-  scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
+ggplot(subcomment_joined_no_gerrit, aes(x = PC2, y = PC1, fill = isAuthorWMF)) +
+  facet_grid(source ~ pair_in_description, scales="fixed") +
+  geom_point(shape = 21, alpha=0.3, size=2) +
+  xlim(-15, 15) + 
+  ylim(-15, 15) +
+  scale_fill_viridis_d() + 
  theme_minimal() +
  labs(
-    title = "PCs for Task Comments (Faceted by Source and Phase)",
-    x = "PC4",
+    title = "PCs for Task Comments (Faceted by source and pair_in_description)",
+    x = "PC2",
    y = "PC1",
-    fill = "author_same_as_task_creator?"
+    fill = "isAuthorWMF?"
  )