backing up renewed PCA analysis
This commit is contained in:
parent
840b32a2e4
commit
186a26f261
13
analysis_data/data_verification_2.R
Normal file
13
analysis_data/data_verification_2.R
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
library(tidyverse)
|
||||||
|
library(stringr)
|
||||||
|
library(tidyr)
|
||||||
|
library(dplyr)
|
||||||
|
library(purrr)
|
||||||
|
|
||||||
|
# Load the unified comment table that the checks below inspect.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Rows that agree on all three of these columns count as duplicates.
duplicate_key_cols <- c("comment_text", "TaskPHID", "AuthorPHID")
duplicate_keys <- main_df[, duplicate_key_cols]

# duplicated() skips the first member of each duplicate group, so the
# reverse scan (fromLast = TRUE) is OR-ed in to keep every member.
is_duplicated_row <- duplicated(duplicate_keys) |
  duplicated(duplicate_keys, fromLast = TRUE)
duplicates <- main_df[is_duplicated_row, ]

# Pull two specific rows by id for inspection.
pulling <- filter(main_df, id %in% c("24695", "24696"))
|
||||||
@ -1,11 +1,26 @@
|
|||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
|
|
||||||
neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv"
|
neurobiber_description_pca_csv <-"~/p2/quest/100125_description_PCA_df.csv"
|
||||||
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE)
|
neurobiber_description_pca_df <- read.csv(neurobiber_description_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||||
|
|
||||||
neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv"
|
neurobiber_subcomment_pca_csv <-"~/p2/quest/100125_subcomment_PCA_df.csv"
|
||||||
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE)
|
neurobiber_subcomment_pca_df <- read.csv(neurobiber_subcomment_pca_csv , header = TRUE) |> mutate(comment_text = text)
|
||||||
|
|
||||||
|
# Read the unified comment table and keep only the metadata columns that
# get attached to the PCA outputs below.
main_csv <- "~/analysis_data/100625_unified_w_affil.csv"
main_df <- read.csv(main_csv, header = TRUE)
main_df <- select(
  main_df,
  TaskPHID, AuthorPHID, date_created, comment_text,
  isAuthorWMF, isGerritBot, resolution_outcome, task_title
)

# Columns that must all agree for a metadata row to match a PCA row.
pca_join_keys <- c("TaskPHID", "AuthorPHID", "date_created", "comment_text")

# Join main_df to neurobiber_description_pca_df.
# right_join() keeps every PCA row, including those with no match in main_df.
description_joined <- right_join(
  main_df,
  neurobiber_description_pca_df,
  by = pca_join_keys
)
# Drop rows whose text is the literal string "nan"; filter() also drops rows
# where the comparison evaluates to NA.
# TODO: look at this more in depth
description_joined <- filter(description_joined, comment_text != "nan")

# Join main_df to neurobiber_subcomment_pca_df (same keys, same clean-up).
subcomment_joined <- right_join(
  main_df,
  neurobiber_subcomment_pca_df,
  by = pca_join_keys
)
# TODO: look at this more in depth
subcomment_joined <- filter(subcomment_joined, comment_text != "nan")
|
||||||
|
|
||||||
preprocess_comment <- function(message) {
|
preprocess_comment <- function(message) {
|
||||||
library(stringr)
|
library(stringr)
|
||||||
@ -38,7 +53,7 @@ neurobiber_subcomment_pca_df$comment_type <- "subcomment"
|
|||||||
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
|
neurobiber_description_pca_df$cleaned_comment <- sapply(neurobiber_description_pca_df$text, preprocess_comment)
|
||||||
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
|
neurobiber_subcomment_pca_df$cleaned_comment <- sapply(neurobiber_subcomment_pca_df$text, preprocess_comment)
|
||||||
|
|
||||||
neurobiber_subcomment_pca_df <- neurobiber_subcomment_pca_df %>%
|
subcomment_joined <- subcomment_joined %>%
|
||||||
mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
|
mutate(pair_in_description = (paste(AuthorPHID, TaskPHID) %in%
|
||||||
paste(neurobiber_description_pca_df$AuthorPHID,
|
paste(neurobiber_description_pca_df$AuthorPHID,
|
||||||
neurobiber_description_pca_df$TaskPHID)))
|
neurobiber_description_pca_df$TaskPHID)))
|
||||||
@ -81,31 +96,24 @@ affiliationColors <-
|
|||||||
setNames( c('#5da2d8', '#c7756a')
|
setNames( c('#5da2d8', '#c7756a')
|
||||||
,c("False", "True"))
|
,c("False", "True"))
|
||||||
|
|
||||||
|
# Subcomments excluding rows flagged as Gerrit bot activity.
# filter() keeps only rows where the condition is TRUE, so rows with a
# missing isGerritBot value are dropped as well.
# NOTE(review): confirm that dropping those rows is intended.
subcomment_joined_no_gerrit <- filter(
  subcomment_joined,
  isGerritBot != "TRUE"
)
|
||||||
|
|
||||||
neurobiber_subcomment_pca_df_x <- neurobiber_subcomment_pca_df %>%
|
|
||||||
left_join(
|
|
||||||
neurobiber_description_pca_df %>%
|
|
||||||
select(TaskPHID, priority),
|
|
||||||
by = "TaskPHID"
|
|
||||||
)
|
|
||||||
|
|
||||||
neurobiber_description_pca_df_x <- neurobiber_description_pca_df |>
|
|
||||||
filter(priority %in% c("Lowest","Unbreak Now!"))
|
|
||||||
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
|
#unified_df$AuthorWMFAffil <- factor(unified_df$AuthorWMFAffil, levels = c("False", "True"))
|
||||||
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
|
#unified_df <- unified_df[order(unified_df$AuthorWMFAffil), ]
|
||||||
# geom_point(shape = 21, alpha=0.4, size=2) +
|
# geom_point(shape = 21, alpha=0.4, size=2) +
|
||||||
# geom_bin_2d() +
|
# geom_bin_2d() +
|
||||||
ggplot(neurobiber_descriptions_pca_df, aes(x = PC4, y = PC1, fill = pair_in_description)) +
|
ggplot(subcomment_joined_no_gerrit, aes(x = PC2, y = PC1, fill = isAuthorWMF)) +
|
||||||
facet_grid(source ~ phase, scales="fixed") +
|
facet_grid(source ~ pair_in_description, scales="fixed") +
|
||||||
geom_point(shape = 21, alpha=0.1, size=2) +
|
geom_point(shape = 21, alpha=0.3, size=2) +
|
||||||
geom_smooth() +
|
xlim(-15, 15) +
|
||||||
xlim(-5, 5) +
|
ylim(-15, 15) +
|
||||||
ylim(-5, 5) +
|
scale_fill_viridis_d() +
|
||||||
scale_fill_viridis_d() + # Or scale_fill_brewer/palette of your choice
|
|
||||||
theme_minimal() +
|
theme_minimal() +
|
||||||
labs(
|
labs(
|
||||||
title = "PCs for Task Comments (Faceted by Source and Phase)",
|
title = "PCs for Task Comments (Faceted by source and pair_in_description)",
|
||||||
x = "PC4",
|
x = "PC2",
|
||||||
y = "PC1",
|
y = "PC1",
|
||||||
fill = "author_same_as_task_creator?"
|
fill = "isAuthorWMF?"
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user