From a0545ad8de5f7d473d0b901d68efa93b8df5113a Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Wed, 26 Nov 2025 13:10:20 -0800 Subject: [PATCH] adding small updates to results scripts --- dsl/dsl.R | 6 +++--- p2/quest/neurobiber_PCA_analysis.R | 28 ++++++++++++---------------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/dsl/dsl.R b/dsl/dsl.R index 119fb94..5bb800c 100644 --- a/dsl/dsl.R +++ b/dsl/dsl.R @@ -68,10 +68,10 @@ summary(felm_model) dev_model <- dsl( model = "logit", - formula = task_resolution ~ as.factor(isAuthorWMF) * as.factor(source) * human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac + formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac + median_PC4_adac + median_PC3_adac + n_comments_before + median_gerrit_reviewers + median_gerrit_loc_delta - + week_index, + + week_index + as.factor(isAuthorWMF) * as.factor(source), predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), sample_prob = "sampling_prob", @@ -105,7 +105,7 @@ ggplot(coef_df, aes(x = estimate, y = term)) + geom_point(size = 1) + geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96 *std.error), height = 0.2) + geom_vline(xintercept = 0, linetype = "dashed", color = "red") + - labs(title = "Fixed Effects Model Coefficients", + labs(title = "DSL Logit Model Coefficients", x = "Coefficient Estimate", y = "Variable") + theme_minimal() diff --git a/p2/quest/neurobiber_PCA_analysis.R b/p2/quest/neurobiber_PCA_analysis.R index 8ec6522..1689783 100644 --- a/p2/quest/neurobiber_PCA_analysis.R +++ b/p2/quest/neurobiber_PCA_analysis.R @@ -9,12 +9,12 @@ library(dplyr) #pca_csv <- "~/p2/quest/102025_total_pca_df.csv" #pca_df <- read.csv(pca_csv , header = TRUE) |> mutate(comment_text = text) -main_csv <- "~/analysis_data/102725_unified.csv" +main_csv <- "~/analysis_data/110925_unified.csv" main_df <- read.csv(main_csv , header = TRUE) main_df <- main_df |> mutate( - comment_wordcount = as.integer(str_count(replace_na(as.character(comment_text), ""), "\\S+")) + comment_wordcount = as.integer(stringr::str_count(tidyr::replace_na(as.character(comment_text), ""), "\\S+")) ) @@ -25,24 +25,20 @@ description_df <- main_df |> replies_df <- main_df |> filter(comment_type == "task_subcomment") |> - filter(isGerritBot != TRUE) |> - left_join( - description_df, - by="TaskPHID" - ) + filter(isGerritBot != TRUE) - -ggplot(replies_df, aes(x = autho, y = PC3, fill = comment_type)) + - facet_grid(source~phase, scales="fixed") + - geom_point(shape = 21, alpha=0.3, size=2) + - xlim(-30, 30) + - ylim(-30, 30) + +library(ggplot2) +ggplot(replies_df, aes(x = PC3, y = PC4, fill = isAuthorWMF)) + + facet_grid(ADAC~source, scales="fixed") + + geom_point(shape = 21, alpha=0.15, size=3) + + xlim(-50, 50) + + ylim(-50, 50) + scale_fill_viridis_d() + theme_minimal() + labs( - title = "PCs for Task Comments (Faceted by source and phase)", - x = "PC4", - y = "PC3", + title = "PCs for Task Comments (Faceted by source (column))", + x = "PC3", + y = "PC4", )