From 6f2858dd72299ca34becae563108089980bee5fe Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Mon, 3 Nov 2025 10:04:42 -0800 Subject: [PATCH] updating for new bivariate plots --- analysis_data/100725_bivariate_plots.R | 2 +- analysis_data/style_dict_variables.R | 13 ++++++ dsl/dsl.R | 56 ++++++++++++++++++++++++++ dsl/final_bivariate.R | 45 +++++++++++++++++++++ dsl/survival.R | 8 ++-- mgaughan-rstudio-server_30651103.out | 17 ++++++++ 6 files changed, 136 insertions(+), 5 deletions(-) create mode 100644 dsl/dsl.R create mode 100644 dsl/final_bivariate.R create mode 100644 mgaughan-rstudio-server_30651103.out diff --git a/analysis_data/100725_bivariate_plots.R b/analysis_data/100725_bivariate_plots.R index e94d551..f76939c 100644 --- a/analysis_data/100725_bivariate_plots.R +++ b/analysis_data/100725_bivariate_plots.R @@ -15,7 +15,7 @@ library(ggdist) ggplot(main_df, aes(x = week_index, y = n_comments, fill = isAuthorWMF)) + facet_grid(~source) + - geom_dots(side = "both", layout = "hex", stackratio = 0.92) + + geom_dots(side= "both", layout = "hex", stackratio = 0.92) + scale_fill_viridis_d() + xlim(-130, 15) + theme_minimal() + diff --git a/analysis_data/style_dict_variables.R b/analysis_data/style_dict_variables.R index 2f26a05..7284ed1 100644 --- a/analysis_data/style_dict_variables.R +++ b/analysis_data/style_dict_variables.R @@ -7,6 +7,19 @@ library(purrr) unified_csv <-"~/analysis_data/102725_unified.csv" unified_df <- read.csv(unified_csv, header = TRUE) +unified_df <- unified_df |> mutate(has_update = !is.na(olmo_sentence_labels) & + grepl("(ACTION ON ISSUE|TASK PROGRESS)", as.character(olmo_sentence_labels), + ignore.case = TRUE), + has_update = ifelse( + has_update, 1, 0 + )) + +model <- glm(has_update ~ PC1 + PC2 + PC3 + PC4 + modal_verbs, + family = binomial(link = "logit"), + data = unified_df) +summary(model) + + unified_df |> ggplot( diff --git a/dsl/dsl.R b/dsl/dsl.R new file mode 100644 index 0000000..2b105ca --- /dev/null +++ b/dsl/dsl.R @@ 
-0,0 +1,56 @@ +library(tidyverse) +library(dsl) + +dsl_csv <-"~/dsl/102725_DSL_df_adac.csv" +dsl_df <- read.csv(dsl_csv, header = TRUE) + + +power_model <- power_dsl( + labeled_size = c(100, 200, 300, 600, 1000), + model = "logit", + formula = dsl_score ~ human_SOL_prop_adac + + median_gerrit_loc_delta + median_gerrit_reviewers + + as.factor(isAuthorWMF) + + as.factor(source) + + median_PC3_adac + + week_index, + predicted_var = "human_SOL_prop_adac", + prediction = "olmo_SOL_prop_adac", + sample_prob = "sampling_prob", + data=dsl_df +) +summary(power_model) +plot(power_model, coef_name = "human_SOL_prop_adac") + +dsl_df <- dsl_df |> + filter(source=="c1") + +trial_model <- dsl( + model = "logit", + formula = dsl_score ~ human_BI_prop_adac + + median_gerrit_loc_delta + median_gerrit_reviewers + + as.factor(isAuthorWMF) + + as.factor(author_closer) + + median_PC4_adac + + week_index, + predicted_var = "human_BI_prop_adac", + prediction = "olmo_BI_prop_adac", + sample_prob = "sampling_prob", + data=dsl_df +) +summary(trial_model) + +style_model <- dsl( + model = "lm", + formula = human_BE_prop ~ + median_PC1 + median_PC4 + + as.factor(isAuthorWMF) + + as.factor(author_closer) + + median_PC3 + + week_index, + predicted_var = "human_BE_prop", + prediction = "olmo_BE_prop", + sample_prob = "sampling_prob", + data=dsl_df +) +summary(style_model) diff --git a/dsl/final_bivariate.R b/dsl/final_bivariate.R new file mode 100644 index 0000000..94e7418 --- /dev/null +++ b/dsl/final_bivariate.R @@ -0,0 +1,45 @@ +library(tidyverse) +#library(dsl) +library(dplyr) +dsl_csv <-"~/dsl/102725_DSL_df_adac.csv" +dsl_df <- read.csv(dsl_csv, header = TRUE) + +outcome_summary <- dsl_df |> + group_by(source, isAuthorWMF)|> + summarise( + total_sum = sum(!is.na(resolution_outcome)), + count_resolution_outcome = sum(resolution_outcome, na.rm = TRUE), + success_prop = count_resolution_outcome / total_sum, + median_ttr_days = median(TTR, na.rm = TRUE) / 24 + ) + + +library(ggplot2) +library(ggdist) + + 
+signed_power <- function(x, p) { + sign(x) * abs(x) ^ p +} + +signed_log <- function(x) sign(x) * log1p(abs(x)) +dsl_df <- dsl_df |> + mutate( + sp_med_pc3_adac = signed_power(median_PC3_adac, 0.2), + sp_med_pc4_adac = signed_power(median_PC4_adac, 0.2), + sl_med_pc4_adac = signed_log(median_PC4_adac), + sl_med_pc3_adac = signed_log(median_PC3_adac) + ) + + +ggplot(dsl_df, aes( + y= log1p(TTR/24), + x=sl_med_pc4_adac, + shape=isAuthorWMF, + color=isAuthorWMF + )) + + facet_grid(~source) + + theme_minimal() + + geom_smooth(method="loess", span=0.5) + + geom_point() + + scale_color_viridis_d() diff --git a/dsl/survival.R b/dsl/survival.R index 648823b..2784d24 100644 --- a/dsl/survival.R +++ b/dsl/survival.R @@ -3,8 +3,6 @@ library(tidyverse) dsl_csv <-"~/dsl/102725_DSL_df_adac.csv" dsl_df <- read.csv(dsl_csv, header = TRUE) #https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html -dsl_df <- dsl_df |> - filter(source == "c1") library(survival) library(broom) @@ -12,8 +10,8 @@ dsl_df$ttr_weeks <- dsl_df$TTR / 168 trial.survival <- Surv(dsl_df$ttr_weeks) trial.model <- coxph(trial.survival ~ isAuthorWMF + median_PC3_adac + week_index + - median_gerrit_loc_delta + median_gerrit_reviewers + - olmo_BI_prop_adac, data=dsl_df) + median_gerrit_loc_delta + median_gerrit_reviewers + source + + phase + author_closer, data=dsl_df) summary(trial.model) trial.tab <- tidy(trial.model, exponentiate=T, conf.int=T) @@ -26,3 +24,5 @@ ggplot(trial.tab, surv.at.means <- survfit(trial.model) plot(surv.at.means, xlab="weeks", ylab="survival probability") +#https://hbiostat.org/stat/binarysurv +plot(surv.at.means, xlab = "weeks", ylab = "survival probability", xlim = c(0, 26)) diff --git a/mgaughan-rstudio-server_30651103.out b/mgaughan-rstudio-server_30651103.out new file mode 100644 index 0000000..fc4ec8d --- /dev/null +++ b/mgaughan-rstudio-server_30651103.out @@ -0,0 +1,17 @@ +1. 
SSH tunnel from your workstation using the following command: + + ssh -N -L 8787:n3439:51687 mjilg@klone.hyak.uw.edu + + and point your web browser to http://localhost:8787 + +2. log in to RStudio Server using the following credentials: + + user: mjilg + password: JZoMumQwFbIc7EUcaGbK + +When done using RStudio Server, terminate the job by: + +1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) +2. Issue the following command on the login node: + + scancel -f 30651103