From 86ee932c672c047a78444444ad4a5222d7717bfa Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Mon, 1 Dec 2025 13:44:49 -0800
Subject: [PATCH] updating with new analysis/new information for write up

---
Review notes (text between the three-dash line and the diffstat is not
part of the commit message):

 * Line breaks, blank lines and indentation were restored from a
   whitespace-collapsed copy of this patch. Whitespace and blank-line
   placement in context lines are a best guess; if a hunk is rejected,
   retry with `git apply --ignore-whitespace` or regenerate the patch
   from the branch.
 * dsl/final_bivariate.R: the seven per-source geom_vline() layers are
   replaced by one layer fed from a small marker table, and summarise()
   now uses .groups = "drop". Hunk headers and the diffstat below were
   updated to match; the post-image blob id on that file's "index" line
   was not recomputed.
 * TODO(review): dsl/survival.R still uses T for TRUE on an untouched
   line (exponentiate=T, conf.int=T); left alone here because it is
   context, not part of this change.

 analysis_data/120125_data_verification.R |  8 +++
 analysis_data/data_verification.R        |  4 +-
 .../111725_DSL_frame.csv                 |  0
 dsl/final_bivariate.R                    | 58 +++++++++++++++++--
 dsl/survival.R                           |  8 +--
 5 files changed, 65 insertions(+), 13 deletions(-)
 create mode 100644 analysis_data/120125_data_verification.R
 rename 111725_DSL_frame.csv => dsl/111725_DSL_frame.csv (100%)

diff --git a/analysis_data/120125_data_verification.R b/analysis_data/120125_data_verification.R
new file mode 100644
index 0000000..c586795
--- /dev/null
+++ b/analysis_data/120125_data_verification.R
@@ -0,0 +1,8 @@
+library(tidyverse)
+
+main_csv <- "~/analysis_data/102125_constituent_dfs/110525_olmo_batched_categorized.csv"
+main_df <- read.csv(main_csv, header = TRUE)
+
+c3_df <- main_df |>
+  filter(source=="c3")
+
diff --git a/analysis_data/data_verification.R b/analysis_data/data_verification.R
index f69ae38..3d54561 100644
--- a/analysis_data/data_verification.R
+++ b/analysis_data/data_verification.R
@@ -10,10 +10,10 @@ library(purrr)
 # get the categorical variables encoded as integers, then wrapped as factors
 # figure out power at 200, 400, 500, 750, and 1000
 #joining sentences with their
-labeled_csv <-"~/p2/quest/092325_biberplus_complete_labels.csv"
+labeled_csv <-"~/analysis_data/100625_constituent_dfs/092325_biberplus_complete_labels.csv"
 labeled_df <- read.csv(labeled_csv, header = TRUE)
 
-main_csv <- "~/analysis_data/constituent_dfs/071425_master_discussion_data.csv"
+main_csv <- "~/analysis_data/100625_constituent_dfs/071425_master_discussion_data.csv"
 main_df <- read.csv(main_csv, header = TRUE)
 
 dupes_labeled <- labeled_df %>% count(date_created, id, comment_text, AuthorPHID, TaskPHID) %>% filter(n > 1)
diff --git a/111725_DSL_frame.csv b/dsl/111725_DSL_frame.csv
similarity index 100%
rename from 111725_DSL_frame.csv
rename to dsl/111725_DSL_frame.csv
diff --git a/dsl/final_bivariate.R b/dsl/final_bivariate.R
index 1658dac..5fd2c0b 100644
--- a/dsl/final_bivariate.R
+++ b/dsl/final_bivariate.R
@@ -1,9 +1,53 @@
 library(tidyverse)
 #library(dsl)
 library(dplyr)
-dsl_csv <-"~/dsl/110925_DSL_df_adac.csv"
+dsl_csv <-"~/dsl/111725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE)
 
+# Reference weeks marked on each source's facet, one row per vertical rule.
+# Drawing them from this table adds one line per marker; mapping a constant
+# inside aes() against weekly_summary would redraw it once per summary row.
+week_markers <- tibble(
+  source = c("c1", "c1", "c1", "c2", "c2", "c3", "c3"),
+  week_index = c(-29, -9, -4, -99, -4, -97, -3),
+  marker_linetype = c(
+    "dotted", "dotted", "3313", "dotted", "3313", "dotted", "3313"
+  )
+) |>
+  # keep only sources present in the data so no empty facet is created
+  filter(source %in% dsl_df$source)
+
+weekly_summary <- dsl_df |>
+  group_by(week_index, source, isAuthorWMF)|>
+  summarise(
+    tasks_made = sum(!is.na(resolution_outcome)),
+    count_resolution_outcome = sum(dsl_score),
+    author_closer_sum = sum(author_closer == TRUE),
+    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
+    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
+    median_comments_before_resolution = median(n_comments_before),
+    # return an ungrouped frame and silence the regrouping message
+    .groups = "drop"
+  )
+
+ggplot(
+  weekly_summary,
+  aes(
+    x=week_index,
+    y=tasks_made,
+    fill=isAuthorWMF
+  )
+) +
+  facet_grid(source ~ ., scales = "free_y") +
+  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
+  geom_vline(data = week_markers,
+             aes(xintercept = week_index, linetype = marker_linetype),
+             color = "black", linewidth = 0.5) +
+  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
+  scale_linetype_identity() +
+  theme_minimal() +
+  scale_fill_viridis_d()
+
 
 outcome_summary <- dsl_df |>
   group_by(source, isAuthorWMF)|>
@@ -23,13 +67,15 @@ library(ggdist)
 ggplot(
   dsl_df,
   aes(
-    x=n_comments_before,
-    color=source,
-    fill=source
+    x=week_index,
+    y=olmo_EP_prop_adac,
+    color=isAuthorWMF
   )
 ) +
-  facet_grid(~isAuthorWMF) +
-  stat_halfeye() +
+  facet_grid(source ~ .) +
+  geom_point() +
+  geom_smooth() +
+  scale_color_viridis_d() +
   theme_minimal()
 
 dsl_df <- dsl_df |>
diff --git a/dsl/survival.R b/dsl/survival.R
index 2784d24..ac92281 100644
--- a/dsl/survival.R
+++ b/dsl/survival.R
@@ -1,6 +1,6 @@
 library(tidyverse)
 
-dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
+dsl_csv <-"~/dsl/111725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE)
 
 #https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
@@ -8,10 +8,8 @@ library(survival)
 library(broom)
 dsl_df$ttr_weeks <- dsl_df$TTR / 168
 trial.survival <- Surv(dsl_df$ttr_weeks)
-trial.model <- coxph(trial.survival ~ isAuthorWMF +
-                       median_PC3_adac + week_index +
-                       median_gerrit_loc_delta + median_gerrit_reviewers + source +
-                       phase + author_closer, data=dsl_df)
+trial.model <- coxph(trial.survival ~ n_comments_before +
+                       week_index + as.factor(isAuthorWMF) * as.factor(source), data=dsl_df)
 summary(trial.model)
 trial.tab <- tidy(trial.model, exponentiate=T, conf.int=T)
 