From 86ee932c672c047a78444444ad4a5222d7717bfa Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login03.hyak.local>
Date: Mon, 1 Dec 2025 13:44:49 -0800
Subject: [PATCH] updating with new analysis/new information for write up

---
 analysis_data/120125_data_verification.R      |  8 +++
 analysis_data/data_verification.R             |  4 +-
 .../111725_DSL_frame.csv                      |  0
 dsl/final_bivariate.R                         | 60 +++++++++++++++++--
 dsl/survival.R                                |  8 +--
 5 files changed, 67 insertions(+), 13 deletions(-)
 create mode 100644 analysis_data/120125_data_verification.R
 rename 111725_DSL_frame.csv => dsl/111725_DSL_frame.csv (100%)

diff --git a/analysis_data/120125_data_verification.R b/analysis_data/120125_data_verification.R
new file mode 100644
index 0000000..c586795
--- /dev/null
+++ b/analysis_data/120125_data_verification.R
@@ -0,0 +1,8 @@
+# Load the CSV named by main_csv and keep only its "c3" rows.
+library(tidyverse)
+
+main_csv <- "~/analysis_data/102125_constituent_dfs/110525_olmo_batched_categorized.csv"
+main_df <- read.csv(main_csv, header = TRUE)
+
+# c3_df: the rows of main_df whose `source` column equals "c3".
+c3_df <- filter(main_df, source == "c3")
diff --git a/analysis_data/data_verification.R b/analysis_data/data_verification.R
index f69ae38..3d54561 100644
--- a/analysis_data/data_verification.R
+++ b/analysis_data/data_verification.R
@@ -10,10 +10,10 @@ library(purrr)
     # get the categorical variables encoded as integers, then wrapped as factors
     # figure out power at 200, 400, 500, 750, and 1000 
 #joining sentences with their 
-labeled_csv <-"~/p2/quest/092325_biberplus_complete_labels.csv"
+labeled_csv <-"~/analysis_data/100625_constituent_dfs/092325_biberplus_complete_labels.csv"
 labeled_df <- read.csv(labeled_csv, header = TRUE) 
 
-main_csv <- "~/analysis_data/constituent_dfs/071425_master_discussion_data.csv"
+main_csv <- "~/analysis_data/100625_constituent_dfs/071425_master_discussion_data.csv"
 main_df <- read.csv(main_csv, header = TRUE) 
 
 dupes_labeled <- labeled_df %>% count(date_created, id, comment_text, AuthorPHID, TaskPHID) %>% filter(n > 1)
diff --git a/111725_DSL_frame.csv b/dsl/111725_DSL_frame.csv
similarity index 100%
rename from 111725_DSL_frame.csv
rename to dsl/111725_DSL_frame.csv
diff --git a/dsl/final_bivariate.R b/dsl/final_bivariate.R
index 1658dac..5fd2c0b 100644
--- a/dsl/final_bivariate.R
+++ b/dsl/final_bivariate.R
@@ -1,9 +1,55 @@
 library(tidyverse)
 #library(dsl)
 library(dplyr)
-dsl_csv <-"~/dsl/110925_DSL_df_adac.csv"
+dsl_csv <-"~/dsl/111725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE) 
 
+# Weekly roll-up of dsl_df for each (week_index, source, isAuthorWMF) cell.
+weekly_summary <- dsl_df |>
+  group_by(week_index, source, isAuthorWMF) |>
+  summarise(
+    tasks_made = sum(!is.na(resolution_outcome)),
+    count_resolution_outcome = sum(dsl_score),
+    author_closer_sum = sum(author_closer == TRUE),
+    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
+    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
+    median_comments_before_resolution = median(n_comments_before),
+    # Return an ungrouped result and silence the regrouping message.
+    .groups = "drop"
+  )
+
+# Per-source reference weeks, one row per vertical line, so each line is drawn
+# once rather than once per row of the filtered summary.
+week_markers <- data.frame(
+  source = c("c1", "c1", "c1", "c2", "c2", "c3", "c3"),
+  week = c(-29, -9, -4, -99, -4, -97, -3),
+  line_type = c("dotted", "dotted", "3313", "dotted", "3313", "dotted", "3313")
+) |>
+  # Skip markers for sources absent from the data so no empty facet appears.
+  filter(source %in% weekly_summary$source)
+
+# Dodged weekly task counts filled by isAuthorWMF, one facet row per source.
+ggplot(
+  weekly_summary,
+  aes(
+    x = week_index, y = tasks_made, fill = isAuthorWMF
+  )
+) +
+  facet_grid(source ~ ., scales = "free_y") +
+  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
+  geom_vline(
+    data = week_markers,
+    aes(xintercept = week, linetype = line_type),
+    color = "black",
+    linewidth = 0.5
+  ) +
+  # Use the linetype strings in week_markers verbatim.
+  scale_linetype_identity() +
+  # Week 0 is marked in every facet.
+  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
+  theme_minimal() +
+  scale_fill_viridis_d()
+
 
 outcome_summary <- dsl_df |>
   group_by(source, isAuthorWMF)|>
@@ -23,13 +69,15 @@ library(ggdist)
 ggplot(
   dsl_df, 
   aes(
-    x=n_comments_before,
-    color=source,
-    fill=source
+    x=week_index,
+    y=olmo_EP_prop_adac,
+    color=isAuthorWMF
   )
 ) +
-  facet_grid(~isAuthorWMF) +
-  stat_halfeye() +
+  facet_grid(source ~ .) +
+  geom_point() +
+  geom_smooth() + 
+  scale_color_viridis_d() +
   theme_minimal() 
 
 dsl_df <- dsl_df |>
diff --git a/dsl/survival.R b/dsl/survival.R
index 2784d24..ac92281 100644
--- a/dsl/survival.R
+++ b/dsl/survival.R
@@ -1,6 +1,6 @@
 library(tidyverse)
 
-dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
+dsl_csv <-"~/dsl/111725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE) 
 #https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html
 
@@ -8,10 +8,8 @@ library(survival)
 library(broom)
 dsl_df$ttr_weeks <- dsl_df$TTR / 168
 trial.survival <- Surv(dsl_df$ttr_weeks)
-trial.model <- coxph(trial.survival ~ isAuthorWMF + 
-                       median_PC3_adac + week_index + 
-                       median_gerrit_loc_delta + median_gerrit_reviewers + source +
-                     phase + author_closer, data=dsl_df)
+trial.model <- coxph(trial.survival ~ n_comments_before + week_index +
+                       as.factor(isAuthorWMF) * as.factor(source), data = dsl_df)
 summary(trial.model)
 trial.tab <- tidy(trial.model,  exponentiate=T, conf.int=T)