Update scripts with new analysis and new information for the write-up

2025-12-01 13:44:49 -08:00 · 2025-12-01 13:44:49 -08:00 · 86ee932c67
commit 86ee932c67
parent a0545ad8de
5 changed files with 67 additions and 13 deletions
--- a/analysis_data/120125_data_verification.R
+++ b/analysis_data/120125_data_verification.R
@ -0,0 +1,8 @@
+library(tidyverse)
+
+# Location of the input CSV ("~" expands to the user's home directory).
+main_csv <- "~/analysis_data/102125_constituent_dfs/110525_olmo_batched_categorized.csv"
+# Load the whole file; the first row supplies the column names.
+main_df <- read.csv(file = main_csv, header = TRUE)
+
+# Keep only the rows whose `source` column equals "c3".
+c3_df <- filter(main_df, source == "c3")
+
--- a/analysis_data/data_verification.R
+++ b/analysis_data/data_verification.R
@ -10,10 +10,10 @@ library(purrr)
    # get the categorical variables encoded as integers, then wrapped as factors
    # figure out power at 200, 400, 500, 750, and 1000 
 #joining sentences with their 
-labeled_csv <-"~/p2/quest/092325_biberplus_complete_labels.csv"
+labeled_csv <-"~/analysis_data/100625_constituent_dfs/092325_biberplus_complete_labels.csv"
 labeled_df <- read.csv(labeled_csv, header = TRUE) 

-main_csv <- "~/analysis_data/constituent_dfs/071425_master_discussion_data.csv"
+main_csv <- "~/analysis_data/100625_constituent_dfs/071425_master_discussion_data.csv"
 main_df <- read.csv(main_csv, header = TRUE) 

 dupes_labeled <- labeled_df %>% count(date_created, id, comment_text, AuthorPHID, TaskPHID) %>% filter(n > 1)
--- a/dsl/111725_DSL_frame.csv
+++ b/dsl/111725_DSL_frame.csv
--- a/dsl/final_bivariate.R
+++ b/dsl/final_bivariate.R
@ -1,9 +1,55 @@
 library(tidyverse)
 #library(dsl)
 library(dplyr)
-dsl_csv <-"~/dsl/110925_DSL_df_adac.csv"
+dsl_csv <-"~/dsl/111725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE) 

+# Weekly roll-up of dsl_df: one row per (week_index, source, isAuthorWMF).
+# NOTE(review): sum() and median() below propagate NA; if any of these
+# columns can be missing, add `na.rm = TRUE` -- confirm against the data.
+weekly_summary <- dsl_df |>
+  group_by(week_index, source, isAuthorWMF) |>
+  summarise(
+    # number of rows with a non-missing resolution_outcome
+    tasks_made = sum(!is.na(resolution_outcome)),
+    count_resolution_outcome = sum(dsl_score),
+    author_closer_sum = sum(author_closer == TRUE),
+    median_olmo_EP_prop_adac = median(olmo_EP_prop_adac),
+    median_olmo_TSOL_prop_adac = median(olmo_TSOL_prop_adac),
+    median_comments_before_resolution = median(n_comments_before),
+    # return an ungrouped frame (and silence the regrouping message) so
+    # later verbs do not silently operate per (week_index, source) group
+    .groups = "drop"
+  )
+
+# Per-source reference weeks drawn as vertical lines. Each marker appears
+# only in the facet of the source it belongs to. `line_style` is passed
+# straight to ggplot2: "dotted" is a named linetype, "3313" is a hex
+# on/off dash pattern.
+source_markers <- tibble(
+  source = c("c1", "c1", "c1", "c2", "c2", "c3", "c3"),
+  week = c(-29, -9, -4, -99, -4, -97, -3),
+  line_style = c(
+    "dotted", "dotted", "3313", "dotted", "3313", "dotted", "3313"
+  )
+) |>
+  # only mark sources that are present, so no empty facet gets created
+  filter(source %in% weekly_summary$source)
+
+# Dodged weekly bar chart of tasks_made, one facet row per source, filled
+# by isAuthorWMF.
+ggplot(
+  weekly_summary,
+  aes(
+    x = week_index,
+    y = tasks_made,
+    fill = isAuthorWMF
+  )
+) +
+  facet_grid(source ~ ., scales = "free_y") +
+  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
+  # one layer for all per-source markers (drawn once each, rather than
+  # once per row of a filtered copy of weekly_summary)
+  geom_vline(
+    data = source_markers,
+    aes(xintercept = week, linetype = line_style),
+    color = "black", linewidth = 0.5
+  ) +
+  # use the strings in line_style as literal linetypes, without a legend
+  scale_linetype_identity() +
+  # week 0 is marked in every facet
+  geom_vline(
+    xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5
+  ) +
+  theme_minimal() +
+  scale_fill_viridis_d()
+

 outcome_summary <- dsl_df |>
  group_by(source, isAuthorWMF)|>
@ -23,13 +69,15 @@ library(ggdist)
 ggplot(
  dsl_df, 
  aes(
-    x=n_comments_before,
-    color=source,
-    fill=source
+    x=week_index,
+    y=olmo_EP_prop_adac,
+    color=isAuthorWMF
  )
 ) +
-  facet_grid(~isAuthorWMF) +
-  stat_halfeye() +
+  facet_grid(source ~ .) +
+  geom_point() +
+  geom_smooth() + 
+  scale_color_viridis_d() +
  theme_minimal() 

 dsl_df <- dsl_df |>
--- a/dsl/survival.R
+++ b/dsl/survival.R
@ -1,6 +1,6 @@
 library(tidyverse)

-dsl_csv <-"~/dsl/102725_DSL_df_adac.csv"
+dsl_csv <-"~/dsl/111725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE) 
 #https://stats.oarc.ucla.edu/wp-content/uploads/2025/02/survival_r_full.html

@ -8,10 +8,8 @@ library(survival)
 library(broom)
 dsl_df$ttr_weeks <- dsl_df$TTR / 168
 trial.survival <- Surv(dsl_df$ttr_weeks)
-trial.model <- coxph(trial.survival ~ isAuthorWMF + 
-                       median_PC3_adac + week_index + 
-                       median_gerrit_loc_delta + median_gerrit_reviewers + source +
-                     phase + author_closer, data=dsl_df)
+trial.model <- coxph(trial.survival ~ n_comments_before 
+                     + week_index + as.factor(isAuthorWMF) * as.factor(source), data=dsl_df)
 summary(trial.model)
 trial.tab <- tidy(trial.model,  exponentiate=T, conf.int=T)