From 13d2113b737d5d2d553b5ec247ef0d48f967cddb Mon Sep 17 00:00:00 2001
From: Matthew Gaughan <mjilg@klone-login03.hyak.local>
Date: Tue, 18 Nov 2025 13:00:07 -0800
Subject: [PATCH] updating dsl fitting

---
 dsl/dsl.R | 74 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 28 deletions(-)

diff --git a/dsl/dsl.R b/dsl/dsl.R
index 9401812..b7400c6 100644
--- a/dsl/dsl.R
+++ b/dsl/dsl.R
@@ -24,49 +24,67 @@ case_model <- dsl(
 )
 summary(case_model)
 
-trial_model <- dsl(
+logit_model <- dsl(
   model = "logit", 
   formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac 
-    + as.factor(source) + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before,
+     + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before + as.factor(source),
   predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
   prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
-  sample_prob = "sampling_prob",
+  sample_prob = "sampling_prob", 
+  cluster="source",  # NOTE(review): presumably clusters standard errors by source; source is also a factor covariate above - confirm this is intended
+  cross_fit = 3,  # NOTE(review): presumably the number of cross-fitting folds - confirm against the dsl() documentation
+  sample_split = 20,  # NOTE(review): presumably the number of repeated sample splits - confirm against the dsl() documentation
   data=dsl_df
 )
-summary(trial_model)
-
-anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers)
-chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer))
-
-c1_df <- dsl_df |>
-  dplyr::filter(source=="c1")
-
+summary(logit_model)
+#anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers)
+#chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer))
+felm_df <- dsl_df |>  # copy of dsl_df with one derived column added
+  dplyr::mutate(ttr_days = TTR / 24)  # NOTE(review): assumes TTR is in hours - confirm; ttr_days is not used by any formula visible in this patch
+# https://cscu.cornell.edu/wp-content/uploads/clust.pdf
+# https://statmodeling.stat.columbia.edu/2020/01/10/linear-or-logistic-regression-with-binary-outcomes/
+# https://osf.io/preprints/psyarxiv/4gmbv_v1
 felm_model <- dsl(
   model = "felm", 
-  formula = TTR ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac 
-  + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before,
+  formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac +  # NOTE(review): linear fit on dsl_score - presumably a linear probability model; confirm
+    week_index + median_PC4_adac + n_comments_before + isAuthorWMF + median_gerrit_reviewers,
   predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
   prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
   sample_prob = "sampling_prob",
   fixed_effect = "oneway",
   index = c("source"),
   cluster="source",
-  data=dsl_df
+  cross_fit = 3,  # NOTE(review): presumably the number of cross-fitting folds - confirm against the dsl() documentation
+  sample_split = 20,  # NOTE(review): presumably the number of repeated sample splits - confirm against the dsl() documentation
+  data=felm_df
 )
 summary(felm_model)
 
 #https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39
-felm_df <- dsl_df |>
-  dplyr::mutate(ttr_days = TTR / 24)
-felm_model <- dsl(
-  model = "felm", 
-  formula = ttr_days ~ human_EP_prop_adac,
-  predicted_var = c("human_EP_prop_adac"),
-  prediction = c("olmo_EP_prop_adac"),
-  sample_prob = "sampling_prob",
-  fixed_effect = "oneway",
-  index = c("phase"),
-  cluster="phase",
-  data=felm_df
-)
-summary(felm_model)
+
+library(broom)
+library(dplyr)
+# broom-style tidier for dsl fits: one row per term, with CI columns only when conf.int = TRUE.
+tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
+  fit_summary <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...))
+  # names = output column names, values = headers looked up in the summary table
+  wanted <- c("estimate" = "Estimate", "std.error" = "Std. Error", "p.value" = "p value")
+  if (conf.int) {
+    wanted <- c(wanted, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
+  }
+  picked <- stats::setNames(as.list(fit_summary)[wanted], names(wanted))
+  tidied <- dplyr::bind_cols(term = row.names(fit_summary), as_tibble(as.data.frame(picked)))
+  if (exponentiate) {
+    tidied <- broom:::exponentiate(tidied)
+  }
+  tidied
+}
+coef_df <- tidy.dsl(felm_model)  # one row per model term: estimate, std.error, p.value
+ggplot(coef_df, aes(x = estimate, y = term)) +
+  geom_point(size = 1) +
+  geom_errorbar(aes(xmin = estimate - 1.96 * std.error, xmax = estimate + 1.96 * std.error), width = 0.2) +  # bars span estimate +/- 1.96 SE; geom_errorbar sizes its caps via `width`
+  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +  # reference line at zero
+  labs(title = "Fixed Effects Model Coefficients",
+       x = "Coefficient Estimate",
+       y = "Variable") +
+  theme_minimal()