From 13d2113b737d5d2d553b5ec247ef0d48f967cddb Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Tue, 18 Nov 2025 13:00:07 -0800 Subject: [PATCH] updating dsl fitting --- dsl/dsl.R | 74 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/dsl/dsl.R b/dsl/dsl.R index 9401812..b7400c6 100644 --- a/dsl/dsl.R +++ b/dsl/dsl.R @@ -24,49 +24,67 @@ case_model <- dsl( ) summary(case_model) -trial_model <- dsl( +logit_model <- dsl( model = "logit", formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac - + as.factor(source) + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before, + + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before + as.factor(source), predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), - sample_prob = "sampling_prob", + sample_prob = "sampling_prob", + cluster="source", + cross_fit = 3, + sample_split = 20, data=dsl_df ) -summary(trial_model) - -anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers) -chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer)) - -c1_df <- dsl_df |> - dplyr::filter(source=="c1") - +summary(logit_model) +#anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers) +#chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer)) +felm_df <- dsl_df |> + dplyr::mutate(ttr_days = TTR / 24) +# https://cscu.cornell.edu/wp-content/uploads/clust.pdf +# https://statmodeling.stat.columbia.edu/2020/01/10/linear-or-logistic-regression-with-binary-outcomes/ +# https://osf.io/preprints/psyarxiv/4gmbv_v1 felm_model <- dsl( model = "felm", - formula = TTR ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac - + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before, + formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac + + week_index + median_PC4_adac + n_comments_before + + isAuthorWMF + median_gerrit_reviewers, predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), sample_prob = "sampling_prob", fixed_effect = "oneway", index = c("source"), cluster="source", - data=dsl_df + cross_fit = 3, + sample_split = 20, + data=felm_df ) summary(felm_model) #https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39 -felm_df <- dsl_df |> - dplyr::mutate(ttr_days = TTR / 24) -felm_model <- dsl( - model = "felm", - formula = ttr_days ~ human_EP_prop_adac, - predicted_var = c("human_EP_prop_adac"), - prediction = c("olmo_EP_prop_adac"), - sample_prob = "sampling_prob", - fixed_effect = "oneway", - index = c("phase"), - cluster="phase", - data=felm_df -) -summary(felm_model) + +library(broom) +library(dplyr) +tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) { + res <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...)) + terms <- row.names(res) + cols <- c("estimate" = "Estimate", "std.error" = "Std. Error", "p.value" = "p value") + if (conf.int) { + cols <- c(cols, "conf.low" = "CI Lower", "conf.high" = "CI Upper") + } + out <- as.list(res)[cols] + names(out) <- names(cols) + out <- as_tibble(as.data.frame(out)) + out <- dplyr::bind_cols(term = terms, out) + if (exponentiate) + out <- broom:::exponentiate(out) + return(out) +} +coef_df <- tidy.dsl(felm_model) +ggplot(coef_df, aes(x = estimate, y = term)) + + geom_point(size = 1) + + geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96 *std.error), height = 0.2) + + geom_vline(xintercept = 0, linetype = "dashed", color = "red") + + labs(title = "Fixed Effects Model Coefficients", + x = "Coefficient Estimate", + y = "Variable") + + theme_minimal()