1
0

updating dsl fitting

This commit is contained in:
Matthew Gaughan 2025-11-18 13:00:07 -08:00
parent 6092e21977
commit 13d2113b73

View File

@ -24,49 +24,67 @@ case_model <- dsl(
) )
summary(case_model) summary(case_model)
trial_model <- dsl( logit_model <- dsl(
model = "logit", model = "logit",
formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac
+ as.factor(source) + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before, + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before + as.factor(source),
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
sample_prob = "sampling_prob", sample_prob = "sampling_prob",
cluster="source",
cross_fit = 3,
sample_split = 20,
data=dsl_df data=dsl_df
) )
summary(trial_model) summary(logit_model)
#anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers)
anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers) #chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer))
chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer)) felm_df <- dsl_df |>
dplyr::mutate(ttr_days = TTR / 24)
c1_df <- dsl_df |> # https://cscu.cornell.edu/wp-content/uploads/clust.pdf
dplyr::filter(source=="c1") # https://statmodeling.stat.columbia.edu/2020/01/10/linear-or-logistic-regression-with-binary-outcomes/
# https://osf.io/preprints/psyarxiv/4gmbv_v1
felm_model <- dsl( felm_model <- dsl(
model = "felm", model = "felm",
formula = TTR ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac +
+ week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before, week_index + median_PC4_adac + n_comments_before + + isAuthorWMF + median_gerrit_reviewers,
predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"),
prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"),
sample_prob = "sampling_prob", sample_prob = "sampling_prob",
fixed_effect = "oneway", fixed_effect = "oneway",
index = c("source"), index = c("source"),
cluster="source", cluster="source",
data=dsl_df cross_fit = 3,
sample_split = 20,
data=felm_df
) )
summary(felm_model) summary(felm_model)
#https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39 #https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39
felm_df <- dsl_df |>
dplyr::mutate(ttr_days = TTR / 24) library(broom)
felm_model <- dsl( library(dplyr)
model = "felm", tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALSE, ...) {
formula = ttr_days ~ human_EP_prop_adac, res <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...))
predicted_var = c("human_EP_prop_adac"), terms <- row.names(res)
prediction = c("olmo_EP_prop_adac"), cols <- c("estimate" = "Estimate", "std.error" = "Std. Error", "p.value" = "p value")
sample_prob = "sampling_prob", if (conf.int) {
fixed_effect = "oneway", cols <- c(cols, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
index = c("phase"), }
cluster="phase", out <- as.list(res)[cols]
data=felm_df names(out) <- names(cols)
) out <- as_tibble(as.data.frame(out))
summary(felm_model) out <- dplyr::bind_cols(term = terms, out)
if (exponentiate)
out <- broom:::exponentiate(out)
return(out)
}
# Coefficient plot for the fitted felm DSL model: one point per term, with a
# horizontal interval of +/- 1.96 standard errors computed from the tidied
# `estimate` and `std.error` columns.
coef_df <- tidy.dsl(felm_model)
ggplot(coef_df, aes(x = estimate, y = term)) +
  geom_point(size = 1) +
  # The bars run along x (xmin/xmax), so the end-cap size is set with `width`.
  # `height` is a geom_errorbarh() argument, not a geom_errorbar() one.
  geom_errorbar(
    aes(
      xmin = estimate - 1.96 * std.error,
      xmax = estimate + 1.96 * std.error
    ),
    width = 0.2
  ) +
  # Dashed reference line at a coefficient of zero.
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Fixed Effects Model Coefficients",
    x = "Coefficient Estimate",
    y = "Variable"
  ) +
  theme_minimal()