# DSL regression models of `dsl_score` / `task_resolution`, followed by a
# coefficient plot for the final logit specification (`dev_model`).

library(tidyverse)
library(dsl)
library(broom)
library(dplyr)

# Data ----

dsl_csv <- "~/dsl/120725_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)

dsl_df <- dsl_df |>
  dplyr::mutate(
    # NOTE(review): presumably TTR is recorded in hours -- confirm.
    ttr_days = TTR / 24,
    # Alias of `dsl_score`; used as the outcome of `dev_model` below.
    task_resolution = dsl_score
  )

# Baseline models ----

# Single predictor: human-coded EP proportion, with the OLMo-based column
# supplied as its prediction.
base_model <- dsl(
  model = "logit",
  formula = dsl_score ~ human_EP_prop_adac,
  predicted_var = "human_EP_prop_adac",
  prediction = "olmo_EP_prop_adac",
  sample_prob = "sampling_prob",
  data = dsl_df
)
summary(base_model)

# Same as `base_model`, adding `source` as a factor.
case_model <- dsl(
  model = "logit",
  formula = dsl_score ~ human_EP_prop_adac + as.factor(source),
  predicted_var = "human_EP_prop_adac",
  prediction = "olmo_EP_prop_adac",
  sample_prob = "sampling_prob",
  data = dsl_df
)
summary(case_model)

# Full logit model ----

# All three human-coded proportions plus controls; standard errors are
# clustered on `source`.
logit_model <- dsl(
  model = "logit",
  formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac +
    human_RK_prop_adac + week_index + as.factor(isAuthorWMF) +
    median_PC4_adac + median_PC3_adac + n_comments_before +
    as.factor(source) + median_gerrit_reviewers,
  predicted_var = c(
    "human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"
  ),
  prediction = c(
    "olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"
  ),
  sample_prob = "sampling_prob",
  cluster = "source",
  cross_fit = 3,
  sample_split = 20,
  data = dsl_df
)
summary(logit_model)

# anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers)
# chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer))

# Background reading:
# https://cscu.cornell.edu/wp-content/uploads/clust.pdf
# https://statmodeling.stat.columbia.edu/2020/01/10/linear-or-logistic-regression-with-binary-outcomes/
# https://osf.io/preprints/psyarxiv/4gmbv_v1

# Fixed-effects linear model ----

# One-way fixed effects indexed (and clustered) on `source`.
# NOTE(review): this call previously used `data = felm_df`, which is never
# created in this script, so the script stopped here. `dsl_df` is the only
# data frame built above; swap in a filtered frame if one was intended.
felm_model <- dsl(
  model = "felm",
  formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac +
    human_RK_prop_adac + phase + median_PC4_adac + median_PC3_adac +
    n_comments_before + isAuthorWMF,
  predicted_var = c(
    "human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"
  ),
  prediction = c(
    "olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"
  ),
  sample_prob = "sampling_prob",
  fixed_effect = "oneway",
  index = c("source"),
  cluster = "source",
  cross_fit = 3,
  sample_split = 20,
  data = dsl_df
)
summary(felm_model)
# See https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39

# Final logit model with author x source interaction ----

dev_model <- dsl(
  model = "logit",
  formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac +
    human_RK_prop_adac + median_PC4_adac + median_PC3_adac +
    n_comments_before + median_gerrit_reviewers + median_gerrit_loc_delta +
    week_index + as.factor(isAuthorWMF) * as.factor(source),
  predicted_var = c(
    "human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"
  ),
  prediction = c(
    "olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"
  ),
  sample_prob = "sampling_prob",
  cluster = "source",
  cross_fit = 3,
  sample_split = 20,
  data = dsl_df
)
summary(dev_model)
saveRDS(dev_model, "120725_logit_dsl.RDS")

# Tidy coefficients ----

#' Tidy a fitted `dsl` model into a coefficient tibble
#'
#' Builds the table from `dsl:::summary.dsl()` and renames its columns to
#' `estimate`, `std.error`, `p.value` (plus `conf.low` / `conf.high` when
#' requested), with the summary's row names in a leading `term` column.
#'
#' @param x A fitted model returned by `dsl()`.
#' @param conf.int If `TRUE`, include the "CI Lower" / "CI Upper" columns.
#' @param conf.level Passed to `summary.dsl()` as its `ci` argument.
#' @param exponentiate If `TRUE`, the table is passed through broom's
#'   internal `exponentiate()` helper before being returned.
#' @param ... Forwarded to `dsl:::summary.dsl()`.
#' @return A tibble with one row per model term.
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95,
                     exponentiate = FALSE, ...) {
  res <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...))
  terms <- row.names(res)

  # Names are the output column names, values are the summary's column names.
  cols <- c(
    "estimate" = "Estimate",
    "std.error" = "Std. Error",
    "p.value" = "p value"
  )
  if (conf.int) {
    cols <- c(cols, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
  }

  out <- as.list(res)[cols]
  names(out) <- names(cols)
  out <- tibble::as_tibble(as.data.frame(out))
  out <- dplyr::bind_cols(term = terms, out)

  if (exponentiate) {
    out <- broom:::exponentiate(out)
  }
  out
}

coef_df <- tidy.dsl(dev_model)

# Replace raw term names with display labels, then fix the factor order.
# The level vector is reversed so the first label is drawn at the top of
# the y axis.
coef_df <- coef_df |>
  mutate(
    term = recode(
      term,
      "week_index" = "Weeks from deployment",
      "(Intercept)" = "Intercept",
      "n_comments_before" = "# of comments prior to resolution",
      "median_PC4_adac" = "Median Author PC4 Pre-resolution",
      "median_PC3_adac" = "Median Author PC3 Pre-resolution",
      "median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
      "median_gerrit_loc_delta" = "Median LoC Changed (Gerrit)",
      "human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
      "human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'",
      "human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'",
      "as.factor(source)c3" = "HTTP-deprecation (factor)",
      "as.factor(source)c2" = "HTTPS-as-default (factor)",
      "as.factor(isAuthorWMF)TRUE" = "WMF-affiliate Author (factor)",
      "as.factor(isAuthorWMF)TRUE:as.factor(source)c2" =
        "WMF-affiliate Author:HTTPS-as-default",
      "as.factor(isAuthorWMF)TRUE:as.factor(source)c3" =
        "WMF-affiliate Author:HTTP-deprecation"
    ),
    term = factor(term, levels = rev(c(
      "Intercept",
      "% of sentences discussing 'Existent Problems'",
      "% of sentences discussing 'Solutions'",
      "% of sentences discussing 'Record Keeping'",
      "Median Author PC4 Pre-resolution",
      "Median Author PC3 Pre-resolution",
      "# of comments prior to resolution",
      "Median # of Code Reviewers (Gerrit)",
      "Median LoC Changed (Gerrit)",
      "Weeks from deployment",
      "HTTPS-as-default (factor)",
      "HTTP-deprecation (factor)",
      "WMF-affiliate Author (factor)",
      "WMF-affiliate Author:HTTPS-as-default",
      "WMF-affiliate Author:HTTP-deprecation"
    )))
  )

# Coefficient plot ----

# Point estimates with +/- 1.96 * SE bars (normal-approximation 95%
# interval) and a dashed reference line at zero.
ggplot(coef_df, aes(x = estimate, y = term)) +
  geom_point(size = 1) +
  # geom_errorbar() has no `height` argument; with xmin/xmax the bars are
  # drawn horizontally and the cap size is set through `width`.
  geom_errorbar(
    aes(
      xmin = estimate - 1.96 * std.error,
      xmax = estimate + 1.96 * std.error
    ),
    width = 0.2
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "DSL Logit Model Coefficients",
    x = "Coefficient Estimate",
    y = "Variable"
  ) +
  theme_minimal()