# Fit (or load a cached) DSL logit model of task resolution, tidy its
# coefficient table, and save a coefficient plot.

library(tidyverse)
library(dsl)
library(broom)
library(dplyr)

# Data ----

dsl_csv <- "~/dsl/121625_DSL_frame.csv"
dsl_df <- read.csv(dsl_csv, header = TRUE)

dsl_df <- dsl_df |>
  dplyr::mutate(
    ttr_days = TTR_hours / 24,
    task_resolution = dsl_score
  )

# Model ----

# Single location for the cached fit. The fit is only recomputed when the
# cache is absent; delete the file to force a refit.
model_path <- "dsl/121625_logit_dsl.RDS"

if (file.exists(model_path)) {
  dev_model <- readRDS(model_path)
} else {
  dev_model <- dsl(
    model = "logit",
    formula = task_resolution ~
      human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac +
      median_PC4_adac + median_PC3_adac +
      n_comments_before + median_gerrit_reviewers +
      week_index + as.factor(isAuthorWMF) * as.factor(source),
    predicted_var = c(
      "human_EP_prop_adac",
      "human_TSOL_prop_adac",
      "human_RK_prop_adac"
    ),
    prediction = c(
      "olmo_EP_prop_adac",
      "olmo_TSOL_prop_adac",
      "olmo_RK_prop_adac"
    ),
    sample_prob = "sampling_prob",
    cluster = "source",
    cross_fit = 3,
    sample_split = 20,
    data = dsl_df
  )
  dir.create(dirname(model_path), showWarnings = FALSE, recursive = TRUE)
  saveRDS(dev_model, model_path)
}

summary(dev_model)

# Tidy method ----

#' Tidy the coefficient table of a `dsl` model
#'
#' Calls `dsl:::summary.dsl()` on `x` and reshapes the result into a tibble
#' with one row per term.
#'
#' @param x A fitted `dsl` model object.
#' @param conf.int If `TRUE`, also return `conf.low` / `conf.high`, taken
#'   from the summary's "CI Lower" / "CI Upper" columns.
#' @param conf.level Passed to `summary.dsl()` as its `ci` argument.
#' @param exponentiate If `TRUE`, the result is passed through
#'   `broom:::exponentiate()`.
#' @param ... Forwarded to `dsl:::summary.dsl()`.
#' @return A tibble with columns `term`, `estimate`, `std.error`, `p.value`
#'   and, when `conf.int = TRUE`, `conf.low` and `conf.high`.
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95,
                     exponentiate = FALSE, ...) {
  res <- suppressMessages(dsl:::summary.dsl(object = x, ci = conf.level, ...))
  terms <- row.names(res)

  # Names are the broom-style output columns; values are the column names
  # in the dsl summary table.
  cols <- c(
    "estimate" = "Estimate",
    "std.error" = "Std. Error",
    "p.value" = "p value"
  )
  if (conf.int) {
    cols <- c(cols, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
  }

  out <- as.list(res)[cols]
  names(out) <- names(cols)
  out <- as_tibble(as.data.frame(out))
  out <- dplyr::bind_cols(term = terms, out)

  if (exponentiate) {
    out <- broom:::exponentiate(out)
  }
  out
}

# Coefficient table ----

# Model term -> display label.
term_labels <- c(
  "week_index" = "Weeks from deployment",
  "(Intercept)" = "Intercept",
  "n_comments_before" = "# of comments prior to resolution",
  "median_PC4_adac" = "Median Author PC4 Pre-resolution",
  "median_PC3_adac" = "Median Author PC3 Pre-resolution",
  "median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
  "human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
  "human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'",
  "human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'",
  "as.factor(source)c3" = "HTTP-deprecation (factor)",
  "as.factor(source)c2" = "HTTPS-login (factor)",
  "as.factor(isAuthorWMF)TRUE" = "WMF-affiliated Author (factor)",
  "as.factor(isAuthorWMF)FALSE" = "Nonaffiliated Author (factor)",
  "as.factor(isAuthorWMF)FALSE:as.factor(source)c2" =
    "Nonaffiliated Author:HTTPS-login",
  "as.factor(isAuthorWMF)FALSE:as.factor(source)c3" =
    "Nonaffiliated Author:HTTP-deprecation",
  "as.factor(isAuthorWMF)TRUE:as.factor(source)c2" =
    "WMF-affiliated Author:HTTPS-login",
  "as.factor(isAuthorWMF)TRUE:as.factor(source)c3" =
    "WMF-affiliated Author:HTTP-deprecation"
)

# Top-to-bottom order of the labels on the plot's y axis.
term_order <- c(
  "Intercept",
  "% of sentences discussing 'Existent Problems'",
  "% of sentences discussing 'Solutions'",
  "% of sentences discussing 'Record Keeping'",
  "Median Author PC4 Pre-resolution",
  "Median Author PC3 Pre-resolution",
  "# of comments prior to resolution",
  "Median # of Code Reviewers (Gerrit)",
  "Weeks from deployment",
  "HTTPS-login (factor)",
  "HTTP-deprecation (factor)",
  "Nonaffiliated Author (factor)",
  "WMF-affiliated Author (factor)",
  "Nonaffiliated Author:HTTPS-login",
  "WMF-affiliated Author:HTTPS-login",
  "Nonaffiliated Author:HTTP-deprecation",
  "WMF-affiliated Author:HTTP-deprecation"
)

coef_df <- tidy.dsl(dev_model)
coef_df <- coef_df |>
  mutate(term = recode(term, !!!term_labels))

# Terms without a label would silently become NA in the factor below and
# disappear from the axis; surface them instead.
unmapped_terms <- setdiff(coef_df$term, term_order)
if (length(unmapped_terms) > 0) {
  warning(
    "Model terms without a display label: ",
    paste(unmapped_terms, collapse = ", "),
    call. = FALSE
  )
}

coef_df <- coef_df |>
  # Levels are reversed so the first entry of term_order is drawn at the top.
  mutate(term = factor(term, levels = rev(term_order)))

# Plot ----

dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
  geom_point(size = 1) +
  # Horizontal bars: the cap size of geom_errorbar() is `width`, applied
  # along y when orientation = "y" (`height` is not a parameter here).
  geom_errorbar(
    aes(
      xmin = estimate - 1.96 * std.error,
      xmax = estimate + 1.96 * std.error
    ),
    width = 0.2,
    orientation = "y"
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Log-odds Coefficient Estimate", y = "Variable") +
  theme_minimal()

dsl_coefs

ggsave(
  filename = "121625_dsl_coefs.png",
  plot = dsl_coefs,
  width = 6,  # inches
  height = 6, # inches
  dpi = 800   # high resolution
)