# mw-lifecycle-analysis/dsl/dsl.R

library(tidyverse)
library(dsl)

# Load the DSL analysis frame and derive two convenience columns in one pass.
dsl_csv <- "~/dsl/120725_DSL_frame.csv"
dsl_df <- dplyr::mutate(
  read.csv(dsl_csv, header = TRUE),
  # presumably TTR is recorded in hours, so this yields days -- TODO confirm
  ttr_days = TTR / 24,
  # alias of the outcome column; used as the response in `dev_model`
  task_resolution = dsl_score
)

# Baseline DSL logit: the outcome is regressed on the 'Existent Problems'
# sentence share alone. The human-coded column is declared as the variable to
# be predicted and the matching olmo_* column is supplied as its prediction.
base_model <- dsl(
  data          = dsl_df,
  model         = "logit",
  formula       = dsl_score ~ human_EP_prop_adac,
  predicted_var = "human_EP_prop_adac",
  prediction    = "olmo_EP_prop_adac",
  sample_prob   = "sampling_prob"
)
summary(base_model)

# Same specification as the baseline model, with `source` added as a
# categorical covariate.
case_model <- dsl(
  data          = dsl_df,
  model         = "logit",
  formula       = dsl_score ~ human_EP_prop_adac + as.factor(source),
  predicted_var = "human_EP_prop_adac",
  prediction    = "olmo_EP_prop_adac",
  sample_prob   = "sampling_prob"
)
summary(case_model)

# Full DSL logit: all three human-coded sentence shares (EP, TSOL, RK) plus
# the control covariates, clustered on `source`.
logit_model <- dsl(
  data = dsl_df,
  model = "logit",
  formula = dsl_score ~
    human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac +
    week_index + as.factor(isAuthorWMF) +
    median_PC4_adac + median_PC3_adac + n_comments_before +
    as.factor(source) + median_gerrit_reviewers,
  # Each human_* column is paired, position by position, with an olmo_* column.
  predicted_var = c(
    "human_EP_prop_adac",
    "human_TSOL_prop_adac",
    "human_RK_prop_adac"
  ),
  prediction = c(
    "olmo_EP_prop_adac",
    "olmo_TSOL_prop_adac",
    "olmo_RK_prop_adac"
  ),
  sample_prob = "sampling_prob",
  cluster = "source",
  cross_fit = 3,
  sample_split = 20
)
summary(logit_model)
#anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers)
#chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer))
# https://cscu.cornell.edu/wp-content/uploads/clust.pdf
# https://statmodeling.stat.columbia.edu/2020/01/10/linear-or-logistic-regression-with-binary-outcomes/
# https://osf.io/preprints/psyarxiv/4gmbv_v1
# Fixed-effects ("felm") DSL model: one-way fixed effects indexed by `source`,
# with standard errors clustered on the same column.
#
# The data argument previously pointed at `felm_df`, which is never created in
# this script, so a fresh run stopped with "object 'felm_df' not found". The
# model is now fit on `dsl_df`, the frame every other model here uses.
# NOTE(review): `phase` is not derived in this script either -- confirm the
# column is present in the CSV that `dsl_df` is read from.
felm_model <- dsl(
  model = "felm",
  formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac +
    human_RK_prop_adac + phase + median_PC4_adac + median_PC3_adac +
    n_comments_before + isAuthorWMF,
  predicted_var = c(
    "human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"
  ),
  prediction = c(
    "olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"
  ),
  sample_prob = "sampling_prob",
  fixed_effect = "oneway",
  index = c("source"),
  cluster = "source",
  cross_fit = 3,
  sample_split = 20,
  data = dsl_df
)
summary(felm_model)
# https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39


# Final ("dev") DSL logit used for the coefficient plot: the three human-coded
# sentence shares, the control covariates, and an isAuthorWMF x source
# interaction, clustered on `source`.
#
# The script used to refit this model on every run and then immediately
# overwrite the result with the cached RDS, which wasted the whole fit and
# crashed when the cache file was missing. Now the cached fit is loaded when
# it exists and the model is only estimated when it does not.
dev_model_rds <- "dsl/120725_logit_dsl.RDS"
if (file.exists(dev_model_rds)) {
  dev_model <- readRDS(dev_model_rds)
} else {
  dev_model <- dsl(
    model = "logit",
    formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac +
      human_RK_prop_adac +
      median_PC4_adac + median_PC3_adac + n_comments_before +
      median_gerrit_reviewers + median_gerrit_loc_delta +
      week_index + as.factor(isAuthorWMF) * as.factor(source),
    predicted_var = c(
      "human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"
    ),
    prediction = c(
      "olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"
    ),
    sample_prob = "sampling_prob",
    cluster = "source",
    cross_fit = 3,
    sample_split = 20,
    data = dsl_df
  )
  # Uncomment to refresh the cache after a refit:
  # saveRDS(dev_model, dev_model_rds)
}
#summary(dev_model)
library(broom)
library(dplyr)
# Turn a fitted dsl model into a broom-style coefficient table.
#
# Arguments:
#   x            fitted model; handed to dsl:::summary.dsl() as `object`.
#   conf.int     when TRUE, conf.low / conf.high columns are included.
#   conf.level   forwarded to summary.dsl() as its `ci` argument.
#   exponentiate when TRUE, the table is run through broom's internal
#                exponentiate() helper before being returned.
#   ...          forwarded to dsl:::summary.dsl().
#
# Returns a tibble with `term` (the row names of the summary table) followed by
# estimate, std.error, p.value and, if requested, conf.low and conf.high.
tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95,
                     exponentiate = FALSE, ...) {
  coef_table <- suppressMessages(
    dsl:::summary.dsl(object = x, ci = conf.level, ...)
  )

  # names = broom column names, values = column headers in the summary table
  wanted <- c(
    "estimate" = "Estimate",
    "std.error" = "Std. Error",
    "p.value" = "p value"
  )
  if (conf.int) {
    wanted <- c(wanted, "conf.low" = "CI Lower", "conf.high" = "CI Upper")
  }

  picked <- as.list(coef_table)[wanted]
  names(picked) <- names(wanted)

  tidied <- dplyr::bind_cols(
    term = row.names(coef_table),
    as_tibble(as.data.frame(picked))
  )

  if (exponentiate) {
    tidied <- broom:::exponentiate(tidied)
  }
  tidied
}
# Tidy the dev model and swap raw coefficient names for display labels.
# A single lookup vector, written in top-to-bottom display order, drives both
# the relabelling and the factor levels. The levels are reversed so that the
# first entry becomes the last level.
coef_df <- local({
  term_labels <- c(
    "(Intercept)" = "Intercept",
    "human_EP_prop_adac" = "% of sentences discussing 'Existent Problems'",
    "human_TSOL_prop_adac" = "% of sentences discussing 'Solutions'",
    "human_RK_prop_adac" = "% of sentences discussing 'Record Keeping'",
    "median_PC4_adac" = "Median Author PC4 Pre-resolution",
    "median_PC3_adac" = "Median Author PC3 Pre-resolution",
    "n_comments_before" = "# of comments prior to resolution",
    "median_gerrit_reviewers" = "Median # of Code Reviewers (Gerrit)",
    "median_gerrit_loc_delta" = "Median LoC Changed (Gerrit)",
    "week_index" = "Weeks from deployment",
    "as.factor(source)c2" = "HTTPS-as-default (factor)",
    "as.factor(source)c3" = "HTTP-deprecation (factor)",
    "as.factor(isAuthorWMF)TRUE" = "WMF-affiliate Author (factor)",
    "as.factor(isAuthorWMF)TRUE:as.factor(source)c2" =
      "WMF-affiliate Author:HTTPS-as-default",
    "as.factor(isAuthorWMF)TRUE:as.factor(source)c3" =
      "WMF-affiliate Author:HTTP-deprecation"
  )

  tidy.dsl(dev_model) |>
    mutate(
      term = recode(term, !!!term_labels),
      term = factor(term, levels = rev(unname(term_labels)))
    )
})
# Coefficient plot for the dev model: point estimates with +/- 1.96 * SE
# horizontal error bars and a dashed reference line at zero.
dsl_coefs <- ggplot(coef_df, aes(x = estimate, y = term)) +
  geom_point(size = 1) +
  # geom_errorbar() sizes its end caps with `width`, including when the bars
  # run horizontally (xmin/xmax mapped). The previous `height = 0.2` is not a
  # geom_errorbar() parameter, so the caps were drawn at the default size.
  geom_errorbar(
    aes(
      xmin = estimate - 1.96 * std.error,
      xmax = estimate + 1.96 * std.error
    ),
    width = 0.2
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "DSL Logit Model Coefficients",
    x = "Coefficient Estimate",
    y = "Variable"
  ) +
  theme_minimal()

# Write the figure to the working directory as a high-resolution PNG.
ggsave(
  filename = "120825_dsl_coefs.png",
  plot = dsl_coefs,
  width = 8,    # inches
  height = 6,   # inches
  dpi = 600     # high resolution
)