diff --git a/dsl/dsl.R b/dsl/dsl.R index b7400c6..4943e68 100644 --- a/dsl/dsl.R +++ b/dsl/dsl.R @@ -4,6 +4,10 @@ library(dsl) dsl_csv <-"111725_DSL_frame.csv" dsl_df <- read.csv(dsl_csv, header = TRUE) +dsl_df <- dsl_df |> + dplyr::mutate(ttr_days = TTR / 24) |> + dplyr::mutate(task_resolution = dsl_score) + base_model <- dsl( model = "logit", formula = dsl_score ~ human_EP_prop_adac, @@ -27,7 +31,8 @@ summary(case_model) logit_model <- dsl( model = "logit", formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac - + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before + as.factor(source), + + week_index + as.factor(isAuthorWMF) + median_PC4_adac + median_PC3_adac + n_comments_before + as.factor(source) + + median_gerrit_reviewers, predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), sample_prob = "sampling_prob", @@ -39,15 +44,14 @@ logit_model <- dsl( summary(logit_model) #anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers) #chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer)) -felm_df <- dsl_df |> - dplyr::mutate(ttr_days = TTR / 24) # https://cscu.cornell.edu/wp-content/uploads/clust.pdf # https://statmodeling.stat.columbia.edu/2020/01/10/linear-or-logistic-regression-with-binary-outcomes/ # https://osf.io/preprints/psyarxiv/4gmbv_v1 felm_model <- dsl( model = "felm", formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac + - week_index + median_PC4_adac + n_comments_before + + isAuthorWMF + median_gerrit_reviewers, + phase + + median_PC4_adac + median_PC3_adac + n_comments_before + + isAuthorWMF, predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), sample_prob = "sampling_prob", @@ -59,8 +63,25 @@ felm_model <- dsl( data=felm_df ) summary(felm_model) +#httpsfelm_model#https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39 + + +dev_model <- dsl( + model = "logit", + formula = task_resolution ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac + + median_PC4_adac + median_PC3_adac + n_comments_before + + median_gerrit_reviewers + median_gerrit_loc_delta + + week_index + as.factor(source) * as.factor(isAuthorWMF), + predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), + prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), + sample_prob = "sampling_prob", + cluster="source", + cross_fit = 3, + sample_split = 20, + data=dsl_df +) +summary(dev_model) -#https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39 library(broom) library(dplyr) @@ -79,7 +100,7 @@ tidy.dsl <- function(x, conf.int = FALSE, conf.level = 0.95, exponentiate = FALS out <- broom:::exponentiate(out) return(out) } -coef_df <- tidy.dsl(felm_model) +coef_df <- tidy.dsl(dev_model) ggplot(coef_df, aes(x = estimate, y = term)) + geom_point(size = 1) + geom_errorbar(aes(xmin = estimate - 1.96*std.error, xmax = estimate + 1.96 *std.error), height = 0.2) +