# Review notes (commentary only; the patch text below is unchanged).
#
# This is a git patch touching two files:
#   * dsl/dsl.R - removes the power_dsl() fit (power_model) and the "lm" fit
#     (style_model); adds dsl() "logit" fits base_model and case_model, widens
#     trial_model to three predicted variables (EP, TSOL, RK), and adds two
#     "felm" fits that are both assigned to felm_model.
#   * mgaughan-rstudio-server_31035935.out - deleted.
#
# NOTE(review): anova() is called on two data-frame columns
#   (dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers). stats::anova()
#   dispatches on fitted-model objects, so this call presumably errors or is
#   not the intended test - confirm (cor.test() or a fitted lm() may be meant).
# NOTE(review): that call reads olmo_RK_prop, while every other olmo_* column
#   in this patch ends in _adac - verify the column name.
# NOTE(review): c1_df is created but no later added line visible here uses it;
#   the first felm fit still passes data=dsl_df - confirm which was intended.
# NOTE(review): felm_model is assigned twice, so the second fit overwrites the
#   first; only their summary() output distinguishes them.
# NOTE(review): ttr_days = TTR / 24 presumes TTR is measured in hours - confirm.
# NOTE(review): the deleted .out file contains a username and password in
#   plain text. Deleting the file does not remove it from repository history;
#   presumably the credential should be treated as exposed - confirm.
diff --git a/dsl/dsl.R b/dsl/dsl.R index 0011926..9401812 100644 --- a/dsl/dsl.R +++ b/dsl/dsl.R @@ -4,51 +4,69 @@ library(dsl) dsl_csv <-"111725_DSL_frame.csv" dsl_df <- read.csv(dsl_csv, header = TRUE) - -power_model <- power_dsl( - labeled_size = c(100, 200, 300, 600, 1000), +base_model <- dsl( model = "logit", - formula = dsl_score ~ human_SOL_prop_adac + - median_gerrit_loc_delta + median_gerrit_reviewers + - as.factor(isAuthorWMF) + - as.factor(source) + - median_PC3_adac + - week_index, - predicted_var = "human_SOL_prop_adac", - prediction = "olmo_SOL_prop_adac", + formula = dsl_score ~ human_EP_prop_adac, + predicted_var = "human_EP_prop_adac", + prediction = "olmo_EP_prop_adac", sample_prob = "sampling_prob", data=dsl_df ) -summary(power_model) -plot(power_model, coef_name = "human_SOL_prop_adac") +summary(base_model) +case_model <- dsl( + model = "logit", + formula = dsl_score ~ human_EP_prop_adac + as.factor(source), + predicted_var = "human_EP_prop_adac", + prediction = "olmo_EP_prop_adac", + sample_prob = "sampling_prob", + data=dsl_df +) +summary(case_model) trial_model <- dsl( model = "logit", - formula = dsl_score ~ human_TSOL_prop_adac + - median_gerrit_loc_delta + median_gerrit_reviewers + - as.factor(isAuthorWMF) + - as.factor(author_closer) + - median_PC4_adac + - week_index + n_comments_before, - predicted_var = "human_TSOL_prop_adac", - prediction = "olmo_TSOL_prop_adac", + formula = dsl_score ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac + + as.factor(source) + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before, + predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), + prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), sample_prob = "sampling_prob", data=dsl_df ) summary(trial_model) -style_model <- dsl( - model = "lm", - formula = human_BE_prop ~ - median_PC1 + median_PC4 + - as.factor(isAuthorWMF) + - as.factor(author_closer) + - 
median_PC3 + - week_index, - predicted_var = "human_BE_prop", - prediction = "olmo_BE_prop", +anova(dsl_df$olmo_RK_prop, dsl_df$median_gerrit_reviewers) +chisq.test(table(dsl_df$isAuthorWMF, dsl_df$author_closer)) + +c1_df <- dsl_df |> + dplyr::filter(source=="c1") + +felm_model <- dsl( + model = "felm", + formula = TTR ~ human_EP_prop_adac + human_TSOL_prop_adac + human_RK_prop_adac + + week_index + as.factor(isAuthorWMF) + median_PC4_adac + n_comments_before, + predicted_var = c("human_EP_prop_adac", "human_TSOL_prop_adac", "human_RK_prop_adac"), + prediction = c("olmo_EP_prop_adac", "olmo_TSOL_prop_adac", "olmo_RK_prop_adac"), sample_prob = "sampling_prob", + fixed_effect = "oneway", + index = c("source"), + cluster="source", data=dsl_df ) -summary(style_model) +summary(felm_model) + +#https://github.com/naoki-egami/dsl/blob/537664a54163dda52ee277071fdfd9e8df2572a6/R/estimate_g.R#L39 +felm_df <- dsl_df |> + dplyr::mutate(ttr_days = TTR / 24) +felm_model <- dsl( + model = "felm", + formula = ttr_days ~ human_EP_prop_adac, + predicted_var = c("human_EP_prop_adac"), + prediction = c("olmo_EP_prop_adac"), + sample_prob = "sampling_prob", + fixed_effect = "oneway", + index = c("phase"), + cluster="phase", + data=felm_df +) +summary(felm_model) diff --git a/mgaughan-rstudio-server_31035935.out b/mgaughan-rstudio-server_31035935.out deleted file mode 100644 index 405be93..0000000 --- a/mgaughan-rstudio-server_31035935.out +++ /dev/null @@ -1,17 +0,0 @@ -1. SSH tunnel from your workstation using the following command: - - ssh -N -L 8787:n3439:35765 mjilg@klone.hyak.uw.edu - - and point your web browser to http://localhost:8787 - -2. log in to RStudio Server using the following credentials: - - user: mjilg - password: QKOjN5O9o8KE4QlK+t4M - -When done using RStudio Server, terminate the job by: - -1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) -2. Issue the following command on the login node: - - scancel -f 31035935