
Added, but didn't test the remaining robustness checks.

commit d8bc08f18f
parent 8ac33c14d7
Nathan TeBlunthuis 2022-12-11 22:46:30 -08:00
8 changed files with 264 additions and 27 deletions


@@ -30,11 +30,11 @@ source("simulation_base.R")
#### how much power do we get from the model in the first place? (sweeping N and m)
####
simulate_data <- function(N, m, B0=0, Bxy=0.2, Bzy=-0.2, Bzx=0.2, y_explained_variance=0.025, prediction_accuracy=0.73, seed=1){
simulate_data <- function(N, m, B0=0, Bxy=0.2, Bzy=-0.2, Bzx=0.2, y_explained_variance=0.025, prediction_accuracy=0.73, Px=0.5, seed=1){
set.seed(seed)
z <- rnorm(N,sd=0.5)
# x.var.epsilon <- var(Bzx *z) * ((1-zx_explained_variance)/zx_explained_variance)
xprime <- Bzx * z #+ x.var.epsilon
xprime <- Bzx * z + qlogis(Px)
x <- rbinom(N,1,plogis(xprime))
y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bxy*x,Bzy*z)) * ((1-y_explained_variance)/y_explained_variance)
@@ -78,16 +78,18 @@ parser <- add_argument(parser, "--truth_formula", help='formula for the true var
parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
parser <- add_argument(parser, "--Bxy", help='Effect of x on y', default=0.3)
parser <- add_argument(parser, "--Px", help='Base rate of x', default=0.5)
args <- parse_args(parser)
B0 <- 0
Px <- args$Px
Bxy <- args$Bxy
Bzy <- args$Bzy
Bzx <- args$Bzx
df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, seed=args$seed + 500, y_explained_variance = args$y_explained_variance, prediction_accuracy=args$prediction_accuracy)
df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, Px, seed=args$seed + 500, y_explained_variance = args$y_explained_variance, prediction_accuracy=args$prediction_accuracy)
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=Bzx, 'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula))
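A quick way to see what the new `Px` argument does (a rough sketch for illustration, not part of the commit): with `z` centered at zero, the `qlogis(Px)` offset pushes the marginal rate of the binary `x` close to `Px`.

## Illustration only: qlogis(Px) acts as an intercept that sets the approximate base rate of x.
set.seed(1)
N <- 1e6; Bzx <- 0.3; Px <- 0.8
z <- rnorm(N, sd=0.5)
x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
mean(x) # ~0.80, slightly attenuated because plogis() is nonlinear in z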


@@ -31,11 +31,11 @@ source("simulation_base.R")
## one way to do it is by adding correlation to x.obs and y that isn't in w.
## in other words, the model is missing an important feature of x.obs that's related to y.
simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,accuracy_imbalance_difference=0.3){
simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,Px=0.5,accuracy_imbalance_difference=0.3){
set.seed(seed)
# make w and y dependent
z <- rnorm(N,sd=0.5)
x <- rbinom(N, 1, plogis(Bzx * z))
x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bzy*z,Bxy*x)) * ((1-y_explained_variance)/y_explained_variance)
y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
@@ -140,10 +140,12 @@ parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy va
parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5)
args <- parse_args(parser)
B0 <- 0
Px <- args$Px
Bxy <- args$Bxy
Bzy <- args$Bzy
Bzx <- args$Bzx
@@ -157,7 +159,7 @@ if(args$m < args$N){
## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
## plot(pc.df)
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=args$Bzx, 'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula))


@@ -76,12 +76,13 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.01)
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.01)
parser <- add_argument(parser, "--Bzx", help='coeffficient of z on x', default=-0.5)
parser <- add_argument(parser, "--B0", help='Base rate of y', default=0.5)
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y")
args <- parse_args(parser)
B0 <- 0
B0 <- args$B0
Bxy <- args$Bxy
Bzy <- args$Bzy
Bzx <- args$Bzx


@@ -31,14 +31,14 @@ source("simulation_base.R")
## one way to do it is by adding correlation to x.obs and y that isn't in w.
## in other words, the model is missing an important feature of x.obs that's related to y.
simulate_data <- function(N, m, B0, Bxy, Bzy, seed, prediction_accuracy=0.73, x_bias=-0.75){
simulate_data <- function(N, m, B0, Bxy, Bzy, Py, seed, prediction_accuracy=0.73, x_bias=-0.75){
set.seed(seed)
# make w and y dependent
z <- rbinom(N, 1, 0.5)
x <- rbinom(N, 1, 0.5)
ystar <- Bzy * z + Bxy * x + B0
ystar <- Bzy * z + Bxy * x + B0 + qlogis(Py)
y <- rbinom(N,1,plogis(ystar))
# glm(y ~ x + z, family="binomial")
@@ -77,6 +77,7 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th
parser <- add_argument(parser, "--x_bias", help='how is the classifier biased?', default=0.75)
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.3)
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.3)
parser <- add_argument(parser, "--Py", help='Base rate of y', default=0.5)
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*x")


@@ -76,12 +76,13 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th
parser <- add_argument(parser, "--z_bias", help='how is the classifier biased?', default=1.5)
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.1)
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.1)
parser <- add_argument(parser, "--B0", help='coeffficient of z on y', default=-0.1)
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y+z")
args <- parse_args(parser)
B0 <- 0
B0 <- args$B0
Bxy <- args$Bxy
Bzy <- args$Bzy


@@ -148,21 +148,204 @@ robustness_1_dv.feather: robustness_1_dv_jobs
robustness_1_dv.RDS: robustness_1_dv.feather
rm -f $@
${srun} Rscript plot_dv_example.R --infile $< --name "robustness_1_dv" --outfile $@
robustness_2_jobs: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.6,0.73,0.8,0.85,0.9,0.95]}' --outfile $@
START=1
END=$(shell cat robustness_2_jobs | wc -l)
STEP=1000
ITEMS=$(shell seq $(START) $(STEP) $(END))
robustness_2.feather: robustness_2_jobs
$(foreach item,$(ITEMS),sbatch --wait --verbose --array=$(shell expr $(item))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 $<)
robustness_2_jobs_p1: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.60,0.65]}' --outfile $@
robustness_2_jobs_p2: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.70,0.75]}' --outfile $@
robustness_2_jobs_p3: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.80,0.85]}' --outfile $@
robustness_2_jobs_p4: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
START=0
END_1=$(shell cat robustness_2_jobs_p1 | wc -l)
END_2=$(shell cat robustness_2_jobs_p2 | wc -l)
END_3=$(shell cat robustness_2_jobs_p3 | wc -l)
END_4=$(shell cat robustness_2_jobs_p4 | wc -l)
STEP=1000
ONE=1
ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
ITEMS_4=$(shell seq $(START) $(STEP) $(END_4))
robustness_2.feather: robustness_2_jobs_p1 robustness_2_jobs_p2 robustness_2_jobs_p3 robustness_2_jobs_p4
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p1)
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p2;)
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p3;)
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p4;)
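For reference, the batching arithmetic shared by these recipes (`START=0`, `STEP=1000`, `ONE=1`) carves each job file into 1000-task sbatch arrays. A quick check of the resulting ranges, sketched in R for a hypothetical job file with 2500 lines:

## Illustration only: array ranges implied by START=0, STEP=1000, ONE=1 and 2500 jobs.
starts <- seq(0, 2500, by=1000) # mirrors $(shell seq $(START) $(STEP) $(END_1))
cbind(lower=starts + 1, upper=starts + 1000)
## arrays submitted: 1-1000, 1001-2000, 2001-3000; the last range overshoots the job file,
## which is presumably harmless if run_simulation.sbatch skips task ids past its last line.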
robustness_2_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.60,0.65]}' --outfile $@
robustness_2_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.70,0.75]}' --outfile $@
robustness_2_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.80,0.85]}' --outfile $@
robustness_2_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
START=0
END_1=$(shell cat robustness_2_dv_jobs_p1 | wc -l)
END_2=$(shell cat robustness_2_dv_jobs_p2 | wc -l)
END_3=$(shell cat robustness_2_dv_jobs_p3 | wc -l)
END_4=$(shell cat robustness_2_dv_jobs_p4 | wc -l)
STEP=1000
ONE=1
ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
ITEMS_4=$(shell seq $(START) $(STEP) $(END_4))
robustness_2_dv.feather: robustness_2_dv_jobs_p1 robustness_2_dv_jobs_p2 robustness_2_dv_jobs_p3 robustness_2_dv_jobs_p4
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p1)
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p2;)
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p3;)
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p4;)
robustness_3_jobs_p1: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.5,0.6], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
robustness_3_jobs_p2: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.7,0.8], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
robustness_3_jobs_p3: grid_sweep.py 01_two_covariates.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"Px":[0.9,0.95], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
START=0
END_1=$(shell cat robustness_3_jobs_p1 | wc -l)
END_2=$(shell cat robustness_3_jobs_p2 | wc -l)
END_3=$(shell cat robustness_3_jobs_p3 | wc -l)
STEP=1000
ONE=1
ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
robustness_3.feather: robustness_3_jobs_p1 robustness_3_jobs_p2 robustness_3_jobs_p3
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p1)
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p2;)
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p3;)
robustness_3_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5,0.6], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
robustness_3_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.7,0.8], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
robustness_3_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.9,0.95], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
START=0
END_1=$(shell cat robustness_3_dv_jobs_p1 | wc -l)
END_2=$(shell cat robustness_3_dv_jobs_p2 | wc -l)
END_3=$(shell cat robustness_3_dv_jobs_p3 | wc -l)
STEP=1000
ONE=1
ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
robustness_3_dv.feather: robustness_3_dv_jobs_p1 robustness_3_dv_jobs_p2 robustness_3_dv_jobs_p3
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p1)
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p2;)
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p3;)
robustness_4_jobs_p1: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],"y_bias":[-1,-0.85]}' --outfile $@
robustness_4_jobs_p2: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85], "y_bias":[-0.70,-0.55]}' --outfile $@
robustness_4_jobs_p3: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],"y_bias":[-0.4,-0.25]}' --outfile $@
robustness_4_jobs_p4: grid_sweep.py 02_indep_differential.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85],"y_bias":[-0.1,0]}' --outfile $@
START=0
END_1=$(shell cat robustness_4_jobs_p1 | wc -l)
END_2=$(shell cat robustness_4_jobs_p2 | wc -l)
END_3=$(shell cat robustness_4_jobs_p3 | wc -l)
END_4=$(shell cat robustness_4_jobs_p4 | wc -l)
STEP=1000
ONE=1
ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
ITEMS_4=$(shell seq $(START) $(STEP) $(END_4))
robustness_4.feather: robustness_4_jobs_p1 robustness_4_jobs_p2 robustness_4_jobs_p3
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p1)
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p2;)
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p3;)
robustness_4_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],"z_bias":[0,0.1]}' --outfile $@
robustness_4_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3],"B0":[0.5], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],"z_bias":[0.25,0.4]}' --outfile $@
robustness_4_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.5], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],"z_bias":[0.55,0.7]}' --outfile $@
robustness_4_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "B0":[0.5], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],"z_bias":[0.85,1]}' --outfile $@
START=0
END_1=$(shell cat robustness_4_dv_jobs_p1 | wc -l)
END_2=$(shell cat robustness_4_dv_jobs_p2 | wc -l)
END_3=$(shell cat robustness_4_dv_jobs_p3 | wc -l)
STEP=1000
ONE=1
ITEMS_1=$(shell seq $(START) $(STEP) $(END_1))
ITEMS_2=$(shell seq $(START) $(STEP) $(END_2))
ITEMS_3=$(shell seq $(START) $(STEP) $(END_3))
robustness_4_dv.feather: robustness_4_dv_jobs_p1 robustness_4_dv_jobs_p2 robustness_4_dv_jobs_p3
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p1)
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p2;)
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p3;)
#
clean:


@@ -12,3 +12,29 @@ Like `robustness\_1.RDS` but with a less precise model for $w_pred$. In the mai
This is just example 1 with varying levels of classifier accuracy. This is just example 1 with varying levels of classifier accuracy.
# robustness_2_dv.RDS
Example 3 with varying levels of classifier accuracy.
# robustness_3.RDS
Example 1 with varying levels of skewness in the classified variable. The variable `Px` is the base rate of $X$ and controls the skewness of $X$.
It probably makes more sense to report the mean of $X$ instead of `Px` in the supplement.
# robustness_3_dv.RDS
Example 3 with varying levels of skewness in the classified variable. The variable `B0` is the intercept of the main model and controls the skewness of $Y$.
It probably makes more sense to report the mean of $Y$ instead of `B0` in the supplement.
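For intuition (an illustrative sketch, not from the repository): with these Bxy and Bzy settings the covariate effects roughly cancel on average, so the base rate of $Y$ is approximately `plogis(B0)`, and the swept intercepts map onto skewness roughly as follows.

## Illustration only: approximate base rate of y implied by the swept intercepts.
plogis(c(0.5, 0.6, 0.7, 0.8, 0.9, 0.95))
## 0.62 0.65 0.67 0.69 0.71 0.72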
# robustness_4.RDS
Example 2 with varying amounts of differential error. The variable `y_bias` controls the amount of differential error.
It probably makes more sense to report the correlation between $Y$ and $X-W$, or the difference in accuracy when $Y=1$ versus $Y=0$, in the supplement instead of `y_bias`.
# robustness_4_dv.RDS
Example 4 with varying amounts of bias. The variable `z_bias` controls the amount of differential error.
It probably makes more sense to report the correlation between $Z$ and $Y-W$, or the difference in accuracy when $Z=1$ versus $Z=0$, in the supplement instead of `z_bias`.
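The group-wise accuracy gaps suggested above are cheap to compute from any of the simulated data sets. A sketch, assuming a data.table `df` with the columns these scripts produce (`x`, `w_pred`, `y`, `z`); dichotomizing the continuous $Y$ at zero is an illustrative choice:

library(data.table)
## Illustration only: classifier accuracy by level of the variable driving differential error.
df[, .(accuracy = mean(w_pred == x)), by = .(y.pos = y > 0)] # for y_bias (robustness_4)
df[, .(accuracy = mean(w_pred == y)), by = z]                # for z_bias (robustness_4_dv)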


@@ -151,10 +151,10 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu
temp.df <- copy(df)
temp.df[,y:=y.obs]
mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula)
fisher.info <- solve(mod.caroll.lik$hessian)
fischer.info <- solve(mod.caroll.lik$hessian)
coef <- mod.caroll.lik$par
ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96
ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96
ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
result <- append(result,
list(Bxy.est.mle = coef['x'],
Bxy.ci.upper.mle = ci.upper['x'],
@@ -299,11 +299,32 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NUL
temp.df <- copy(df)
temp.df <- temp.df[,x:=x.obs]
mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
fisher.info <- solve(mod.caroll.lik$hessian)
coef <- mod.caroll.lik$par
ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96
ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96
## tryCatch({
## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient')
## (mod.calibrated.mle)
## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
## result <- append(result, list(
## Bxy.est.mecor = mecor.ci['Estimate'],
## Bxy.ci.upper.mecor = mecor.ci['UCI'],
## Bxy.ci.lower.mecor = mecor.ci['LCI'])
## )
coef <- mod.caroll.lik$par
fischer.info <- NA
ci.upper <- NA
ci.lower <- NA
tryCatch({fischer.info <- solve(mod.caroll.lik$hessian)
ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
},
error=function(e) {result[['error']] <- as.character(e)
})
result <- append(result,
list(Bxy.est.mle = coef['x'],
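The tryCatch added above guards the one step that can actually throw here: `solve()` fails when the numerical Hessian returned by the optimizer is singular. A minimal, self-contained sketch of the same Wald-interval recipe (illustration only, not code from the repository):

## Illustration only: Wald CIs from the Hessian of a negative log-likelihood fit with optim().
set.seed(1)
dat <- rnorm(100, mean=2, sd=1)
negll <- function(p) -sum(dnorm(dat, mean=p[1], sd=exp(p[2]), log=TRUE))
fit <- optim(c(0, 0), negll, hessian=TRUE)
se <- sqrt(diag(solve(fit$hessian))) # this solve() is the call that can fail
fit$par + 1.96 * se
fit$par - 1.96 * se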