
Make summarize.estimator group correctly for robustness checks.

Also fix a possible bug in the MI logic and simplify the error
correction formula in example 2.
Nathan TeBlunthuis 2023-02-11 12:26:48 -08:00
parent c45ea9dfeb
commit b8d2048cc5
5 changed files with 103 additions and 114 deletions
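For context on the commit message above (this sketch is not part of the commit): the "error correction formula" change in example 2 refers to the Makefile edit below that swaps the full-interaction proxy model `w_pred ~ y*z*x` for the additive `w_pred ~ y + z + x`. A minimal R illustration of what each formula expands to:

d <- data.frame(y = rnorm(4), z = rnorm(4), x = rbinom(4, 1, 0.5))
colnames(model.matrix(~ y * z * x, d))  ## "(Intercept)" "y" "z" "x" "y:z" "y:x" "z:x" "y:z:x"
colnames(model.matrix(~ y + z + x, d))  ## "(Intercept)" "y" "z" "x"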

Makefile

@@ -0,0 +1,3 @@
+all:
+$(MAKE) -C simulations
+$(MAKE) -C civil_comments


@@ -36,7 +36,7 @@ example_1.feather: example_1_jobs
sbatch --wait --verbose --array=4001-$(shell cat example_1_jobs | wc -l) run_simulation.sbatch 0 example_1_jobs
example_2_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py pl_methods.R
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y*z*x"]}' --outfile example_2_jobs
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+z+x"]}' --outfile example_2_jobs
example_2.feather: example_2_jobs
rm -f example_2.feather
@@ -66,10 +66,10 @@ example_3.feather: example_3_jobs
sbatch --wait --verbose --array=4001-$(shell cat example_3_jobs | wc -l) run_simulation.sbatch 0 example_3_jobs
example_4_jobs: 04_depvar_differential.R simulation_base.R grid_sweep.py pl_methods.R
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.5]}' --outfile example_4_jobs
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.3], "prediction_accuracy":[0.73]}' --outfile example_4_jobs
example_4.feather: example_4_jobs
rm -f example_4.feather
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_4_jobs
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_4_jobs
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_4_jobs
@@ -86,41 +86,6 @@ remembr.RDS:example_1.feather example_2.feather example_3.feather example_4.feat
${srun} Rscript plot_dv_example.R --infile example_4.feather --name "plot.df.example.4"
irr_Ns = [1000]
irr_ms = [150,300,600]
irr_seeds=${seeds}
irr_explained_variances=${explained_variances}
irr_coder_accuracy=[0.80]
example_5_jobs: 05_irr_indep.R irr_simulation_base.R grid_sweep.py pl_methods.R measerr_methods.R
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 05_irr_indep.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_5.feather"], "y_explained_variance":${irr_explained_variances}, "coder_accuracy":${irr_coder_accuracy}}' --outfile example_5_jobs
example_5.feather:example_5_jobs
rm -f example_5.feather
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_5_jobs
sbatch --wait --verbose --array=1001-$(shell cat example_5_jobs | wc -l) run_simulation.sbatch 1000 example_5_jobs
# sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 2000 example_5_jobs
# sbatch --wait --verbose --array=3001-4000 run_simulation.sbatch 3000 example_5_jobs
# sbatch --wait --verbose --array=2001-$(shell cat example_5_jobs | wc -l) run_simulation.sbatch 4000 example_5_jobs
# example_6_jobs: 06_irr_dv.R irr_dv_simulation_base.R grid_sweep.py pl_methods.R
# sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 06_irr_dv.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_6.feather"], "y_explained_variance":${irr_explained_variances},"coder_accuracy":${irr_coder_accuracy}}' --outfile example_6_jobs
# example_6.feather:example_6_jobs
# rm -f example_6.feather
# sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_6_jobs
# sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 1000 example_6_jobs
# sbatch --wait --verbose --array=2001-$(shell cat example_6_jobs | wc -l) run_simulation.sbatch 2000 example_6_jobs
remember_irr.RDS: example_5.feather plot_irr_example.R plot_irr_dv_example.R summarize_estimator.R
rm -f remember_irr.RDS
sbatch --wait --verbose run_job.sbatch Rscript plot_irr_example.R --infile example_5.feather --name "plot.df.example.5"
# sbatch --wait --verbose run_job.sbatch Rscript plot_irr_dv_example.R --infile example_6.feather --name "plot.df.example.6"
robustness_1_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_1.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~1"]}' --outfile robustness_1_jobs
@@ -210,7 +175,7 @@ robustness_2_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.
robustness_2_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R grid_sweep.py
rm -f $@
${srun} $< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
${srun} $< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
robustness_2_dv.feather: robustness_2_dv_jobs_p1 robustness_2_dv_jobs_p2 robustness_2_dv_jobs_p3 robustness_2_dv_jobs_p4
$(eval END_1!=cat robustness_2_dv_jobs_p1 | wc -l)
@@ -361,8 +326,22 @@ robustness_4_dv.RDS: plot_dv_example.R robustness_4_dv.feather
rm -f $@
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_4" --remember-file $@
clean_main:
rm -f remembr.RDS
rm -f example_1_jobs
rm -f example_2_jobs
rm -f example_3_jobs
rm -f example_4_jobs
rm -f example_1.feather
rm -f example_2.feather
rm -f example_3.feather
rm -f example_4.feather
#
clean:
clean_all:
rm *.feather
rm -f remembr.RDS
rm -f remembr*.RDS


@@ -2,11 +2,10 @@
Tests how robust the MLE method for independent variables with differential error is when the model for $X$ is less precise. In the main paper, we include $Z$ on the right-hand side of the `truth_formula`.
In this robustness check, the `truth_formula` is an intercept-only model.
The stats are in the list named `robustness_1` in the `.RDS` file.
The stats are in the list named `robustness_1` in the `.RDS`
# robustness\_1\_dv.RDS
Like `robustness_1.RDS` but with a less precise model for `w_pred`. In the main paper, we included $Z$ in the `outcome_formula`. In this robustness check, we do not.
Like `robustness_1.RDS` but with a less precise model for `w_pred`. In the main paper, we included $Z$ in the `proxy_formula`. In this robustness check, we do not.
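A minimal sketch of what these two checks amount to in the `measerr_mle` call used by the simulation code (formula values taken from the robustness_1 Makefile job above; the main-paper truth model `x ~ z` is inferred from the text, so treat the exact arguments as illustrative):

## robustness_1: intercept-only truth model, per the Makefile job above.
mod <- measerr_mle(df,
                   outcome_formula = y ~ x + z,
                   proxy_formula   = w_pred ~ y + x,
                   truth_formula   = x ~ 1)
## The main-paper runs instead include Z in the truth model (e.g., truth_formula = x ~ z);
## robustness_1_dv analogously drops Z from the proxy model for w_pred.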
# robustness_2.RDS


@@ -280,79 +280,83 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NUL
Bxy.ci.lower.naive = naive.ci.Bxy[1],
Bzy.ci.upper.naive = naive.ci.Bzy[2],
Bzy.ci.lower.naive = naive.ci.Bzy[1]))
amelia_result <- list(
Bxy.est.amelia.full = NULL,
Bxy.ci.upper.amelia.full = NULL,
Bxy.ci.lower.amelia.full = NULL,
Bzy.est.amelia.full = NULL,
Bzy.ci.upper.amelia.full = NULL,
Bzy.ci.lower.amelia.full = NULL
)
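## If Amelia or the pooled regression below fails, keep the NULL placeholders
## above rather than aborting the whole simulation run.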
tryCatch({
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
(coefse <- combine_coef_se(mod.amelia.k))
est.x.mi <- coefse['x.obs','Estimate']
est.x.se <- coefse['x.obs','Std.Error']
est.z.mi <- coefse['z','Estimate']
est.z.se <- coefse['z','Std.Error']
amelia_result <- list(Bxy.est.amelia.full = est.x.mi,
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se,
Bzy.est.amelia.full = est.z.mi,
Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
)
},
error = function(e){
result[['error']] <- e}
)
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
(coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
est.x.mi <- coefse['x.obs','Estimate']
est.x.se <- coefse['x.obs','Std.Error']
result <- append(result,
list(Bxy.est.amelia.full = est.x.mi,
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
))
est.z.mi <- coefse['z','Estimate']
est.z.se <- coefse['z','Std.Error']
result <- append(result,
list(Bzy.est.amelia.full = est.z.mi,
Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
))
result <- append(result, amelia_result)
mle_result <- list(Bxy.est.mle = NULL,
Bxy.ci.upper.mle = NULL,
Bxy.ci.lower.mle = NULL,
Bzy.est.mle = NULL,
Bzy.ci.upper.mle = NULL,
Bzy.ci.lower.mle = NULL)
tryCatch({
temp.df <- copy(df)
temp.df <- temp.df[,x:=x.obs]
mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
## tryCatch({
## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient')
## (mod.calibrated.mle)
## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
## result <- append(result, list(
## Bxy.est.mecor = mecor.ci['Estimate'],
## Bxy.ci.upper.mecor = mecor.ci['UCI'],
## Bxy.ci.lower.mecor = mecor.ci['LCI'])
## )
fischer.info <- NA
ci.upper <- NA
ci.lower <- NA
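## Wald-type 95% intervals: invert the Hessian returned by measerr_mle to
## approximate the sampling variance of the ML estimates.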
tryCatch({fischer.info <- solve(mod.caroll.lik$hessian)
fischer.info <- solve(mod.caroll.lik$hessian)
coef <- mod.caroll.lik$par
ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
mle_result <- list(Bxy.est.mle = coef['x'],
Bxy.ci.upper.mle = ci.upper['x'],
Bxy.ci.lower.mle = ci.lower['x'],
Bzy.est.mle = coef['z'],
Bzy.ci.upper.mle = ci.upper['z'],
Bzy.ci.lower.mle = ci.lower['z'])
},
error=function(e) {result[['error']] <- as.character(e)
})
coef <- mod.caroll.lik$par
result <- append(result,
list(Bxy.est.mle = coef['x'],
Bxy.ci.upper.mle = ci.upper['x'],
Bxy.ci.lower.mle = ci.lower['x'],
Bzy.est.mle = coef['z'],
Bzy.ci.upper.mle = ci.upper['z'],
Bzy.ci.lower.mle = ci.lower['z']))
result <- append(result, mle_result)
mod.zhang.lik <- zhang.mle.iv(df)
coef <- coef(mod.zhang.lik)
ci <- confint(mod.zhang.lik,method='quad')
result <- append(result,
list(Bxy.est.zhang = coef['Bxy'],
Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
Bzy.est.zhang = coef['Bzy'],
Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
Bzy.ci.lower.zhang = ci['Bzy','2.5 %']))
## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms)


@@ -1,17 +1,21 @@
summarize.estimator <- function(df, suffix='naive', coefname='x'){
part <- df[,c('N',
'm',
'Bxy',
paste0('B',coefname,'y.est.',suffix),
paste0('B',coefname,'y.ci.lower.',suffix),
paste0('B',coefname,'y.ci.upper.',suffix),
'y_explained_variance',
'Bzx',
'Bzy',
'accuracy_imbalance_difference'
),
reported_vars <- c(
'Bxy',
paste0('B',coefname,'y.est.',suffix),
paste0('B',coefname,'y.ci.lower.',suffix),
paste0('B',coefname,'y.ci.upper.',suffix)
)
grouping_vars <- c('N','m','B0', 'Bxy', 'Bzy', 'Bzx', 'Px', 'y_explained_variance', 'prediction_accuracy','outcome_formula','proxy_formula','truth_formula','z_bias','y_bias')
grouping_vars <- grouping_vars[grouping_vars %in% names(df)]
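## Group only by the simulation parameters actually recorded in df; different
## simulation scripts store different subsets of these variables.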
part <- df[,
c(reported_vars,
grouping_vars),
with=FALSE]
@@ -27,8 +31,8 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){
part.plot <- part[, .(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T),
var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T),
est.upper.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.975,na.rm=T),
est.lower.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.025,na.rm=T),
mean.ci.upper = mean(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],na.rm=T),
@@ -43,7 +47,7 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){
variable=coefname,
method=suffix
),
by=c("N","m",'y_explained_variance','Bzx', 'Bzy', 'accuracy_imbalance_difference')
by=grouping_vars,
]
return(part.plot)