
add missing simulation code

Nathan TeBlunthuis 2022-06-14 20:35:50 -07:00
parent 6057688060
commit e41d11afb9
16 changed files with 2050 additions and 0 deletions

simulations/Makefile Normal file

@@ -0,0 +1,61 @@
SHELL=bash
Ns=[1000,10000,25000]
ms=[50, 100, 250, 500]
seeds=[$(shell seq -s, 1 250)]
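# grid_sweep.py is expected to emit one command per parameter combination into the jobs file;
# a hypothetical line (assuming it expands the Cartesian product of arg_dict) looks like:
#   Rscript example_1.R --N 1000 --m 50 --seed 1 --outfile example_1.feather
# Note: 3 Ns x 4 ms x 250 seeds = 3000 combinations, matching the --array=1-3000 ranges below.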
all:remembr.RDS
srun=srun -A comdata -p compute-bigmem --time=10:00:00 --mem 4G -c 1
example_1_jobs: example_1.R
	grid_sweep.py --command "Rscript example_1.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"]}' --outfile example_1_jobs

example_1.feather: example_1_jobs
	rm -f example_1.feather
	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_1_jobs
#	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_1_jobs

example_2_jobs: example_2.R
	grid_sweep.py --command "Rscript example_2.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"]}' --outfile example_2_jobs

example_2.feather: example_2_jobs
	rm -f example_2.feather
	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_jobs
#	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_jobs

example_2_B_jobs: example_2_B.R
	grid_sweep.py --command "Rscript example_2_B.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B.feather"]}' --outfile example_2_B_jobs

example_2_B.feather: example_2_B_jobs
	rm -f example_2_B.feather
	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_jobs

example_3_jobs: example_3.R
	grid_sweep.py --command "Rscript example_3.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_3.feather"]}' --outfile example_3_jobs

example_3.feather: example_3_jobs
	rm -f example_3.feather
	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_3_jobs

remembr.RDS: example_2_B.feather example_1.feather example_2.feather example_3.feather
	${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
	${srun} Rscript plot_example.R --infile example_2.feather --name "plot.df.example.2"
	${srun} Rscript plot_example.R --infile example_2_B.feather --name "plot.df.example.2B"
	${srun} Rscript plot_example.R --infile example_3.feather --name "plot.df.example.3"

# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_jobs
# example_2_B_mecor_jobs:
#	grid_sweep.py --command "Rscript example_2_B_mecor.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B_mecor.feather"]}' --outfile example_2_B_mecor_jobs
# example_2_B_mecor.feather: example_2_B_mecor.R example_2_B_mecor_jobs
#	rm -f example_2_B_mecor.feather
#	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_mecor_jobs
#	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_mecor_jobs

simulations/example_1.R Normal file

@@ -0,0 +1,203 @@
### EXAMPLE 1: demonstrates how measurement error can lead to a sign (Type S) error in a covariate
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
source("simulation_base.R")
## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k; k -> w; w, the model's output, is used to predict x
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
### The labels x are binary, but the model provides a continuous predictor
### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}
simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
set.seed(seed)
## the true value of x
g <- rbinom(N, 1, 0.5)
xprime <- Bgx * g + rnorm(N,0,1)
k <- Bkx*xprime + rnorm(N,0,3)
x <- as.integer(logistic(scale(xprime)) > 0.5)
y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0
df <- data.table(x=x,k=k,y=y,g=g)
if( m < N){
df <- df[sample(nrow(df), m), x.obs := x]
} else {
df <- df[, x.obs := x]
}
w.model <- glm(x ~ k,df, family=binomial(link='logit'))
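## note: predict() on a glm returns values on the link (log-odds) scale by default, so the noise and the 0.5 threshold below operate on that scale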
w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
## y = B0 + B1x + e
df[,':='(w=w, w_pred = as.integer(w>0.5))]
return(df)
}
parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=1000, help="number of observations of w")
parser <- add_argument(parser, "--m", default=100, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
args <- parse_args(parser)
B0 <- 0
Bxy <- 0.2
Bgy <- 0
Bkx <- 3.2
Bgx <- 0
df <- simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
outline <- run_simulation(df
,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
outfile_lock <- lock(paste0(args$outfile, '_lock'))
if(file.exists(args$outfile)){
logdata <- read_feather(args$outfile)
logdata <- rbind(logdata,as.data.table(outline))
} else {
logdata <- as.data.table(outline)
}
print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)
## for(N in Ns){
## print(N)
## for(m in ms){
## if(N>m){
## for(seed in seeds){
## rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bkx, seed)))
## }
## }
## }
## }
## run_simulation <- function(N, m, B0, Bxy, Bkx, seed){
## result <- list()
## df <- simulate_latent_cocause(N, m, B0, Bxy, Bkx, seed)
## result <- append(result, list(N=N,
## m=m,
## B0=B0,
## Bxy=Bxy,
## Bkx=Bkx,
## seed=seed))
## (correlation <- cor(df$w,df$x,method='spearman'))
## result <- append(result, list(correlation=correlation))
## (accuracy <- mean(df$x == df$w_pred))
## result <- append(result, list(accuracy=accuracy))
## (model.true <- lm(y ~ x, data=df))
## (cor(resid(model.true),df$w))
## true.ci.Bxy <- confint(model.true)['x',]
## result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
## Bxy.ci.upper.true = true.ci.Bxy[2],
## Bxy.ci.lower.true = true.ci.Bxy[1]))
## (model.naive <- lm(y~w, data=df))
## (model.feasible <- lm(y~x.obs,data=df))
## feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
## Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
## Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
## naive.ci.Bxy <- confint(model.naive)['w',]
## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
## Bxy.ci.upper.naive = naive.ci.Bxy[2],
## Bxy.ci.lower.naive = naive.ci.Bxy[1]))
## ## multiple imputation when k is observed
## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'), noms=c("x.obs"),lgstc=c('w'))
## mod.amelia.k <- zelig(y~x.obs, model='ls', data=amelia.out.k$imputations, cite=FALSE)
## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
## est.x.mi <- coefse['x.obs','Estimate']
## est.x.se <- coefse['x.obs','Std.Error']
## result <- append(result,
## list(Bxy.est.amelia.full = est.x.mi,
## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
## ))
## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"),noms=c("x.obs"),lgstc=c('w'))
## mod.amelia.nok <- zelig(y~x.obs, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
## est.x.mi <- coefse['x.obs','Estimate']
## est.x.se <- coefse['x.obs','Std.Error']
## result <- append(result,
## list(Bxy.est.amelia.nok = est.x.mi,
## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
## ))
## p <- v <- train <- rep(0,N)
## M <- m
## p[(M+1):(N)] <- 1
## v[1:(M)] <- 1
## df <- df[order(x.obs)]
## y <- df[,y]
## x <- df[,x.obs]
## w <- df[,w]
## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
## result <- append(result,
## list(Bxy.est.gmm = gmm.res$beta[1,1],
## Bxy.ci.upper.gmm = gmm.res$confint[1,2],
## Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs), df, B=400, method='efficient')
## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
## result <- append(result, list(
## Bxy.est.mecor = mecor.ci['Estimate'],
## Bxy.upper.mecor = mecor.ci['UCI'],
## Bxy.lower.mecor = mecor.ci['LCI'])
## )
## return(result)
## }

Binary file not shown.

simulations/example_2.R Normal file

@@ -0,0 +1,87 @@
### EXAMPLE 2: demonstrates how measurement error can lead to a sign (Type S) error in a covariate
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
amelia.ncpus=1)
## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k; k -> w; w, the model's output, is used to predict x
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
### The labels x are binary, but the model provides a continuous predictor
### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
source("simulation_base.R")
simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
set.seed(seed)
## the true value of x
g <- rbinom(N, 1, 0.5)
k <- rnorm(N, 0, 1)
xprime <- Bkx*k + Bgx * g + rnorm(N,0,1)
xvec <- scale(xprime)
y <- Bxy * xvec + Bgy * g + rnorm(N, 0, 1) + B0
df <- data.table(x=xvec,k=k,y=y,g=g)
names(df) <- c('x','k','y','g')
if( m < N){
df <- df[sample(nrow(df), m), x.obs := x]
} else {
df <- df[, x.obs := x]
}
w.model <- lm(x ~ k,df)
w <- predict(w.model,data.frame(k=k))
w <- logistic(w + rnorm(N,0,sd(w)*0.1))
## y = B0 + B1x + e
df[,':='(w=w, w_pred = as.integer(w>0.5))]
return(df)
}
parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=1000, help="number of observations of w")
parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
args <- parse_args(parser)
Ns <- c(1000, 10000, 1e6)
ms <- c(100, 250, 500, 1000)
B0 <- 0
Bxy <- 0.2
Bgy <- -0.2
Bkx <- 2
Bgx <- 3
outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
if(file.exists(args$outfile)){
logdata <- read_feather(args$outfile)
logdata <- rbind(logdata,as.data.table(outline))
} else {
logdata <- as.data.table(outline)
}
print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)

simulations/example_2_B.R Normal file

@@ -0,0 +1,276 @@
### EXAMPLE 2_B: demonstrates how measurement error can lead to a sign (Type S) error in a covariate
### This is the same as example 2, only instead of x->k we have k->x.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
amelia.ncpus=1)
source("simulation_base.R")
## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k; k -> w; w, the model's output, is used to predict x
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
### The labels x are binary, but the model provides a continuous predictor
### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}
simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
set.seed(seed)
## the true value of x
g <- rbinom(N, 1, 0.5)
xprime <- Bgx * g + rnorm(N,0,1)
k <- Bkx*xprime + rnorm(N,0,3)
x <- as.integer(logistic(scale(xprime)) > 0.5)
y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0
df <- data.table(x=x,k=k,y=y,g=g)
if( m < N){
df <- df[sample(nrow(df), m), x.obs := x]
} else {
df <- df[, x.obs := x]
}
w.model <- glm(x ~ k,df, family=binomial(link='logit'))
w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
## y = B0 + B1x + e
df[,':='(w=w, w_pred = as.integer(w>0.5))]
return(df)
}
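## Schennach-style two-stage sketch (defined but not called in this script): regress x.obs on w,
## then regress y on the fitted values plus g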
schennach <- function(df){
fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
df[,xstar_pred := predict(fwx, df)]
gxt <- lm(y ~ xstar_pred+g, df)
return(gxt)
}
parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
args <- parse_args(parser)
rows <- list()
B0 <- 0
Bxy <- 0.2
Bgy <- -0.2
Bkx <- 1
Bgx <- 3
outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
if(file.exists(args$outfile)){
logdata <- read_feather(args$outfile)
logdata <- rbind(logdata,as.data.table(outline))
} else {
logdata <- as.data.table(outline)
}
print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)
## Ns <- c(1e6, 5e4, 1000)
## ms <- c(100, 250, 500, 1000)
## seeds <- 1:500
## rowssets <- list()
## library(doParallel)
## options(mc.cores = parallel::detectCores())
## cl <- makeCluster(20)
## registerDoParallel(cl)
## ## library(future)
## ## plan(multiprocess,workers=40,gc=TRUE)
## for(N in Ns){
## print(N)
## for(m in ms){
## if(N>m){
## new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
## .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
## {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
## ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
## rowsets <- append(rowssets, list(data.table(new.rows)))
## }
## }
## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
## }
## ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w},
## packages=c('mecor','Amelia','Zelig','predictionError'),
## seed=TRUE)))
## df <- rbindlist(rowsets)
## write_feather(df,"example_2B_simulation.feather")
## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
## result <- list()
## df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
## result <- append(result, list(N=N,
## m=m,
## B0=B0,
## Bxy=Bxy,
## Bgy=Bgy,
## Bkx=Bkx,
## seed=seed))
## (accuracy <- df[,.(mean(w_pred==x))])
## result <- append(result, list(accuracy=accuracy))
## (model.true <- lm(y ~ x + g, data=df))
## true.ci.Bxy <- confint(model.true)['x',]
## true.ci.Bgy <- confint(model.true)['g',]
## result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
## Bgy.est.true=coef(model.true)['g'],
## Bxy.ci.upper.true = true.ci.Bxy[2],
## Bxy.ci.lower.true = true.ci.Bxy[1],
## Bgy.ci.upper.true = true.ci.Bgy[2],
## Bgy.ci.lower.true = true.ci.Bgy[1]))
## (model.feasible <- lm(y~x.obs+g,data=df))
## feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
## Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
## Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
## feasible.ci.Bgy <- confint(model.feasible)['g',]
## result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
## Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
## Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
## (model.naive <- lm(y~w+g, data=df))
## naive.ci.Bxy <- confint(model.naive)['w',]
## naive.ci.Bgy <- confint(model.naive)['g',]
## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
## Bgy.est.naive=coef(model.naive)['g'],
## Bxy.ci.upper.naive = naive.ci.Bxy[2],
## Bxy.ci.lower.naive = naive.ci.Bxy[1],
## Bgy.ci.upper.naive = naive.ci.Bgy[2],
## Bgy.ci.lower.naive = naive.ci.Bgy[1]))
## ## multiple imputation when k is observed
## ## amelia does great at this one.
## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
## mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
## est.x.mi <- coefse['x.obs','Estimate']
## est.x.se <- coefse['x.obs','Std.Error']
## result <- append(result,
## list(Bxy.est.amelia.full = est.x.mi,
## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
## ))
## est.g.mi <- coefse['g','Estimate']
## est.g.se <- coefse['g','Std.Error']
## result <- append(result,
## list(Bgy.est.amelia.full = est.g.mi,
## Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
## Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
## ))
## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
## est.x.mi <- coefse['x.obs','Estimate']
## est.x.se <- coefse['x.obs','Std.Error']
## result <- append(result,
## list(Bxy.est.amelia.nok = est.x.mi,
## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
## ))
## est.g.mi <- coefse['g','Estimate']
## est.g.se <- coefse['g','Std.Error']
## result <- append(result,
## list(Bgy.est.amelia.nok = est.g.mi,
## Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
## Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
## ))
## p <- v <- train <- rep(0,N)
## M <- m
## p[(M+1):(N)] <- 1
## v[1:(M)] <- 1
## df <- df[order(x.obs)]
## y <- df[,y]
## x <- df[,x.obs]
## g <- df[,g]
## w <- df[,w]
## # gmm gets pretty close
## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
## result <- append(result,
## list(Bxy.est.gmm = gmm.res$beta[1,1],
## Bxy.ci.upper.gmm = gmm.res$confint[1,2],
## Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
## result <- append(result,
## list(Bgy.est.gmm = gmm.res$beta[2,1],
## Bgy.ci.upper.gmm = gmm.res$confint[2,2],
## Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
## (mod.calibrated.mle)
## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
## result <- append(result, list(
## Bxy.est.mecor = mecor.ci['Estimate'],
## Bxy.upper.mecor = mecor.ci['UCI'],
## Bxy.lower.mecor = mecor.ci['LCI'])
## )
## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
## result <- append(result, list(
## Bxy.est.mecor = mecor.ci['Estimate'],
## Bxy.upper.mecor = mecor.ci['UCI'],
## Bxy.lower.mecor = mecor.ci['LCI'])
## )
## return(result)
## }

Binary file not shown.

simulations/example_2_B_mecor.R Normal file

@@ -0,0 +1,281 @@
### EXAMPLE 2_B: demonstrates how measurement error can lead to a sign (Type S) error in a covariate
### This is the same as example 2, only instead of x->k we have k->x.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
amelia.ncpus=1)
source("simulation_base_mecor.R")
## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k; k -> w; w, the model's output, is used to predict x
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
### The labels x are binary, but the model provides a continuous predictor
### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}
simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
set.seed(seed)
## the true value of x
g <- rbinom(N, 1, 0.5)
xprime <- Bgx * g + rnorm(N,0,1)
k <- Bkx*xprime + rnorm(N,0,3)
x <- as.integer(logistic(scale(xprime)) > 0.5)
y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
df <- data.table(x=x,k=k,y=y,g=g)
if( m < N){
df <- df[sample(nrow(df), m), x.obs := x]
} else {
df <- df[, x.obs := x]
}
w.model <- glm(x ~ k,df, family=binomial(link='logit'))
w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
## y = B0 + B1x + e
df[,':='(w=w, w_pred = as.integer(w>0.5))]
return(df)
}
schennach <- function(df){
fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
df[,xstar_pred := predict(fwx, df)]
gxt <- lm(y ~ xstar_pred+g, df)
return(gxt)
}
parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
args <- parse_args(parser)
rows <- list()
B0 <- 0
Bxy <- 0.4
Bgy <- 0.25
Bkx <- 3.2
Bgx <- -0.5
outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
outfile_lock <- lock(paste0(args$outfile, '_lock'))
if(file.exists(args$outfile)){
logdata <- read_feather(args$outfile)
logdata <- rbind(logdata,as.data.table(outline))
} else {
logdata <- as.data.table(outline)
}
print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)
## Ns <- c(1e6, 5e4, 1000)
## ms <- c(100, 250, 500, 1000)
## seeds <- 1:500
## rowssets <- list()
## library(doParallel)
## options(mc.cores = parallel::detectCores())
## cl <- makeCluster(20)
## registerDoParallel(cl)
## ## library(future)
## ## plan(multiprocess,workers=40,gc=TRUE)
## for(N in Ns){
## print(N)
## for(m in ms){
## if(N>m){
## new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
## .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
## {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
## ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
## rowsets <- append(rowssets, list(data.table(new.rows)))
## }
## }
## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
## }
## ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w},
## packages=c('mecor','Amelia','Zelig','predictionError'),
## seed=TRUE)))
## df <- rbindlist(rowsets)
## write_feather(df,"example_2B_simulation.feather")
## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
## result <- list()
## df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
## result <- append(result, list(N=N,
## m=m,
## B0=B0,
## Bxy=Bxy,
## Bgy=Bgy,
## Bkx=Bkx,
## seed=seed))
## (accuracy <- df[,.(mean(w_pred==x))])
## result <- append(result, list(accuracy=accuracy))
## (model.true <- lm(y ~ x + g, data=df))
## true.ci.Bxy <- confint(model.true)['x',]
## true.ci.Bgy <- confint(model.true)['g',]
## result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
## Bgy.est.true=coef(model.true)['g'],
## Bxy.ci.upper.true = true.ci.Bxy[2],
## Bxy.ci.lower.true = true.ci.Bxy[1],
## Bgy.ci.upper.true = true.ci.Bgy[2],
## Bgy.ci.lower.true = true.ci.Bgy[1]))
## (model.feasible <- lm(y~x.obs+g,data=df))
## feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
## Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
## Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
## feasible.ci.Bgy <- confint(model.feasible)['g',]
## result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
## Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
## Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
## (model.naive <- lm(y~w+g, data=df))
## naive.ci.Bxy <- confint(model.naive)['w',]
## naive.ci.Bgy <- confint(model.naive)['g',]
## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
## Bgy.est.naive=coef(model.naive)['g'],
## Bxy.ci.upper.naive = naive.ci.Bxy[2],
## Bxy.ci.lower.naive = naive.ci.Bxy[1],
## Bgy.ci.upper.naive = naive.ci.Bgy[2],
## Bgy.ci.lower.naive = naive.ci.Bgy[1]))
## ## multiple imputation when k is observed
## ## amelia does great at this one.
## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
## mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
## est.x.mi <- coefse['x.obs','Estimate']
## est.x.se <- coefse['x.obs','Std.Error']
## result <- append(result,
## list(Bxy.est.amelia.full = est.x.mi,
## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
## ))
## est.g.mi <- coefse['g','Estimate']
## est.g.se <- coefse['g','Std.Error']
## result <- append(result,
## list(Bgy.est.amelia.full = est.g.mi,
## Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
## Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
## ))
## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
## est.x.mi <- coefse['x.obs','Estimate']
## est.x.se <- coefse['x.obs','Std.Error']
## result <- append(result,
## list(Bxy.est.amelia.nok = est.x.mi,
## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
## ))
## est.g.mi <- coefse['g','Estimate']
## est.g.se <- coefse['g','Std.Error']
## result <- append(result,
## list(Bgy.est.amelia.nok = est.g.mi,
## Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
## Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
## ))
## p <- v <- train <- rep(0,N)
## M <- m
## p[(M+1):(N)] <- 1
## v[1:(M)] <- 1
## df <- df[order(x.obs)]
## y <- df[,y]
## x <- df[,x.obs]
## g <- df[,g]
## w <- df[,w]
## # gmm gets pretty close
## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
## result <- append(result,
## list(Bxy.est.gmm = gmm.res$beta[1,1],
## Bxy.ci.upper.gmm = gmm.res$confint[1,2],
## Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
## result <- append(result,
## list(Bgy.est.gmm = gmm.res$beta[2,1],
## Bgy.ci.upper.gmm = gmm.res$confint[2,2],
## Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
## (mod.calibrated.mle)
## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
## result <- append(result, list(
## Bxy.est.mecor = mecor.ci['Estimate'],
## Bxy.upper.mecor = mecor.ci['UCI'],
## Bxy.lower.mecor = mecor.ci['LCI'])
## )
## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
## result <- append(result, list(
## Bxy.est.mecor = mecor.ci['Estimate'],
## Bxy.upper.mecor = mecor.ci['UCI'],
## Bxy.lower.mecor = mecor.ci['LCI'])
## )
## return(result)
## }

(file name not shown) Normal file

@@ -0,0 +1,187 @@
### EXAMPLE 2: demonstrates how measurement error can lead to a sign (Type S) error in a covariate
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="multicore",
amelia.ncpus=40)
## SETUP:
### we want to estimate g -> y and x -> y; g is observed, x is MAR
### we have k -> x; g -> x; g->k; k is used to predict x via the model w.
### we have k -> w; x -> w; w is observed.
### for illustration, g is binary (e.g., gender==male).
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
### Whether a comment is "racial harassment" depends on context, such as the race of the person making the comment:
### e.g., a Black person saying "n-word" is less likely to be racial harassment than a white person doing so.
### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar), so our "ground truth" takes this into account.
### Our goal is to predict an outcome (say, that someone gets banned from the platform) as a function of whether they made a racially harassing comment and of their race.
### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}
simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
set.seed(seed)
## the true value of x
g <- rbinom(N, 1, 0.5)
k <- rnorm(N, 0, 1)
x <- Bkx*k + Bgx * g + rnorm(N,0,1)
w.model <- lm(x ~ k)
w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
## y = B0 + B1x + e
y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
df <- data.table(x=x,k=k,y=y,w=w,g=g)
if( m < N){
df <- df[sample(nrow(df), m), x.obs := x]
} else {
df <- df[, x.obs := x]
}
return(df)
}
run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
result <- list()
df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
result <- append(result, list(N=N,
m=m,
B0=B0,
Bxy=Bxy,
Bgy=Bgy,
Bkx=Bkx,
seed=seed))
correlation <- cor(df$w,df$x)
result <- append(result, list(correlation=correlation))
model.true <- lm(y ~ x + g, data=df)
true.ci.Bxy <- confint(model.true)['x',]
true.ci.Bgy <- confint(model.true)['g',]
result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
Bgy.est.true=coef(model.true)['g'],
Bxy.ci.upper.true = true.ci.Bxy[2],
Bxy.ci.lower.true = true.ci.Bxy[1],
Bgy.ci.upper.true = true.ci.Bgy[2],
Bgy.ci.lower.true = true.ci.Bgy[1]))
model.naive <- lm(y~w+g, data=df)
naive.ci.Bxy <- confint(model.naive)['w',]
naive.ci.Bgy <- confint(model.naive)['g',]
result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
Bgy.est.naive=coef(model.naive)['g'],
Bxy.ci.upper.naive = naive.ci.Bxy[2],
Bxy.ci.lower.naive = naive.ci.Bxy[1],
Bgy.ci.upper.naive = naive.ci.Bgy[2],
Bgy.ci.lower.naive = naive.ci.Bgy[1]))
## multiple imputation when k is observed
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'))
mod.amelia.k <- zelig(y~x.obs+g+k, model='ls', data=amelia.out.k$imputations, cite=FALSE)
coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
est.x.mi <- coefse['x.obs','Estimate']
est.x.se <- coefse['x.obs','Std.Error']
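## normal-approximation 95% confidence interval: MI-combined estimate +/- 1.96 * SE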
result <- append(result,
list(Bxy.est.amelia.full = est.x.mi,
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
))
est.g.mi <- coefse['g','Estimate']
est.g.se <- coefse['g','Std.Error']
result <- append(result,
list(Bgy.est.amelia.full = est.g.mi,
Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
))
## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"))
mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
est.x.mi <- coefse['x.obs','Estimate']
est.x.se <- coefse['x.obs','Std.Error']
result <- append(result,
list(Bxy.est.amelia.nok = est.x.mi,
Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
))
est.g.mi <- coefse['g','Estimate']
est.g.se <- coefse['g','Std.Error']
result <- append(result,
list(Bgy.est.amelia.nok = est.g.mi,
Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
))
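## split indicators for predicted_covariates (assumed semantics): after ordering, labeled rows come first;
## train on the first m/2 labeled rows, validate (v) on the next m/2, and predict (p) the unlabeled rest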
p <- v <- train <- rep(0,N)
M <- m
p[(M+1):(N)] <- 1
train[1:(M/2)] <- 1
v[(M/2+1):(M)] <- 1
df <- df[order(x.obs)]
y <- df[,y]
x <- df[,x.obs]
g <- df[,g]
w <- df[,w]
gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)
result <- append(result,
list(Bxy.est.gmm = gmm.res$beta[1,1],
Bxy.ci.upper.gmm = gmm.res$confint[1,2],
Bxy.ci.lower.gmm = gmm.res$confint[1,1],
Bgy.est.gmm = gmm.res$beta[2,1],
Bgy.ci.upper.gmm = gmm.res$confint[2,2],
Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
return(result)
}
Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
ms <- c(30, 50, 100, 200, 300, 500)
B0 <- 0
Bxy <- 1
Bgy <- 0.3
Bkx <- 3
Bgx <- -4
seeds <- 1:100
rows <- list()
for(N in Ns){
print(N)
for(m in ms){
if(N>m){
for(seed in seeds){
rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
}
}
}
}
result <- rbindlist(rows)
write_feather(result, "example_2_simulation_continuous.feather")

simulations/example_3.R Normal file

@@ -0,0 +1,144 @@
### EXAMPLE 3: demonstrates how measurement error can lead to a sign (Type S) error in a covariate
### What kind of data invalidates Fong + Tyler?
### This is the same as example 2, only instead of x->k we have k->x.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
amelia.ncpus=1)
setDTthreads(40)
source("simulation_base.R")
## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k; k -> w; w, the model's output, is used to predict x
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
### The labels x are binary, but the model provides a continuous predictor
### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
## One way to do this is by adding correlation between x.obs and y that isn't captured by w;
## in other words, the model is missing an important feature of x.obs that's related to y.
simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
set.seed(seed)
## the true value of x
g <- rbinom(N, 1, 0.5)
# make w and y dependent
u <- rnorm(N,0,Bxy)
xprime <- Bgx * g + rnorm(N,0,1)
k <- Bkx*xprime + rnorm(N,0,1.5) + 1.1*Bkx*u
x <- as.integer(logistic(scale(xprime)) > 0.5)
y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u
df <- data.table(x=x,k=k,y=y,g=g)
w.model <- glm(x ~ k,df, family=binomial(link='logit'))
if( m < N){
df <- df[sample(nrow(df), m), x.obs := x]
} else {
df <- df[, x.obs := x]
}
df[, x.obs := x.obs]
w <- predict(w.model, df) + rnorm(N, 0, 1)
## y = B0 + B1x + e
df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
return(df)
}
## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
## set.seed(seed)
## ## the true value of x
## g <- rbinom(N, 1, 0.5)
## # make w and y dependent
## u <- rnorm(N,0,5)
## xprime <- Bgx * g + rnorm(N,0,1)
## k <- Bkx*xprime + rnorm(N,0,3)
## x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)
## y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u
## df <- data.table(x=x,k=k,y=y,g=g)
## w.model <- glm(x ~ k, df, family=binomial(link='logit'))
## if( m < N){
## df <- df[sample(nrow(df), m), x.obs := x]
## } else {
## df <- df[, x.obs := x]
## }
## w <- predict(w.model,data.frame(k=k)) + u
## ## y = B0 + B1x + e
## df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
## return(df)
## }
schennach <- function(df){
fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
df[,xstar_pred := predict(fwx, df)]
gxt <- lm(y ~ xstar_pred+g, df)
return(gxt)
}
parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
args <- parse_args(parser)
B0 <- 0
Bxy <- 0.2
Bgy <- 0
Bkx <- 2
Bgx <- 0
outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
if(file.exists(args$outfile)){
logdata <- read_feather(args$outfile)
logdata <- rbind(logdata,as.data.table(outline))
} else {
logdata <- as.data.table(outline)
}
print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)

Binary file not shown.

(file name not shown) Normal file

@@ -0,0 +1,175 @@
source("RemembR/R/RemembeR.R")
library(arrow)
library(data.table)
library(ggplot2)
df <- data.table(read_feather("example_1.feather"))
x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
bias = Bxy - Bxy.est.naive,
sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.estimate=mean(Bxy.est.naive),
var.estimate=var(Bxy.est.naive),
Bxy=mean(Bxy),
variable='x',
method='Naive'
),
by=c('N','m')]
x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
bias = Bxy.est.true - Bxy.est.amelia.full,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Multiple imputation'
),
by=c('N','m')]
g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
bias = Bgy.est.amelia.full - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Multiple imputation'
),
by=c('N','m')]
x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
bias = Bxy.est.amelia.nok - Bxy.est.true,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Multiple imputation (Classifier features unobserved)'
),
by=c('N','m')]
g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
bias = Bgy.est.amelia.nok - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Multiple imputation (Classifier features unobserved)'
),
by=c('N','m')]
x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
bias = Bxy.est.mecor - Bxy.est.true,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Regression Calibration'
),
by=c('N','m')]
g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
bias = Bgy.est.mecor - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Regression Calibration'
),
by=c('N','m')]
## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]
## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
## zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
## bias = Bgy.est.mecor - Bgy.est.true,
## sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
## mean.bias = mean(bias),
## p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
## variable='g',
## method='Regression Calibration'
## ),
## by=c('N','m')]
x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
bias = Bxy.est.gmm - Bxy.est.true,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='2SLS+gmm'
),
by=c('N','m')]
g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
bias = Bgy.est.gmm - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='2SLS+gmm'
),
by=c('N','m')]
## g.naive.plot is referenced below but its construction is folded out of this view;
## reconstructed to mirror the x.naive block above and the sibling plotting script.
g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
bias = Bgy - Bgy.est.naive,
sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Naive'
),
by=c('N','m')]
plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot, x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot))
remember(plot.df,'example.1.plot.df')
ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
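## The two plots lay out the (m, N) grid as points colored by the rate of confidently sign-correct
## estimates and by absolute mean bias, faceted by variable and method.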
ests <- df[,.(Bxy.est.true = mean(Bxy.est.true),
Bxy.est.naive = mean(Bxy.est.naive),
Bxy.est.feasible = mean(Bxy.est.feasible),
Bxy.est.amelia.full = mean(Bxy.est.amelia.full),
Bxy.est.amelia.nok = mean(Bxy.est.amelia.nok),
Bxy.est.mecor = mean(Bxy.est.mecor),
Bxy.est.gmm = mean(Bxy.est.gmm)),
by=c("N","m")]

(file name not shown) Normal file

@@ -0,0 +1,102 @@
library(arrow)
library(data.table)
library(ggplot2)
df <- data.table(read_feather("example_2_simulation.feather"))
x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
bias = Bxy - Bxy.est.naive,
sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Naive'
),
by=c('N','m')]
g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
bias = Bgy - Bgy.est.naive,
sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Naive'
),
by=c('N','m')]
## (reconstructed from the folded view) select the columns for the full-imputation summary of x, as in the sibling script
x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
bias = Bxy.est.true - Bxy.est.amelia.full,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Multiple imputation'
),
by=c('N','m')]
g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
bias = Bgy.est.amelia.full - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Multiple imputation'
),
by=c('N','m')]
x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
bias = Bxy.est.amelia.nok - Bxy.est.true,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Multiple imputation (Classifier features unobserved)'
),
by=c('N','m')]
g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
bias = Bgy.est.amelia.nok - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Multiple imputation (Classifier features unobserved)'
),
by=c('N','m')]
plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))
ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")

simulations/plot_example.R Normal file

@@ -0,0 +1,277 @@
source("RemembR/R/RemembeR.R")
library(arrow)
library(data.table)
library(ggplot2)
library(filelock)
library(argparser)
l <- filelock::lock("example_2_B.feather_lock",exclusive=FALSE)
df <- data.table(read_feather("example_2_B.feather"))
filelock::unlock(l)
parser <- arg_parser("Simulate data and fit corrected models.")
parser <- add_argument(parser, "--infile", default="", help="name of the file to read.")
parser <- add_argument(parser, "--name", default="", help="The name to safe the data to in the remember file.")
args <- parse_args(parser)
build_plot_dataset <- function(df){
x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
bias = Bxy - Bxy.est.naive,
Bxy.est.naive = Bxy.est.naive,
sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
mean.est = mean(Bxy.est.naive),
var.est = var(Bxy.est.naive),
N.sims = .N,
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Naive'
),
by=c('N','m')]
g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
bias = Bgy - Bgy.est.naive,
Bgy.est.naive = Bgy.est.naive,
sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
mean.est = mean(Bgy.est.naive),
var.est = var(Bgy.est.naive),
N.sims = .N,
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Naive'
),
by=c('N','m')]
x.feasible <- df[,.(N, m, Bxy, Bxy.est.feasible, Bxy.ci.lower.feasible, Bxy.ci.upper.feasible)]
x.feasible <- x.feasible[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.feasible) & (Bxy <= Bxy.ci.upper.feasible)),
zero.in.ci = (0 >= Bxy.ci.lower.feasible) & (0 <= Bxy.ci.upper.feasible),
bias = Bxy - Bxy.est.feasible,
Bxy.est.feasible = Bxy.est.feasible,
sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.feasible)))]
x.feasible.plot <- x.feasible[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
mean.est = mean(Bxy.est.feasible),
var.est = var(Bxy.est.feasible),
N.sims = .N,
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Feasible'
),
by=c('N','m')]
g.feasible <- df[,.(N, m, Bgy, Bgy.est.feasible, Bgy.ci.lower.feasible, Bgy.ci.upper.feasible)]
g.feasible <- g.feasible[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.feasible) & (Bgy <= Bgy.ci.upper.feasible)),
zero.in.ci = (0 >= Bgy.ci.lower.feasible) & (0 <= Bgy.ci.upper.feasible),
bias = Bgy - Bgy.est.feasible,
Bgy.est.feasible = Bgy.est.feasible,
sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.feasible)))]
g.feasible.plot <- g.feasible[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
mean.est = mean(Bgy.est.feasible),
var.est = var(Bgy.est.feasible),
N.sims = .N,
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Feasible'
),
by=c('N','m')]
x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
bias = Bxy.est.true - Bxy.est.amelia.full,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.est = mean(Bxy.est.amelia.full),
var.est = var(Bxy.est.amelia.full),
N.sims = .N,
variable='x',
method='Multiple imputation'
),
by=c('N','m')]
g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
bias = Bgy.est.amelia.full - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.est = mean(Bgy.est.amelia.full),
var.est = var(Bgy.est.amelia.full),
N.sims = .N,
variable='g',
method='Multiple imputation'
),
by=c('N','m')]
x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
bias = Bxy.est.amelia.nok - Bxy.est.true,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.est = mean(Bxy.est.amelia.nok),
var.est = var(Bxy.est.amelia.nok),
N.sims = .N,
variable='x',
method='Multiple imputation (Classifier features unobserved)'
),
by=c('N','m')]
g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
bias = Bgy.est.amelia.nok - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.est = mean(Bgy.est.amelia.nok),
var.est = var(Bgy.est.amelia.nok),
N.sims = .N,
variable='g',
method="Multiple imputation (Classifier features unobserved)"
),
by=c('N','m')]
x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
bias = Bxy.est.mecor - Bxy.est.true,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.est = mean(Bxy.est.mecor),
var.est = var(Bxy.est.mecor),
N.sims = .N,
variable='x',
method='Regression Calibration'
),
by=c('N','m')]
g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
bias = Bgy.est.mecor - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.est = mean(Bgy.est.mecor),
var.est = var(Bgy.est.mecor),
N.sims = .N,
variable='g',
method='Regression Calibration'
),
by=c('N','m')]
## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]
## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
## zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
## bias = Bgy.est.mecor - Bgy.est.true,
## sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
## mean.bias = mean(bias),
## p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
## variable='g',
## method='Regression Calibration'
## ),
## by=c('N','m')]
x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
bias = Bxy.est.gmm - Bxy.est.true,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.est = mean(Bxy.est.gmm),
var.est = var(Bxy.est.gmm),
N.sims = .N,
variable='x',
method='2SLS+gmm'
),
by=c('N','m')]
g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
bias = Bgy.est.gmm - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
mean.est = mean(Bgy.est.gmm),
var.est = var(Bgy.est.gmm),
N.sims = .N,
variable='g',
method='2SLS+gmm'
),
by=c('N','m')]
accuracy <- df[,mean(accuracy)]
plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot, x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot, x.feasible.plot, g.feasible.plot),use.names=T)
plot.df[,accuracy := accuracy]
return(plot.df)
}
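## script entry point: load the simulation results, build the summary table,
## and cache it. remember() is assumed here to come from the author's remembr
## workflow, storing plot.df under args$name for the downstream build.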
df <- data.table(read_feather(args$infile))
plot.df <- build_plot_dataset(df)
remember(plot.df, args$name)
## df[gmm.ER_pval<0.05]
## # plot.df <- plot.df[,":="(sd.est=sqrt(var.est)/N.sims)]
## ggplot(plot.df[variable=='x'], aes(y=mean.est, ymax=mean.est + var.est/2, ymin=mean.est-var.est/2, x=method)) + geom_pointrange() + facet_grid(-m~N) + scale_x_discrete(labels=label_wrap_gen(10))
## ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
## ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
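## ------------------------------------------------------------------
## The per-method blocks above are nearly identical. A minimal sketch of a
## helper that could replace them (hypothetical, not part of this commit; it
## assumes the B<coef>y.est.true / .est.<suffix> / .ci.lower.<suffix> /
## .ci.upper.<suffix> column-naming pattern used above, and the
## estimate-minus-truth bias convention):
summarize.estimator <- function(df, suffix, coefname='x', method.name=suffix){
    cols <- paste0('B', coefname, 'y', c('.est.true',
                                         paste0('.est.', suffix),
                                         paste0('.ci.lower.', suffix),
                                         paste0('.ci.upper.', suffix)))
    part <- df[, c('N', 'm', cols), with=FALSE]
    setnames(part, c('N', 'm', 'est.true', 'est', 'ci.lower', 'ci.upper'))
    part <- part[,':='(true.in.ci = (est.true >= ci.lower) & (est.true <= ci.upper),
                       zero.in.ci = (0 >= ci.lower) & (0 <= ci.upper),
                       bias = est - est.true,
                       sign.correct = sign(est.true) == sign(est))]
    part[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
            mean.bias = mean(bias),
            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
            mean.est = mean(est),
            var.est = var(est),
            N.sims = .N,
            variable = coefname,
            method = method.name),
         by=c('N','m')]
}
## e.g.: x.gmm.plot <- summarize.estimator(df, 'gmm', 'x', '2SLS+gmm')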

View File

@ -0,0 +1,102 @@
library(arrow)
library(data.table)
library(ggplot2)
df <- data.table(read_feather("example_2_simulation.feather"))
x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
                         bias = Bxy.est.naive - Bxy,
sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Naive'
),
by=c('N','m')]
g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
                         bias = Bgy.est.naive - Bgy,
sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Naive'
),
by=c('N','m')]
x.amelia.full <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.full, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full)]
x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
                                     bias = Bxy.est.amelia.full - Bxy.est.true,
                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Multiple imputation'
),
by=c('N','m')]
g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
bias = Bgy.est.amelia.full - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Multiple imputation'
),
by=c('N','m')]
x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
bias = Bxy.est.amelia.nok - Bxy.est.true,
sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='x',
method='Multiple imputation (Classifier features unobserved)'
),
by=c('N','m')]
g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
bias = Bgy.est.amelia.nok - Bgy.est.true,
sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
mean.bias = mean(bias),
p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
variable='g',
method='Multiple imputation (Classifier features unobserved)'
),
by=c('N','m')]
plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))
ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
ggplot(plot.df,aes(y=N,x=m,color=mean.bias)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")

View File

@ -0,0 +1,155 @@
library(predictionError)
library(mecor)
options(amelia.parallel="no",
amelia.ncpus=1)
library(Amelia)
library(Zelig)
library(data.table)
logistic <- function(x) {1/(1+exp(-1*x))}
run_simulation <- function(df, result){
accuracy <- df[,mean(w_pred==x)]
result <- append(result, list(accuracy=accuracy))
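## each estimator below appends its point estimate and a 95% CI to `result`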
(model.true <- lm(y ~ x + g, data=df))
true.ci.Bxy <- confint(model.true)['x',]
true.ci.Bgy <- confint(model.true)['g',]
result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
Bgy.est.true=coef(model.true)['g'],
Bxy.ci.upper.true = true.ci.Bxy[2],
Bxy.ci.lower.true = true.ci.Bxy[1],
Bgy.ci.upper.true = true.ci.Bgy[2],
Bgy.ci.lower.true = true.ci.Bgy[1]))
(model.feasible <- lm(y~x.obs+g,data=df))
feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
feasible.ci.Bgy <- confint(model.feasible)['g',]
result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
(model.naive <- lm(y~w+g, data=df))
naive.ci.Bxy <- confint(model.naive)['w',]
naive.ci.Bgy <- confint(model.naive)['g',]
result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
Bgy.est.naive=coef(model.naive)['g'],
Bxy.ci.upper.naive = naive.ci.Bxy[2],
Bxy.ci.lower.naive = naive.ci.Bxy[1],
Bgy.ci.upper.naive = naive.ci.Bgy[2],
Bgy.ci.lower.naive = naive.ci.Bgy[1]))
## multiple imputation when k is observed
## amelia does great at this one.
noms <- c()
if(length(unique(na.omit(df$x.obs))) <= 2){ # na.omit so missing values don't count as a level
    noms <- c(noms, 'x.obs')
}
if(length(unique(na.omit(df$g))) <= 2){
    noms <- c(noms, 'g')
}
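## Amelia imputation model: variables in noms are treated as nominal (hence
## flagging binary x.obs and g above); idvars are carried through to the
## imputed datasets but excluded from the imputation model.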
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=noms)
mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
(coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
est.x.mi <- coefse['x.obs','Estimate']
est.x.se <- coefse['x.obs','Std.Error']
result <- append(result,
list(Bxy.est.amelia.full = est.x.mi,
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
))
est.g.mi <- coefse['g','Estimate']
est.g.se <- coefse['g','Std.Error']
result <- append(result,
list(Bgy.est.amelia.full = est.g.mi,
Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
))
## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"), noms=noms)
mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
(coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
est.x.mi <- coefse['x.obs','Estimate']
est.x.se <- coefse['x.obs','Std.Error']
result <- append(result,
list(Bxy.est.amelia.nok = est.x.mi,
Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
))
est.g.mi <- coefse['g','Estimate']
est.g.se <- coefse['g','Std.Error']
result <- append(result,
list(Bgy.est.amelia.nok = est.g.mi,
Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
))
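## inputs for predictionError::predicted_covariates (the 2SLS+GMM estimator):
## after sorting so rows with observed x.obs come first, v flags the m
## validation rows with gold-standard labels, p flags the N-m primary rows
## where only the classifier prediction w is available, and train is unused
## here (all zeros).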
N <- nrow(df)
m <- nrow(df[!is.na(x.obs)])
p <- v <- train <- rep(0,N)
p[(m+1):N] <- 1
v[1:m] <- 1
df <- df[order(x.obs)]
y <- df[,y]
x <- df[,x.obs]
g <- df[,g]
w <- df[,w]
# gmm gets pretty close
(gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE))
result <- append(result,
list(Bxy.est.gmm = gmm.res$beta[1,1],
Bxy.ci.upper.gmm = gmm.res$confint[1,2],
Bxy.ci.lower.gmm = gmm.res$confint[1,1],
gmm.ER_pval = gmm.res$ER_pval
))
result <- append(result,
list(Bgy.est.gmm = gmm.res$beta[2,1],
Bgy.ci.upper.gmm = gmm.res$confint[2,2],
Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
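## measurement-error correction via mecor: MeasError(w, reference = x.obs)
## declares w an error-prone substitute for the partially observed reference
## x.obs; method='efficient' selects mecor's efficient estimator and B=400
## bootstrap replicates feed the confidence intervals.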
mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
(mod.calibrated.mle)
(mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
result <- append(result, list(
Bxy.est.mecor = mecor.ci['Estimate'],
Bxy.upper.mecor = mecor.ci['UCI'],
Bxy.lower.mecor = mecor.ci['LCI'])
)
(mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
result <- append(result, list(
Bgy.est.mecor = mecor.ci['Estimate'],
Bgy.upper.mecor = mecor.ci['UCI'],
Bgy.lower.mecor = mecor.ci['LCI'])
)
## clean up memory
rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))
gc()
return(result)
}
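## a hypothetical driver, for illustration only; the real entry points are the
## example_*.R scripts, which parse N, m, seed, and outfile with argparser,
## simulate a dataset, and collect run_simulation()'s list into one row of the
## output feather file. Sketch (simulate_data is a stand-in name):
## df <- simulate_data(N=1000, m=100, seed=1)
## result <- run_simulation(df, list(N=1000, m=100, seed=1))
## write_feather(as.data.frame(result), 'example_1.feather')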