add missing simulation code

commit e41d11afb9 (parent 6057688060)
simulations/Makefile (new file, 61 lines)
@@ -0,0 +1,61 @@
SHELL=bash

Ns=[1000,10000,25000]
ms=[50, 100, 250, 500]
seeds=[$(shell seq -s, 1 250)]
all: remembr.RDS

srun=srun -A comdata -p compute-bigmem --time=10:00:00 --mem 4G -c 1

example_1_jobs: example_1.R
	grid_sweep.py --command "Rscript example_1.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"]}' --outfile example_1_jobs

example_1.feather: example_1_jobs
	rm -f example_1.feather
	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_1_jobs
#	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_1_jobs

example_2_jobs: example_2.R
	grid_sweep.py --command "Rscript example_2.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"]}' --outfile example_2_jobs

example_2.feather: example_2_jobs
	rm -f example_2.feather
	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_jobs
#	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_jobs

example_2_B_jobs: example_2_B.R
	grid_sweep.py --command "Rscript example_2_B.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B.feather"]}' --outfile example_2_B_jobs

example_2_B.feather: example_2_B_jobs
	rm -f example_2_B.feather
	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_jobs

example_3_jobs: example_3.R
	grid_sweep.py --command "Rscript example_3.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_3.feather"]}' --outfile example_3_jobs

example_3.feather: example_3_jobs
	rm -f example_3.feather
	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_3_jobs

remembr.RDS: example_2_B.feather example_1.feather example_2.feather example_3.feather
	${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
	${srun} Rscript plot_example.R --infile example_2.feather --name "plot.df.example.2"
	${srun} Rscript plot_example.R --infile example_2_B.feather --name "plot.df.example.2B"
	${srun} Rscript plot_example.R --infile example_3.feather --name "plot.df.example.3"

# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_jobs

# example_2_B_mecor_jobs:
#	grid_sweep.py --command "Rscript example_2_B_mecor.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B_mecor.feather"]}' --outfile example_2_B_mecor_jobs

# example_2_B_mecor.feather: example_2_B_mecor.R example_2_B_mecor_jobs
#	rm -f example_2_B_mecor.feather
#	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_mecor_jobs
#	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_mecor_jobs
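A quick consistency check on the sweep size (an editor's sketch in R, not part of the pipeline): the --array=1-3000 ranges above should equal the size of the N x m x seed grid that grid_sweep.py presumably expands from --arg_dict.

## sweep dimensions copied from the Makefile variables above
Ns <- c(1000, 10000, 25000)
ms <- c(50, 100, 250, 500)
seeds <- 1:250
length(Ns) * length(ms) * length(seeds)  # 3000, matching --array=1-3000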
simulations/example_1.R (new file, 203 lines)
@@ -0,0 +1,203 @@
### EXAMPLE 1: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.

library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)

source("simulation_base.R")

## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k and k -> w; w is the model's prediction of x from k.
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
### The labels x are binary, but the model provides a continuous predictor.

### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}

simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
    set.seed(seed)

    ## the true value of x

    g <- rbinom(N, 1, 0.5)
    xprime <- Bgx * g + rnorm(N,0,1)

    k <- Bkx*xprime + rnorm(N,0,3)

    ## logistic(z) > 0.5 iff z > 0, so x indicates that xprime is above its mean
    x <- as.integer(logistic(scale(xprime)) > 0.5)

    y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0
    df <- data.table(x=x,k=k,y=y,g=g)

    if( m < N){
        df <- df[sample(nrow(df), m), x.obs := x]
    } else {
        df <- df[, x.obs := x]
    }

    w.model <- glm(x ~ k, df, family=binomial(link='logit'))
    ## predict() returns the link (logit) scale by default; the noise makes w an imperfect proxy
    w <- predict(w.model, data.frame(k=k)) + rnorm(N,0,1)
    ## y = B0 + B1x + e

    df[,':='(w=w, w_pred = as.integer(w>0.5))]
    return(df)
}


parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=1000, help="number of observations of w")
parser <- add_argument(parser, "--m", default=100, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_1.feather')
args <- parse_args(parser)

B0 <- 0
Bxy <- 0.2
Bgy <- 0
Bkx <- 3.2
Bgx <- 0

df <- simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)

outline <- run_simulation(df
                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))

outfile_lock <- lock(paste0(args$outfile, '_lock'))
if(file.exists(args$outfile)){
    logdata <- read_feather(args$outfile)
    logdata <- rbind(logdata,as.data.table(outline))
} else {
    logdata <- as.data.table(outline)
}

print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)

## for(N in Ns){
##     print(N)
##     for(m in ms){
##         if(N>m){
##             for(seed in seeds){
##                 rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bkx, seed)))
##             }
##         }
##     }
## }


## run_simulation <- function(N, m, B0, Bxy, Bkx, seed){
##     result <- list()
##     df <- simulate_latent_cocause(N, m, B0, Bxy, Bkx, seed)

##     result <- append(result, list(N=N,
##                                   m=m,
##                                   B0=B0,
##                                   Bxy=Bxy,
##                                   Bkx=Bkx,
##                                   seed=seed))

##     (correlation <- cor(df$w,df$x,method='spearman'))
##     result <- append(result, list(correlation=correlation))

##     (accuracy <- mean(df$x == df$w_pred))

##     result <- append(result, list(accuracy=accuracy))

##     (model.true <- lm(y ~ x, data=df))

##     (cor(resid(model.true),df$w))

##     true.ci.Bxy <- confint(model.true)['x',]

##     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
##                                   Bxy.ci.upper.true = true.ci.Bxy[2],
##                                   Bxy.ci.lower.true = true.ci.Bxy[1]))

##     (model.naive <- lm(y~w, data=df))

##     (model.feasible <- lm(y~x.obs,data=df))

##     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
##     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
##                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
##                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))

##     naive.ci.Bxy <- confint(model.naive)['w',]

##     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
##                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
##                                   Bxy.ci.lower.naive = naive.ci.Bxy[1]))

##     ## multiple imputation when k is observed

##     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'), noms=c("x.obs"),lgstc=c('w'))
##     mod.amelia.k <- zelig(y~x.obs, model='ls', data=amelia.out.k$imputations, cite=FALSE)
##     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))

##     est.x.mi <- coefse['x.obs','Estimate']
##     est.x.se <- coefse['x.obs','Std.Error']
##     result <- append(result,
##                      list(Bxy.est.amelia.full = est.x.mi,
##                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
##                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
##                           ))

##     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
##     amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"),noms=c("x.obs"),lgstc=c('w'))
##     mod.amelia.nok <- zelig(y~x.obs, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
##     (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))

##     est.x.mi <- coefse['x.obs','Estimate']
##     est.x.se <- coefse['x.obs','Std.Error']
##     result <- append(result,
##                      list(Bxy.est.amelia.nok = est.x.mi,
##                           Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
##                           Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
##                           ))

##     p <- v <- train <- rep(0,N)
##     M <- m
##     p[(M+1):(N)] <- 1
##     v[1:(M)] <- 1
##     df <- df[order(x.obs)]
##     y <- df[,y]
##     x <- df[,x.obs]
##     w <- df[,w]
##     (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))

##     result <- append(result,
##                      list(Bxy.est.gmm = gmm.res$beta[1,1],
##                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
##                           Bxy.ci.lower.gmm = gmm.res$confint[1,1]))

##     mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs), df, B=400, method='efficient')

##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])

##     result <- append(result, list(
##                                  Bxy.est.mecor = mecor.ci['Estimate'],
##                                  Bxy.upper.mecor = mecor.ci['UCI'],
##                                  Bxy.lower.mecor = mecor.ci['LCI'])
##                      )

##     return(result)
## }
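Every script in this commit repeats the same lock / read / rbind / write_feather / unlock block so that concurrent Slurm array tasks can safely append rows to one shared feather file. A minimal refactor sketch (append_row_locked is a hypothetical helper, not in this repo; it assumes the filelock, arrow, and data.table packages used above):

library(filelock)
library(arrow)
library(data.table)

append_row_locked <- function(row, outfile){
    ## serialize concurrent writers on a sidecar lock file
    lck <- lock(paste0(outfile, '_lock'), exclusive=TRUE)
    on.exit(unlock(lck))
    logdata <- if(file.exists(outfile)){
        rbind(as.data.table(read_feather(outfile)), as.data.table(row))
    } else {
        as.data.table(row)
    }
    write_feather(logdata, outfile)
}

## usage in example_1.R would then be: append_row_locked(outline, args$outfile)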
simulations/example_1.feather (new binary file, not shown)
simulations/example_2.R (new file, 87 lines)
@@ -0,0 +1,87 @@
### EXAMPLE 2: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.

library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
        amelia.ncpus=1)

## SETUP:
### we want to estimate x -> y; x is MAR
### we have k -> x and k -> w; w is the model's prediction of x from k.
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
### The labels x are binary, but the model provides a continuous predictor.

### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
source("simulation_base.R")

## logistic link, used below; defined here because it is not defined elsewhere in this file
logistic <- function(x) {1/(1+exp(-1*x))}

simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
    set.seed(seed)

    ## the true value of x

    g <- rbinom(N, 1, 0.5)
    k <- rnorm(N, 0, 1)
    xprime <- Bkx*k + Bgx * g + rnorm(N,0,1)
    xvec <- scale(xprime)

    y <- Bxy * xvec + Bgy * g + rnorm(N, 0, 1) + B0

    ## scale() returns a matrix, so reset the column names afterwards
    df <- data.table(x=xvec,k=k,y=y,g=g)
    names(df) <- c('x','k','y','g')
    if( m < N){
        df <- df[sample(nrow(df), m), x.obs := x]
    } else {
        df <- df[, x.obs := x]
    }

    w.model <- lm(x ~ k,df)
    w <- predict(w.model,data.frame(k=k))
    w <- logistic(w + rnorm(N,0,sd(w)*0.1))
    ## y = B0 + B1x + e

    df[,':='(w=w, w_pred = as.integer(w>0.5))]
    return(df)
}


parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=1000, help="number of observations of w")
parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
args <- parse_args(parser)

## unused here; the actual sweep over N and m is driven by the Makefile
Ns <- c(1000, 10000, 1e6)
ms <- c(100, 250, 500, 1000)
B0 <- 0
Bxy <- 0.2
Bgy <- -0.2
Bkx <- 2
Bgx <- 3

outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))

outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
if(file.exists(args$outfile)){
    logdata <- read_feather(args$outfile)
    logdata <- rbind(logdata,as.data.table(outline))
} else {
    logdata <- as.data.table(outline)
}

print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)
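The mechanism these scripts sweep over is classical attenuation: regressing on a noisy proxy shrinks the coefficient toward zero. An editor's self-contained sketch (illustrative names and coefficients, not the script's parameters):

set.seed(1)
N <- 1e4
x <- rnorm(N)
w <- x + rnorm(N, 0, 1)    # noisy proxy of x; error variance equals var(x)
y <- 0.2 * x + rnorm(N)
coef(lm(y ~ x))['x']       # ~0.20, the true effect
coef(lm(y ~ w))['w']       # ~0.10, attenuated by var(x)/(var(x)+var(e)) = 0.5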
simulations/example_2_B.R (new file, 276 lines)
@@ -0,0 +1,276 @@
### EXAMPLE 2_b: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
### This is the same as example 2, only instead of k->x we have x->k.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.

library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
        amelia.ncpus=1)

source("simulation_base.R")

## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k and k -> w; w is the model's prediction of x from k.
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
### The labels x are binary, but the model provides a continuous predictor.

### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}

simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
    set.seed(seed)

    ## the true value of x

    g <- rbinom(N, 1, 0.5)
    xprime <- Bgx * g + rnorm(N,0,1)

    k <- Bkx*xprime + rnorm(N,0,3)

    ## logistic(z) > 0.5 iff z > 0, so x indicates that xprime is above its mean
    x <- as.integer(logistic(scale(xprime)) > 0.5)

    y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0
    df <- data.table(x=x,k=k,y=y,g=g)

    if( m < N){
        df <- df[sample(nrow(df), m), x.obs := x]
    } else {
        df <- df[, x.obs := x]
    }

    w.model <- glm(x ~ k,df, family=binomial(link='logit'))
    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
    ## y = B0 + B1x + e

    df[,':='(w=w, w_pred = as.integer(w>0.5))]
    return(df)
}

schennach <- function(df){
    fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
    df[,xstar_pred := predict(fwx, df)]
    gxt <- lm(y ~ xstar_pred+g, df)
    return(gxt)
}

parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
args <- parse_args(parser)

rows <- list()

B0 <- 0
Bxy <- 0.2
Bgy <- -0.2
Bkx <- 1
Bgx <- 3

outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))

outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
if(file.exists(args$outfile)){
    logdata <- read_feather(args$outfile)
    logdata <- rbind(logdata,as.data.table(outline))
} else {
    logdata <- as.data.table(outline)
}

print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)

## Ns <- c(1e6, 5e4, 1000)
## ms <- c(100, 250, 500, 1000)
## seeds <- 1:500

## rowsets <- list()
## library(doParallel)
## options(mc.cores = parallel::detectCores())
## cl <- makeCluster(20)
## registerDoParallel(cl)

## ## library(future)

## ## plan(multiprocess,workers=40,gc=TRUE)

## for(N in Ns){
##     print(N)
##     for(m in ms){
##         if(N>m){
##             new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
##                                 .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
##                 {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
##                                ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
##             rowsets <- append(rowsets, list(data.table(new.rows)))
##         }
##     }
## }

## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
## ##                                                  ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))},
## ##                           packages=c('mecor','Amelia','Zelig','predictionError'),
## ##                           seed=TRUE)))

## df <- rbindlist(rowsets)

## write_feather(df,"example_2B_simulation.feather")

## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
##     result <- list()
##     df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)

##     result <- append(result, list(N=N,
##                                   m=m,
##                                   B0=B0,
##                                   Bxy=Bxy,
##                                   Bgy=Bgy,
##                                   Bkx=Bkx,
##                                   seed=seed))

##     (accuracy <- df[,.(mean(w_pred==x))])
##     result <- append(result, list(accuracy=accuracy))

##     (model.true <- lm(y ~ x + g, data=df))
##     true.ci.Bxy <- confint(model.true)['x',]
##     true.ci.Bgy <- confint(model.true)['g',]

##     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
##                                   Bgy.est.true=coef(model.true)['g'],
##                                   Bxy.ci.upper.true = true.ci.Bxy[2],
##                                   Bxy.ci.lower.true = true.ci.Bxy[1],
##                                   Bgy.ci.upper.true = true.ci.Bgy[2],
##                                   Bgy.ci.lower.true = true.ci.Bgy[1]))

##     (model.feasible <- lm(y~x.obs+g,data=df))

##     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
##     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
##                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
##                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))

##     feasible.ci.Bgy <- confint(model.feasible)['g',]
##     result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
##                                   Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
##                                   Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))

##     (model.naive <- lm(y~w+g, data=df))

##     naive.ci.Bxy <- confint(model.naive)['w',]
##     naive.ci.Bgy <- confint(model.naive)['g',]

##     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
##                                   Bgy.est.naive=coef(model.naive)['g'],
##                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
##                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
##                                   Bgy.ci.upper.naive = naive.ci.Bgy[2],
##                                   Bgy.ci.lower.naive = naive.ci.Bgy[1]))

##     ## multiple imputation when k is observed
##     ## amelia does great at this one.
##     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
##     mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
##     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))

##     est.x.mi <- coefse['x.obs','Estimate']
##     est.x.se <- coefse['x.obs','Std.Error']
##     result <- append(result,
##                      list(Bxy.est.amelia.full = est.x.mi,
##                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
##                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
##                           ))

##     est.g.mi <- coefse['g','Estimate']
##     est.g.se <- coefse['g','Std.Error']

##     result <- append(result,
##                      list(Bgy.est.amelia.full = est.g.mi,
##                           Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
##                           Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
##                           ))

##     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
##     amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
##     mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
##     (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))

##     est.x.mi <- coefse['x.obs','Estimate']
##     est.x.se <- coefse['x.obs','Std.Error']
##     result <- append(result,
##                      list(Bxy.est.amelia.nok = est.x.mi,
##                           Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
##                           Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
##                           ))

##     est.g.mi <- coefse['g','Estimate']
##     est.g.se <- coefse['g','Std.Error']

##     result <- append(result,
##                      list(Bgy.est.amelia.nok = est.g.mi,
##                           Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
##                           Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
##                           ))

##     p <- v <- train <- rep(0,N)
##     M <- m
##     p[(M+1):(N)] <- 1
##     v[1:(M)] <- 1
##     df <- df[order(x.obs)]
##     y <- df[,y]
##     x <- df[,x.obs]
##     g <- df[,g]
##     w <- df[,w]
##     # gmm gets pretty close
##     (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))

##     result <- append(result,
##                      list(Bxy.est.gmm = gmm.res$beta[1,1],
##                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
##                           Bxy.ci.lower.gmm = gmm.res$confint[1,1]))

##     result <- append(result,
##                      list(Bgy.est.gmm = gmm.res$beta[2,1],
##                           Bgy.ci.upper.gmm = gmm.res$confint[2,2],
##                           Bgy.ci.lower.gmm = gmm.res$confint[2,1]))

##     mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
##     (mod.calibrated.mle)
##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
##     result <- append(result, list(
##                                  Bxy.est.mecor = mecor.ci['Estimate'],
##                                  Bxy.upper.mecor = mecor.ci['UCI'],
##                                  Bxy.lower.mecor = mecor.ci['LCI'])
##                      )

##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])

##     ## note: this block reuses the Bxy.* names for the g row; presumably Bgy.* was intended
##     result <- append(result, list(
##                                  Bxy.est.mecor = mecor.ci['Estimate'],
##                                  Bxy.upper.mecor = mecor.ci['UCI'],
##                                  Bxy.lower.mecor = mecor.ci['LCI'])
##                      )

##     return(result)
## }
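The headline claim of examples 2 and 2_b ("you can get the wrong sign ... even when you include the proxy variable in the regression") can be reproduced in a few lines. An editor's self-contained sketch (illustrative coefficients, not the script's parameters):

set.seed(1)
N <- 1e5
g <- rbinom(N, 1, 0.5)
x <- g + rnorm(N)                   # g raises the latent x
w <- x + rnorm(N, 0, 2)             # very noisy proxy of x
y <- 0.5 * x - 0.1 * g + rnorm(N)   # the true direct effect of g is negative
coef(lm(y ~ x + g))['g']            # ~ -0.1: correct with the real x
coef(lm(y ~ w + g))['g']            # ~ +0.3: attenuation of w leaves residual
                                    # confounding through x, flipping g's sign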
simulations/example_2_B.feather (new binary file, not shown)
simulations/example_2_B_mecor.R (new file, 281 lines)
@@ -0,0 +1,281 @@
### EXAMPLE 2_b: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
### This is the same as example 2, only instead of k->x we have x->k.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.

library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
        amelia.ncpus=1)

source("simulation_base_mecor.R")

## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k and k -> w; w is the model's prediction of x from k.
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
### The labels x are binary, but the model provides a continuous predictor.

### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}

simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
    set.seed(seed)

    ## the true value of x

    g <- rbinom(N, 1, 0.5)
    xprime <- Bgx * g + rnorm(N,0,1)

    k <- Bkx*xprime + rnorm(N,0,3)

    ## logistic(z) > 0.5 iff z > 0, so x indicates that xprime is above its mean
    x <- as.integer(logistic(scale(xprime)) > 0.5)

    y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
    df <- data.table(x=x,k=k,y=y,g=g)

    if( m < N){
        df <- df[sample(nrow(df), m), x.obs := x]
    } else {
        df <- df[, x.obs := x]
    }

    w.model <- glm(x ~ k,df, family=binomial(link='logit'))
    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
    ## y = B0 + B1x + e

    df[,':='(w=w, w_pred = as.integer(w>0.5))]
    return(df)
}

schennach <- function(df){
    fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
    df[,xstar_pred := predict(fwx, df)]
    gxt <- lm(y ~ xstar_pred+g, df)
    return(gxt)
}

parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
args <- parse_args(parser)

## these first values are dead code: they are overwritten just below
B0 <- 0
Bxy <- 0.2

Bkx <- 0.5

rows <- list()

B0 <- 0
Bxy <- 0.4
Bgy <- 0.25
Bkx <- 3.2
Bgx <- -0.5

outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))

outfile_lock <- lock(paste0(args$outfile, '_lock'))
if(file.exists(args$outfile)){
    logdata <- read_feather(args$outfile)
    logdata <- rbind(logdata,as.data.table(outline))
} else {
    logdata <- as.data.table(outline)
}

print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)

## Ns <- c(1e6, 5e4, 1000)
## ms <- c(100, 250, 500, 1000)
## seeds <- 1:500

## rowsets <- list()
## library(doParallel)
## options(mc.cores = parallel::detectCores())
## cl <- makeCluster(20)
## registerDoParallel(cl)

## ## library(future)

## ## plan(multiprocess,workers=40,gc=TRUE)

## for(N in Ns){
##     print(N)
##     for(m in ms){
##         if(N>m){
##             new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
##                                 .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
##                 {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
##                                ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
##             rowsets <- append(rowsets, list(data.table(new.rows)))
##         }
##     }
## }

## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
## ##                                                  ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))},
## ##                           packages=c('mecor','Amelia','Zelig','predictionError'),
## ##                           seed=TRUE)))

## df <- rbindlist(rowsets)

## write_feather(df,"example_2B_simulation.feather")

## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
##     result <- list()
##     df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)

##     result <- append(result, list(N=N,
##                                   m=m,
##                                   B0=B0,
##                                   Bxy=Bxy,
##                                   Bgy=Bgy,
##                                   Bkx=Bkx,
##                                   seed=seed))

##     (accuracy <- df[,.(mean(w_pred==x))])
##     result <- append(result, list(accuracy=accuracy))

##     (model.true <- lm(y ~ x + g, data=df))
##     true.ci.Bxy <- confint(model.true)['x',]
##     true.ci.Bgy <- confint(model.true)['g',]

##     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
##                                   Bgy.est.true=coef(model.true)['g'],
##                                   Bxy.ci.upper.true = true.ci.Bxy[2],
##                                   Bxy.ci.lower.true = true.ci.Bxy[1],
##                                   Bgy.ci.upper.true = true.ci.Bgy[2],
##                                   Bgy.ci.lower.true = true.ci.Bgy[1]))

##     (model.feasible <- lm(y~x.obs+g,data=df))

##     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
##     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
##                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
##                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))

##     feasible.ci.Bgy <- confint(model.feasible)['g',]
##     result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
##                                   Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
##                                   Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))

##     (model.naive <- lm(y~w+g, data=df))

##     naive.ci.Bxy <- confint(model.naive)['w',]
##     naive.ci.Bgy <- confint(model.naive)['g',]

##     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
##                                   Bgy.est.naive=coef(model.naive)['g'],
##                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
##                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
##                                   Bgy.ci.upper.naive = naive.ci.Bgy[2],
##                                   Bgy.ci.lower.naive = naive.ci.Bgy[1]))

##     ## multiple imputation when k is observed
##     ## amelia does great at this one.
##     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
##     mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
##     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))

##     est.x.mi <- coefse['x.obs','Estimate']
##     est.x.se <- coefse['x.obs','Std.Error']
##     result <- append(result,
##                      list(Bxy.est.amelia.full = est.x.mi,
##                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
##                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
##                           ))

##     est.g.mi <- coefse['g','Estimate']
##     est.g.se <- coefse['g','Std.Error']

##     result <- append(result,
##                      list(Bgy.est.amelia.full = est.g.mi,
##                           Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
##                           Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
##                           ))

##     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
##     amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
##     mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
##     (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))

##     est.x.mi <- coefse['x.obs','Estimate']
##     est.x.se <- coefse['x.obs','Std.Error']
##     result <- append(result,
##                      list(Bxy.est.amelia.nok = est.x.mi,
##                           Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
##                           Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
##                           ))

##     est.g.mi <- coefse['g','Estimate']
##     est.g.se <- coefse['g','Std.Error']

##     result <- append(result,
##                      list(Bgy.est.amelia.nok = est.g.mi,
##                           Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
##                           Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
##                           ))

##     p <- v <- train <- rep(0,N)
##     M <- m
##     p[(M+1):(N)] <- 1
##     v[1:(M)] <- 1
##     df <- df[order(x.obs)]
##     y <- df[,y]
##     x <- df[,x.obs]
##     g <- df[,g]
##     w <- df[,w]
##     # gmm gets pretty close
##     (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))

##     result <- append(result,
##                      list(Bxy.est.gmm = gmm.res$beta[1,1],
##                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
##                           Bxy.ci.lower.gmm = gmm.res$confint[1,1]))

##     result <- append(result,
##                      list(Bgy.est.gmm = gmm.res$beta[2,1],
##                           Bgy.ci.upper.gmm = gmm.res$confint[2,2],
##                           Bgy.ci.lower.gmm = gmm.res$confint[2,1]))

##     mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
##     (mod.calibrated.mle)
##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
##     result <- append(result, list(
##                                  Bxy.est.mecor = mecor.ci['Estimate'],
##                                  Bxy.upper.mecor = mecor.ci['UCI'],
##                                  Bxy.lower.mecor = mecor.ci['LCI'])
##                      )

##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])

##     ## note: this block reuses the Bxy.* names for the g row; presumably Bgy.* was intended
##     result <- append(result, list(
##                                  Bxy.est.mecor = mecor.ci['Estimate'],
##                                  Bxy.upper.mecor = mecor.ci['UCI'],
##                                  Bxy.lower.mecor = mecor.ci['LCI'])
##                      )

##     return(result)
## }
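The commented-out mecor(y ~ MeasError(w, reference = x.obs) + g, ...) call above is what the plotting scripts label "Regression Calibration". A rough editor's sketch of the same idea done by hand (assumes a df with the columns built above; it recovers the point estimate only, not mecor's corrected standard errors):

## fit the calibration model on the labeled (gold standard) rows
calib <- lm(x.obs ~ w + g, df[!is.na(x.obs)])
## replace the proxy with its calibrated prediction everywhere
df[, x.rc := predict(calib, df)]
## the outcome model on the calibrated covariate
lm(y ~ x.rc + g, df)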
simulations/example_2_continuous.R (new file, 187 lines)
@@ -0,0 +1,187 @@
### EXAMPLE 2: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.

library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="multicore",
        amelia.ncpus=40)

## SETUP:
### we want to estimate g -> y and x -> y; g is observed, x is MAR
### we have k -> x; g -> x; g -> k; k is used to predict x via the model, yielding w.
### we have k -> w; x -> w; w is observed.
### for illustration, g is binary (e.g., gender==male).
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
### Whether a comment is "racial harassment" depends on context, like the kind of person making the comment (i.e., their race):
### e.g., a Black person saying "n-word" is less likely to be racial harassment than if a white person does it.
### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar), so our "ground truth" takes this into account.
### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racially harassing comment and of their race.

### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}

simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
    set.seed(seed)

    ## the true value of x

    g <- rbinom(N, 1, 0.5)
    k <- rnorm(N, 0, 1)
    x <- Bkx*k + Bgx * g + rnorm(N,0,1)
    w.model <- lm(x ~ k)
    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
    ## y = B0 + B1x + e
    y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
    df <- data.table(x=x,k=k,y=y,w=w,g=g)
    if( m < N){
        df <- df[sample(nrow(df), m), x.obs := x]
    } else {
        df <- df[, x.obs := x]
    }

    return(df)
}


run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
    result <- list()
    df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)

    result <- append(result, list(N=N,
                                  m=m,
                                  B0=B0,
                                  Bxy=Bxy,
                                  Bgy=Bgy,
                                  Bkx=Bkx,
                                  seed=seed))

    correlation <- cor(df$w,df$x)
    result <- append(result, list(correlation=correlation))

    model.true <- lm(y ~ x + g, data=df)
    true.ci.Bxy <- confint(model.true)['x',]
    true.ci.Bgy <- confint(model.true)['g',]

    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
                                  Bgy.est.true=coef(model.true)['g'],
                                  Bxy.ci.upper.true = true.ci.Bxy[2],
                                  Bxy.ci.lower.true = true.ci.Bxy[1],
                                  Bgy.ci.upper.true = true.ci.Bgy[2],
                                  Bgy.ci.lower.true = true.ci.Bgy[1]))

    model.naive <- lm(y~w+g, data=df)

    naive.ci.Bxy <- confint(model.naive)['w',]
    naive.ci.Bgy <- confint(model.naive)['g',]

    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
                                  Bgy.est.naive=coef(model.naive)['g'],
                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))

    ## multiple imputation when k is observed
    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'))
    mod.amelia.k <- zelig(y~x.obs+g+k, model='ls', data=amelia.out.k$imputations, cite=FALSE)
    coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)

    est.x.mi <- coefse['x.obs','Estimate']
    est.x.se <- coefse['x.obs','Std.Error']
    result <- append(result,
                     list(Bxy.est.amelia.full = est.x.mi,
                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
                          ))

    est.g.mi <- coefse['g','Estimate']
    est.g.se <- coefse['g','Std.Error']

    result <- append(result,
                     list(Bgy.est.amelia.full = est.g.mi,
                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
                          ))

    ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
    amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"))
    mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
    coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)

    est.x.mi <- coefse['x.obs','Estimate']
    est.x.se <- coefse['x.obs','Std.Error']
    result <- append(result,
                     list(Bxy.est.amelia.nok = est.x.mi,
                          Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
                          Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
                          ))

    est.g.mi <- coefse['g','Estimate']
    est.g.se <- coefse['g','Std.Error']

    result <- append(result,
                     list(Bgy.est.amelia.nok = est.g.mi,
                          Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
                          Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
                          ))

    ## split indicators for predicted_covariates: training, validation, and primary rows
    p <- v <- train <- rep(0,N)
    M <- m
    p[(M+1):(N)] <- 1
    train[1:(M/2)] <- 1
    v[(M/2+1):(M)] <- 1
    df <- df[order(x.obs)]
    y <- df[,y]
    x <- df[,x.obs]
    g <- df[,g]
    w <- df[,w]
    gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)
    result <- append(result,
                     list(Bxy.est.gmm = gmm.res$beta[1,1],
                          Bxy.ci.upper.gmm = gmm.res$confint[1,2],
                          Bxy.ci.lower.gmm = gmm.res$confint[1,1],
                          Bgy.est.gmm = gmm.res$beta[2,1],
                          Bgy.ci.upper.gmm = gmm.res$confint[2,2],
                          Bgy.ci.lower.gmm = gmm.res$confint[2,1]))

    return(result)
}

Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
ms <- c(30, 50, 100, 200, 300, 500)
B0 <- 0
Bxy <- 1
Bgy <- 0.3
Bkx <- 3
Bgx <- -4
seeds <- 1:100

rows <- list()

for(N in Ns){
    print(N)
    for(m in ms){
        if(N>m){
            for(seed in seeds){
                rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
            }
        }
    }
}

result <- rbindlist(rows)
write_feather(result, "example_2_simulation_continuous.feather")
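The p / v / train vectors above encode the three-way split that predictionError::predicted_covariates (presumably the Fong + Tyler estimator referenced in example_3.R) expects, which is easiest to see on a toy case. An editor's sketch with hypothetical sizes:

N <- 10; m <- 4
p <- v <- train <- rep(0, N)
p[(m + 1):N] <- 1        # primary sample: rows without gold-standard labels
train[1:(m / 2)] <- 1    # first half of the labeled rows trains the proxy model
v[(m / 2 + 1):m] <- 1    # second half is held out for validation
rbind(train, v, p)       # each row falls in exactly one group, which is why
                         # df is sorted so the labeled rows come first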
simulations/example_3.R (new file, 144 lines)
@@ -0,0 +1,144 @@
### EXAMPLE 3: demonstrates how measurement error can lead to a sign (Type S) error in a covariate.
### What kind of data invalidates Fong + Tyler?
### This is the same as example 2, only instead of k->x we have x->k.
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.

library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
        amelia.ncpus=1)
setDTthreads(40)

source("simulation_base.R")

## SETUP:
### we want to estimate x -> y; x is MAR
### we have x -> k and k -> w; w is the model's prediction of x from k.
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments.
### The labels x are binary, but the model provides a continuous predictor.

### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####

## logistic link, used below; defined here because it is not defined elsewhere in this file
logistic <- function(x) {1/(1+exp(-1*x))}

## one way to do it is by adding correlation to x.obs and y that isn't in w.
## in other words, the model is missing an important feature of x.obs that's related to y.
simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
    set.seed(seed)

    ## the true value of x

    g <- rbinom(N, 1, 0.5)

    ## make w and y dependent through a shared shock u
    u <- rnorm(N,0,Bxy)

    xprime <- Bgx * g + rnorm(N,0,1)

    k <- Bkx*xprime + rnorm(N,0,1.5) + 1.1*Bkx*u

    x <- as.integer(logistic(scale(xprime)) > 0.5)

    y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u

    df <- data.table(x=x,k=k,y=y,g=g)

    w.model <- glm(x ~ k,df, family=binomial(link='logit'))

    if( m < N){
        df <- df[sample(nrow(df), m), x.obs := x]
    } else {
        df <- df[, x.obs := x]
    }

    w <- predict(w.model, df) + rnorm(N, 0, 1)
    ## y = B0 + B1x + e

    df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
    return(df)
}

## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
##     set.seed(seed)

##     ## the true value of x

##     g <- rbinom(N, 1, 0.5)

##     ## make w and y dependent
##     u <- rnorm(N,0,5)

##     xprime <- Bgx * g + rnorm(N,0,1)

##     k <- Bkx*xprime + rnorm(N,0,3)

##     x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)

##     y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u

##     df <- data.table(x=x,k=k,y=y,g=g)

##     w.model <- glm(x ~ k, df, family=binomial(link='logit'))

##     if( m < N){
##         df <- df[sample(nrow(df), m), x.obs := x]
##     } else {
##         df <- df[, x.obs := x]
##     }

##     w <- predict(w.model,data.frame(k=k)) + u
##     ## y = B0 + B1x + e

##     df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
##     return(df)
## }


schennach <- function(df){
    fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
    df[,xstar_pred := predict(fwx, df)]
    gxt <- lm(y ~ xstar_pred+g, df)
    return(gxt)
}


parser <- arg_parser("Simulate data and fit corrected models")
parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=432, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_3.feather')
args <- parse_args(parser)

B0 <- 0
Bxy <- 0.2
Bgy <- 0
Bkx <- 2
Bgx <- 0

outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))

outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
if(file.exists(args$outfile)){
    logdata <- read_feather(args$outfile)
    logdata <- rbind(logdata,as.data.table(outline))
} else {
    logdata <- as.data.table(outline)
}

print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)
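What example_3 changes relative to example_2_B is the error structure: the shared shock u enters both the classifier feature k (and hence the proxy w) and the outcome y, violating the classical assumption that the proxy's error is independent of y. An editor's self-contained sketch of the violation:

set.seed(1)
N <- 1e4
u <- rnorm(N)            # shared shock
k <- rnorm(N) + u        # classifier feature contaminated by u
y_noise <- rnorm(N) + u  # the same u also moves y
cor(k, y_noise)          # ~0.5: any proxy built from k inherits error that is
                         # correlated with y, which is exactly what the
                         # corrections stress-tested here assume away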
simulations/example_3.feather (new binary file, not shown)
simulations/plot_example_1.R (new file, 175 lines)
@@ -0,0 +1,175 @@
source("RemembR/R/RemembeR.R")
library(arrow)
library(data.table)
library(ggplot2)

df <- data.table(read_feather("example_1.feather"))

x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
                         zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
                         bias = Bxy - Bxy.est.naive,
                         sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]

x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           mean.estimate=mean(Bxy.est.naive),
                           var.estimate=var(Bxy.est.naive),
                           Bxy=mean(Bxy),
                           variable='x',
                           method='Naive'
                           ),
                        by=c('N','m')]

## naive model summaries for g, needed by the rbindlist below (this block
## mirrors the g.naive block in plot_example_2.R)
g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
                         bias = Bgy - Bgy.est.naive,
                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]

g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           variable='g',
                           method='Naive'
                           ),
                        by=c('N','m')]

x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]

x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
                                     bias = Bxy.est.true - Bxy.est.amelia.full,
                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]

x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                       mean.bias = mean(bias),
                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                       variable='x',
                                       method='Multiple imputation'
                                       ),
                                    by=c('N','m')]

g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
                                     bias = Bgy.est.amelia.full - Bgy.est.true,
                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]

g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                       mean.bias = mean(bias),
                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                       variable='g',
                                       method='Multiple imputation'
                                       ),
                                    by=c('N','m')]

x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
                                   zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
                                   bias = Bxy.est.amelia.nok - Bxy.est.true,
                                   sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]

x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                     mean.bias = mean(bias),
                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                     variable='x',
                                     method='Multiple imputation (Classifier features unobserved)'
                                     ),
                                  by=c('N','m')]

g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
                                   zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
                                   bias = Bgy.est.amelia.nok - Bgy.est.true,
                                   sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]

g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                     mean.bias = mean(bias),
                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                     variable='g',
                                     method='Multiple imputation (Classifier features unobserved)'
                                     ),
                                  by=c('N','m')]

x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]

x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
                         zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
                         bias = Bxy.est.mecor - Bxy.est.true,
                         sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]

x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           variable='x',
                           method='Regression Calibration'
                           ),
                        by=c('N','m')]

g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]

g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
                         zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
                         bias = Bgy.est.mecor - Bgy.est.true,
                         sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]

g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           variable='g',
                           method='Regression Calibration'
                           ),
                        by=c('N','m')]

## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]

## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
##                          zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
##                          bias = Bgy.est.mecor - Bgy.est.true,
##                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]

## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
##                            mean.bias = mean(bias),
##                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
##                            variable='g',
##                            method='Regression Calibration'
##                            ),
##                         by=c('N','m')]

x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
                     zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
                     bias = Bxy.est.gmm - Bxy.est.true,
                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]

x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                       mean.bias = mean(bias),
                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                       variable='x',
                       method='2SLS+gmm'
                       ),
                    by=c('N','m')]

g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
                     zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
                     bias = Bgy.est.gmm - Bgy.est.true,
                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]

g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                       mean.bias = mean(bias),
                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                       variable='g',
                       method='2SLS+gmm'
                       ),
                    by=c('N','m')]

## use.names/fill are needed because x.naive.plot carries extra columns
## (mean.estimate, var.estimate, Bxy) the other summaries lack
plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot, x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot),
                     use.names=TRUE, fill=TRUE)

remember(plot.df,'example.1.plot.df')

ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")

ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")

ests <- df[,.(Bxy.est.true = mean(Bxy.est.true),
              Bxy.est.naive = mean(Bxy.est.naive),
              Bxy.est.feasible = mean(Bxy.est.feasible),
              Bxy.est.amelia.full = mean(Bxy.est.amelia.full),
              Bxy.est.amelia.nok = mean(Bxy.est.amelia.nok),
              Bxy.est.mecor = mean(Bxy.est.mecor),
              Bxy.est.gmm = mean(Bxy.est.gmm)),
           by=c("N","m")]
simulations/plot_example_2.R (new file, 102 lines)
@ -0,0 +1,102 @@
library(arrow)
library(data.table)
library(ggplot2)

df <- data.table(read_feather("example_2_simulation.feather"))

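## Collapse the example 2 simulation draws into one row per (N, m) cell for
## each estimator, so the methods can be compared on coverage, bias, and sign.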
x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
                         zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
                         bias = Bxy.est.naive - Bxy, ## bias = estimate - truth, consistent across methods
                         sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]

x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           variable='x',
                           method='Naive'
                           ),
                        by=c('N','m')]

g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
                         bias = Bgy.est.naive - Bgy,
                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]

g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           variable='g',
                           method='Naive'
                           ),
                        by=c('N','m')]

x.amelia.full <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.full, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full)]
x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
                                     bias = Bxy.est.amelia.full - Bxy.est.true,
                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]

x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                       mean.bias = mean(bias),
                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                       variable='x',
                                       method='Multiple imputation'
                                       ),
                                    by=c('N','m')]

g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
                                     bias = Bgy.est.amelia.full - Bgy.est.true,
                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]

g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                       mean.bias = mean(bias),
                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                       variable='g',
                                       method='Multiple imputation'
                                       ),
                                    by=c('N','m')]

x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
                                   zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
                                   bias = Bxy.est.amelia.nok - Bxy.est.true,
                                   sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]

x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                     mean.bias = mean(bias),
                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                     variable='x',
                                     method='Multiple imputation (Classifier features unobserved)'
                                     ),
                                  by=c('N','m')]

g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
                                   zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
                                   bias = Bgy.est.amelia.nok - Bgy.est.true,
                                   sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]

g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                     mean.bias = mean(bias),
                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                     variable='g',
                                     method='Multiple imputation (Classifier features unobserved)'
                                     ),
                                  by=c('N','m')]

plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))

ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")

ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
277
simulations/plot_example_2_B.R
Normal file
@ -0,0 +1,277 @@
source("RemembR/R/RemembeR.R")
library(arrow)
library(data.table)
library(ggplot2)
library(filelock)
library(argparse)

parser <- arg_parser("Summarize simulation results for plotting.")
parser <- add_argument(parser, "--infile", default="", help="name of the file to read.")
parser <- add_argument(parser, "--name", default="", help="the name to save the data to in the remember file.")
args <- parse_args(parser)

## read under a shared lock so a simulation job can't write mid-read;
## the lock file follows the "<infile>_lock" convention.
l <- filelock::lock(paste0(args$infile, "_lock"), exclusive=FALSE)
df <- data.table(read_feather(args$infile))
filelock::unlock(l)

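## build_plot_dataset collapses the per-simulation estimates into one row per
## (N, m) cell and estimator: CI coverage of the truth, mean bias, sign
## accuracy, and the mean and variance of the point estimates.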
build_plot_dataset <- function(df){

    x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
    x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
                             zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
                             bias = Bxy.est.naive - Bxy, ## bias = estimate - truth
                             sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]

    x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
                               mean.bias = mean(bias),
                               mean.est = mean(Bxy.est.naive),
                               var.est = var(Bxy.est.naive),
                               N.sims = .N,
                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                               variable='x',
                               method='Naive'
                               ),
                            by=c('N','m')]

    g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
    g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
                             zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
                             bias = Bgy.est.naive - Bgy,
                             sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]

    g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
                               mean.bias = mean(bias),
                               mean.est = mean(Bgy.est.naive),
                               var.est = var(Bgy.est.naive),
                               N.sims = .N,
                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                               variable='g',
                               method='Naive'
                               ),
                            by=c('N','m')]

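    ## "Feasible" fits the outcome model on only the m gold-standard rows
    ## (lm drops the rows where x.obs is missing).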
    x.feasible <- df[,.(N, m, Bxy, Bxy.est.feasible, Bxy.ci.lower.feasible, Bxy.ci.upper.feasible)]
    x.feasible <- x.feasible[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.feasible) & (Bxy <= Bxy.ci.upper.feasible)),
                                   zero.in.ci = (0 >= Bxy.ci.lower.feasible) & (0 <= Bxy.ci.upper.feasible),
                                   bias = Bxy.est.feasible - Bxy,
                                   sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.feasible)))]

    x.feasible.plot <- x.feasible[,.(p.true.in.ci = mean(true.in.ci),
                                     mean.bias = mean(bias),
                                     mean.est = mean(Bxy.est.feasible),
                                     var.est = var(Bxy.est.feasible),
                                     N.sims = .N,
                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                     variable='x',
                                     method='Feasible'
                                     ),
                                  by=c('N','m')]

    g.feasible <- df[,.(N, m, Bgy, Bgy.est.feasible, Bgy.ci.lower.feasible, Bgy.ci.upper.feasible)]
    g.feasible <- g.feasible[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.feasible) & (Bgy <= Bgy.ci.upper.feasible)),
                                   zero.in.ci = (0 >= Bgy.ci.lower.feasible) & (0 <= Bgy.ci.upper.feasible),
                                   bias = Bgy.est.feasible - Bgy,
                                   sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.feasible)))]

    g.feasible.plot <- g.feasible[,.(p.true.in.ci = mean(true.in.ci),
                                     mean.bias = mean(bias),
                                     mean.est = mean(Bgy.est.feasible),
                                     var.est = var(Bgy.est.feasible),
                                     N.sims = .N,
                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                     variable='g',
                                     method='Feasible'
                                     ),
                                  by=c('N','m')]

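    ## Multiple imputation (Amelia): the unlabeled x.obs values are treated as
    ## missing data and imputed; estimates are pooled across imputations.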
    x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
    x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
                                         zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
                                         bias = Bxy.est.amelia.full - Bxy.est.true,
                                         sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]

    x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                           mean.bias = mean(bias),
                                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                           mean.est = mean(Bxy.est.amelia.full),
                                           var.est = var(Bxy.est.amelia.full),
                                           N.sims = .N,
                                           variable='x',
                                           method='Multiple imputation'
                                           ),
                                        by=c('N','m')]

    g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
    g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
                                         zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
                                         bias = Bgy.est.amelia.full - Bgy.est.true,
                                         sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]

    g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                           mean.bias = mean(bias),
                                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                           mean.est = mean(Bgy.est.amelia.full),
                                           var.est = var(Bgy.est.amelia.full),
                                           N.sims = .N,
                                           variable='g',
                                           method='Multiple imputation'
                                           ),
                                        by=c('N','m')]

    x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
    x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
                                       zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
                                       bias = Bxy.est.amelia.nok - Bxy.est.true,
                                       sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]

    x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                         mean.bias = mean(bias),
                                         p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                         mean.est = mean(Bxy.est.amelia.nok),
                                         var.est = var(Bxy.est.amelia.nok),
                                         N.sims = .N,
                                         variable='x',
                                         method='Multiple imputation (Classifier features unobserved)'
                                         ),
                                      by=c('N','m')]

    g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
    g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
                                       zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
                                       bias = Bgy.est.amelia.nok - Bgy.est.true,
                                       sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]

    g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                         mean.bias = mean(bias),
                                         p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                         mean.est = mean(Bgy.est.amelia.nok),
                                         var.est = var(Bgy.est.amelia.nok),
                                         N.sims = .N,
                                         variable='g',
                                         method='Multiple imputation (Classifier features unobserved)'
                                         ),
                                      by=c('N','m')]

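    ## Regression calibration (mecor): summarize the measurement-error
    ## corrected estimates and their confidence intervals.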
    x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
    x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
                             zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
                             bias = Bxy.est.mecor - Bxy.est.true,
                             sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]

    x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                               mean.bias = mean(bias),
                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                               mean.est = mean(Bxy.est.mecor),
                               var.est = var(Bxy.est.mecor),
                               N.sims = .N,
                               variable='x',
                               method='Regression Calibration'
                               ),
                            by=c('N','m')]

    g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
    g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
                             zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
                             bias = Bgy.est.mecor - Bgy.est.true,
                             sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]

    g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                               mean.bias = mean(bias),
                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                               mean.est = mean(Bgy.est.mecor),
                               var.est = var(Bgy.est.mecor),
                               N.sims = .N,
                               variable='g',
                               method='Regression Calibration'
                               ),
                            by=c('N','m')]

    ## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]

    ## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
    ##                          zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
    ##                          bias = Bgy.est.mecor - Bgy.est.true,
    ##                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]

    ## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
    ##                            mean.bias = mean(bias),
    ##                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
    ##                            variable='g',
    ##                            method='Regression Calibration'
    ##                            ),
    ##                         by=c('N','m')]

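    ## 2SLS+GMM correction: summarize the estimates returned by
    ## predictionError::predicted_covariates.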
    x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
    x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
                         zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
                         bias = Bxy.est.gmm - Bxy.est.true,
                         sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]

    x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           mean.est = mean(Bxy.est.gmm),
                           var.est = var(Bxy.est.gmm),
                           N.sims = .N,
                           variable='x',
                           method='2SLS+gmm'
                           ),
                        by=c('N','m')]

    g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
    g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
                         zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
                         bias = Bgy.est.gmm - Bgy.est.true,
                         sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]

    g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           mean.est = mean(Bgy.est.gmm),
                           var.est = var(Bgy.est.gmm),
                           N.sims = .N,
                           variable='g',
                           method='2SLS+gmm'
                           ),
                        by=c('N','m')]

    accuracy <- df[,mean(accuracy)]

    ## combine the per-method summaries; use.names handles differing column orders
    plot.df <- rbindlist(list(x.naive.plot, g.naive.plot,
                              x.feasible.plot, g.feasible.plot,
                              x.amelia.full.plot, g.amelia.full.plot,
                              x.amelia.nok.plot, g.amelia.nok.plot,
                              x.mecor.plot, g.mecor.plot,
                              x.gmm.plot, g.gmm.plot),
                         use.names=TRUE)
    plot.df[, accuracy := accuracy]

    return(plot.df)
}

plot.df <- build_plot_dataset(df)
remember(plot.df, args$name)

## df[gmm.ER_pval<0.05]

## plot.df <- plot.df[,":="(sd.est=sqrt(var.est)/N.sims)]

## ggplot(plot.df[variable=='x'], aes(y=mean.est, ymax=mean.est + var.est/2, ymin=mean.est-var.est/2, x=method)) + geom_pointrange() + facet_grid(-m~N) + scale_x_discrete(labels=label_wrap_gen(10))

## ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")

## ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
102
simulations/plot_example_2_continuous.R
Normal file
@ -0,0 +1,102 @@
library(arrow)
library(data.table)
library(ggplot2)

df <- data.table(read_feather("example_2_simulation.feather"))

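## Same per-(N, m) summaries as plot_example_2.R, for the continuous variant
## of example 2.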
x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
                         zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
                         bias = Bxy.est.naive - Bxy, ## bias = estimate - truth, consistent across methods
                         sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]

x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           variable='x',
                           method='Naive'
                           ),
                        by=c('N','m')]

g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
                         bias = Bgy.est.naive - Bgy,
                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]

g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
                           mean.bias = mean(bias),
                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                           variable='g',
                           method='Naive'
                           ),
                        by=c('N','m')]

x.amelia.full <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.full, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full)]
x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
                                     bias = Bxy.est.amelia.full - Bxy.est.true,
                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]

x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                       mean.bias = mean(bias),
                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                       variable='x',
                                       method='Multiple imputation'
                                       ),
                                    by=c('N','m')]

g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
                                     bias = Bgy.est.amelia.full - Bgy.est.true,
                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]

g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                       mean.bias = mean(bias),
                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                       variable='g',
                                       method='Multiple imputation'
                                       ),
                                    by=c('N','m')]

x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
                                   zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
                                   bias = Bxy.est.amelia.nok - Bxy.est.true,
                                   sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]

x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                     mean.bias = mean(bias),
                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                     variable='x',
                                     method='Multiple imputation (Classifier features unobserved)'
                                     ),
                                  by=c('N','m')]

g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
                                   zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
                                   bias = Bgy.est.amelia.nok - Bgy.est.true,
                                   sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]

g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
                                     mean.bias = mean(bias),
                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
                                     variable='g',
                                     method='Multiple imputation (Classifier features unobserved)'
                                     ),
                                  by=c('N','m')]

plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))

ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")

ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
155
simulations/simulation_base.R
Normal file
@ -0,0 +1,155 @@
library(predictionError)
library(mecor)
options(amelia.parallel="no",
        amelia.ncpus=1)
library(Amelia)
library(Zelig)

logistic <- function(x) {1/(1+exp(-1*x))}

run_simulation <- function(df, result){

    accuracy <- df[,mean(w_pred==x)]
    result <- append(result, list(accuracy=accuracy))

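    ## Three reference fits: the true model using x, the feasible model using
    ## only the m gold-standard labels x.obs, and the naive model substituting
    ## the classifier prediction w for x.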
    (model.true <- lm(y ~ x + g, data=df))
    true.ci.Bxy <- confint(model.true)['x',]
    true.ci.Bgy <- confint(model.true)['g',]

    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
                                  Bgy.est.true=coef(model.true)['g'],
                                  Bxy.ci.upper.true = true.ci.Bxy[2],
                                  Bxy.ci.lower.true = true.ci.Bxy[1],
                                  Bgy.ci.upper.true = true.ci.Bgy[2],
                                  Bgy.ci.lower.true = true.ci.Bgy[1]))

    (model.feasible <- lm(y~x.obs+g,data=df))

    feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
    result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
                                  Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
                                  Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))

    feasible.ci.Bgy <- confint(model.feasible)['g',]
    result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
                                  Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
                                  Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))

    (model.naive <- lm(y~w+g, data=df))

    naive.ci.Bxy <- confint(model.naive)['w',]
    naive.ci.Bgy <- confint(model.naive)['g',]

    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
                                  Bgy.est.naive=coef(model.naive)['g'],
                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))

    ## multiple imputation when k is observed
    ## amelia does great at this one.
    noms <- c()
    if(length(unique(df$x.obs)) <= 2){
        noms <- c(noms, 'x.obs')
    }

    if(length(unique(df$g)) <= 2){
        noms <- c(noms, 'g')
    }

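    ## amelia draws m=200 imputed datasets; idvars are carried through without
    ## entering the imputation model, and noms marks binary columns as nominal.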
    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'), noms=noms)
    mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
    (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))

    est.x.mi <- coefse['x.obs','Estimate']
    est.x.se <- coefse['x.obs','Std.Error']
    result <- append(result,
                     list(Bxy.est.amelia.full = est.x.mi,
                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
                          ))

    est.g.mi <- coefse['g','Estimate']
    est.g.se <- coefse['g','Std.Error']

    result <- append(result,
                     list(Bgy.est.amelia.full = est.g.mi,
                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
                          ))

    ## What if we can't observe k -- the most realistic scenario. We can't include all the ML features in a model.
    amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"), noms=noms)
    mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
    (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))

    est.x.mi <- coefse['x.obs','Estimate']
    est.x.se <- coefse['x.obs','Std.Error']
    result <- append(result,
                     list(Bxy.est.amelia.nok = est.x.mi,
                          Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
                          Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
                          ))

    est.g.mi <- coefse['g','Estimate']
    est.g.se <- coefse['g','Std.Error']

    result <- append(result,
                     list(Bgy.est.amelia.nok = est.g.mi,
                          Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
                          Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
                          ))

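    ## Inputs for the GMM correction: sort on x.obs so the m labeled rows come
    ## first; v flags the validation (labeled) rows and p flags the primary
    ## (unlabeled) rows.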
    N <- nrow(df)
    m <- nrow(df[!is.na(x.obs)])
    p <- v <- train <- rep(0,N)
    M <- m
    p[(M+1):(N)] <- 1
    v[1:(M)] <- 1
    df <- df[order(x.obs)]
    y <- df[,y]
    x <- df[,x.obs]
    g <- df[,g]
    w <- df[,w]
    # gmm gets pretty close
    (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE))

    result <- append(result,
                     list(Bxy.est.gmm = gmm.res$beta[1,1],
                          Bxy.ci.upper.gmm = gmm.res$confint[1,2],
                          Bxy.ci.lower.gmm = gmm.res$confint[1,1],
                          gmm.ER_pval = gmm.res$ER_pval
                          ))

    result <- append(result,
                     list(Bgy.est.gmm = gmm.res$beta[2,1],
                          Bgy.ci.upper.gmm = gmm.res$confint[2,2],
                          Bgy.ci.lower.gmm = gmm.res$confint[2,1]))

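    ## Regression calibration via mecor: w is the error-prone substitute and
    ## x.obs the gold-standard reference; B=400 bootstrap replicates.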
    mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
    (mod.calibrated.mle)
    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
    result <- append(result, list(
                                 Bxy.est.mecor = mecor.ci['Estimate'],
                                 Bxy.upper.mecor = mecor.ci['UCI'],
                                 Bxy.lower.mecor = mecor.ci['LCI'])
                     )

    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])

    result <- append(result, list(
                                 Bgy.est.mecor = mecor.ci['Estimate'],
                                 Bgy.upper.mecor = mecor.ci['UCI'],
                                 Bgy.lower.mecor = mecor.ci['LCI'])
                     )

    ## clean up memory
    rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))

    gc()
    return(result)
}