add simulation to test multiple IV.
This commit is contained in:
parent
33d758c2dd
commit
d9bc8f8f61
132
multiple_iv_simulations/01_indep_differential.R
Normal file
132
multiple_iv_simulations/01_indep_differential.R
Normal file
@ -0,0 +1,132 @@
|
||||
library(argparser)
|
||||
library(mecor)
|
||||
library(ggplot2)
|
||||
library(data.table)
|
||||
library(filelock)
|
||||
library(arrow)
|
||||
library(Amelia)
|
||||
library(Zelig)
|
||||
library(predictionError)
|
||||
options(amelia.parallel="no",
|
||||
amelia.ncpus=1)
|
||||
setDTthreads(40)
|
||||
|
||||
source("simulation_base.R")
|
||||
|
||||
simulate_data <- function(N,
|
||||
m,
|
||||
B0,
|
||||
Bxy,
|
||||
Bzx,
|
||||
Bzy,
|
||||
seed,
|
||||
y_explained_variance = 0.025,
|
||||
prediction_accuracy = 0.73,
|
||||
y_bias = -0.8,
|
||||
z_bias = 0,
|
||||
x_bias = 0,
|
||||
Px = 0.5,
|
||||
Pz = 0.5,
|
||||
accuracy_imbalance_difference = 0.3) {
|
||||
set.seed(seed)
|
||||
# make w and y dependent
|
||||
z <- rbinom(N, 1, plogis(qlogis(Pz)))
|
||||
x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
|
||||
|
||||
y.var.epsilon <- (var(Bzy * z) + var(Bxy * x) + 2 * cov(Bzy * z, Bxy * x)) * ((1 - y_explained_variance) / y_explained_variance)
|
||||
y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
|
||||
y <- Bzy * z + Bxy * x + y.epsilon
|
||||
|
||||
df <- data.table(x = x, y = y, z = z)
|
||||
|
||||
if (m < N) {
|
||||
df <- df[sample(nrow(df), m), ":="(x.obs = x, z.obs = z)]
|
||||
} else {
|
||||
df <- df[, ":="(x.obs = x, z.obs = z)]
|
||||
}
|
||||
|
||||
resids <- resid(lm(y ~ x + z))
|
||||
odds.x1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[x == 1])) + z_bias * qlogis(pnorm(z[x == 1], sd(z)))
|
||||
odds.x0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[x == 0])) + z_bias * qlogis(pnorm(z[x == 0], sd(z)))
|
||||
|
||||
## acc.x0 <- p.correct[df[,x==0]]
|
||||
## acc.x1 <- p.correct[df[,x==1]]
|
||||
|
||||
df[x == 0, w := plogis(rlogis(.N, odds.x0))]
|
||||
df[x == 1, w := plogis(rlogis(.N, odds.x1))]
|
||||
df[, w_pred := as.integer(w > 0.5)]
|
||||
|
||||
## now let's use another classifier for z
|
||||
odds.z1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[z == 1])) + x_bias * qlogis(pnorm(x[z == 1], sd(x)))
|
||||
odds.z0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[z == 0])) + x_bias * qlogis(pnorm(x[z == 0], sd(z)))
|
||||
|
||||
df[z == 0, a := plogis(rlogis(.N, odds.z0))]
|
||||
df[z == 1, a := plogis(rlogis(.N, odds.z1))]
|
||||
df[, a_pred := as.integer(a > 0.5)]
|
||||
|
||||
|
||||
print(mean(df$w_pred == df$x))
|
||||
print(mean(df$w_pred == df$x))
|
||||
print(mean(df[y >= 0]$w_pred == df[y >= 0]$x))
|
||||
print(mean(df[y <= 0]$w_pred == df[y <= 0]$x))
|
||||
return(df)
|
||||
}
|
||||
|
||||
parser <- arg_parser("Simulate data and fit corrected models with two independent variables that are classified")
|
||||
parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
|
||||
parser <- add_argument(parser, "--m", default=500, help="m the number of ground truth observations")
|
||||
parser <- add_argument(parser, "--seed", default=51, help='seed for the rng')
|
||||
parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
|
||||
parser <- add_argument(parser, "--y_explained_variance", help='what proportion of the variance of y can be explained?', default=0.1)
|
||||
parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.75)
|
||||
parser <- add_argument(parser, "--accuracy_imbalance_difference", help='how much more accurate is the predictive model for one class than the other?', default=0.3)
|
||||
parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
|
||||
parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
|
||||
parser <- add_argument(parser, "--Bxy", help='Effect of z on y', default=0.3)
|
||||
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
|
||||
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*z*x")
|
||||
parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
|
||||
parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
|
||||
parser <- add_argument(parser, "--x_bias", help='coefficient of x on the probability a classification is correct', default=0)
|
||||
parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
|
||||
parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5)
|
||||
parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad')
|
||||
args <- parse_args(parser)
|
||||
|
||||
B0 <- 0
|
||||
Px <- args$Px
|
||||
Bxy <- args$Bxy
|
||||
Bzy <- args$Bzy
|
||||
Bzx <- args$Bzx
|
||||
|
||||
if(args$m < args$N){
|
||||
|
||||
df <- simulate_data(args$N, args$m, B0, Bxy, Bzx, Bzy, args$seed, args$y_explained_variance, args$prediction_accuracy, y_bias=args$y_bias)
|
||||
|
||||
## df.pc <- df[,.(x,y,z,w_pred,w)]
|
||||
## # df.pc <- df.pc[,err:=x-w_pred]
|
||||
## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
|
||||
## plot(pc.df)
|
||||
|
||||
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, confint_method=args$confint_method, error='')
|
||||
|
||||
research_data <- df[is.na(x.obs), .(w = w_pred, a = a_pred, y = y)] |> as.data.frame()
|
||||
|
||||
val_data <- df[!is.na(x.obs), .(w = w_pred, a = a_pred, z = z.obs, y = y, x = x.obs)] |> as.data.frame()
|
||||
|
||||
outline <- run_simulation(y ~ x || w + z ||
|
||||
a, w ~ x + z + y, a ~ x + z + y, data=research_data,data2=val_data, result = result)
|
||||
|
||||
|
||||
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
||||
if(file.exists(args$outfile)){
|
||||
logdata <- read_feather(args$outfile)
|
||||
logdata <- rbind(logdata,as.data.table(outline), fill=TRUE)
|
||||
} else {
|
||||
logdata <- as.data.table(outline)
|
||||
}
|
||||
|
||||
print(outline)
|
||||
write_feather(logdata, args$outfile)
|
||||
unlock(outfile_lock)
|
||||
}
|
41
multiple_iv_simulations/Makefile
Normal file
41
multiple_iv_simulations/Makefile
Normal file
@ -0,0 +1,41 @@
|
||||
.ONESHELL:
|
||||
SHELL=bash
|
||||
|
||||
Ns=[1000, 5000, 10000]
|
||||
ms=[100, 200, 400]
|
||||
seeds=[$(shell seq -s, 1 500)]
|
||||
explained_variances=[0.1]
|
||||
|
||||
all:main
|
||||
main:remembr.RDS
|
||||
|
||||
srun=sbatch --wait --verbose run_job.sbatch
|
||||
skx_pylauncher_limit=40
|
||||
|
||||
joblists:example_1_jobs example_2_jobs example_3_jobs
|
||||
|
||||
grid_sweep_script=../simulations/grid_sweep.py
|
||||
|
||||
example_1_jobs: 01_indep_differential.R simulation_base.R ${grid_sweep_script} skx_1.sbatch ../misclassificationmodels
|
||||
${srun} ${grid_sweep_script} --command "Rscript 01_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"], "y_explained_variance":${explained_variances}, "Bzx":[1]}' --outfile example_1_jobs
|
||||
|
||||
example_1.feather: example_1_jobs my_pylauncher.py
|
||||
sbatch -J "multiple iv measerr correction" skx_1.sbatch my_pylauncher.py $< --cores 1
|
||||
|
||||
remembr.RDS:example_1.feather
|
||||
# rm -f remembr.RDS
|
||||
# ${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
|
||||
|
||||
|
||||
clean_main:
|
||||
rm -f example_1_jobs
|
||||
rm -f example_1.feather
|
||||
#
|
||||
clean_all: clean_main
|
||||
rm *.feather
|
||||
rm -f remembr.RDS
|
||||
rm -f remembr*.RDS
|
||||
rm -f robustness*.RDS
|
||||
rm -f example_*_jobs
|
||||
rm -f robustness_*_jobs_*
|
||||
|
18
multiple_iv_simulations/my_pylauncher.py
Executable file
18
multiple_iv_simulations/my_pylauncher.py
Executable file
@ -0,0 +1,18 @@
|
||||
#!/scratch/projects/compilers/intel24.0/oneapi/intelpython/python3.9/bin/python3
|
||||
#^ always use TACC's python to start pylauncher.
|
||||
|
||||
import pylauncher
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('jobs', help='list of commands to be run')
|
||||
parser.add_argument('--cores', help='cores to use per job', default=1)
|
||||
parser.add_argument('--timeout', help='timeout length (seconds)', default=2)
|
||||
parser.add_argument('--queuestate', help='pylauncher queuestate (for resuming jobs)',default=None)
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.queuestate is not None:
|
||||
pylauncher.ResumeClassicLauncher(args.queuestate, cores=args.cores, timeout=args.timeout)
|
||||
else:
|
||||
pylauncher.ClassicLauncher(args.jobs, cores=args.cores, timeout=args.timeout)
|
118
multiple_iv_simulations/simulation_base.R
Normal file
118
multiple_iv_simulations/simulation_base.R
Normal file
@ -0,0 +1,118 @@
|
||||
options(amelia.parallel="no",
|
||||
amelia.ncpus=1)
|
||||
library(matrixStats) # for numerically stable logsumexps
|
||||
source("../misclassificationmodels/R/likelihoods.R")
|
||||
source("../misclassificationmodels/R/glm_fixit.R")
|
||||
|
||||
## ... are the parts of the formular.
|
||||
run_simulation <- function(..., family=gaussian(),
|
||||
proxy_family = binomial(link='logit'),
|
||||
truth_family = binomial(link='logit'),
|
||||
data,
|
||||
data2,
|
||||
result
|
||||
){
|
||||
|
||||
accuracy <- df[,mean(w_pred==x)]
|
||||
accuracy.x.y0 <- df[y<=0,mean(w_pred==x)]
|
||||
accuracy.x.y1 <- df[y>=0,mean(w_pred==x)]
|
||||
accuracy.z.y0 <- df[y<=0,mean(a_pred==z)]
|
||||
accuracy.z.y1 <- df[y>=0,mean(a_pred==z)]
|
||||
cor.y.xi <- cor(df$x - df$w_pred, df$y)
|
||||
cor.y.zi <- cor(df$z - df$a_pred, df$y)
|
||||
|
||||
fnr <- df[w_pred==0,mean(w_pred!=x)]
|
||||
fnr.y0 <- df[(w_pred==0) & (y<=0),mean(w_pred!=x)]
|
||||
fnr.y1 <- df[(w_pred==0) & (y>=0),mean(w_pred!=x)]
|
||||
|
||||
fpr <- df[w_pred==1,mean(w_pred!=x)]
|
||||
fpr.y0 <- df[(w_pred==1) & (y<=0),mean(w_pred!=x)]
|
||||
fpr.y1 <- df[(w_pred==1) & (y>=0),mean(w_pred!=x)]
|
||||
cor.resid.w_pred <- cor(resid(lm(y~x+z,df)),df$w_pred)
|
||||
|
||||
result <- append(result, list(accuracy=accuracy,
|
||||
accuracy.x.y0=accuracy.x.y0,
|
||||
accuracy.x.y1=accuracy.x.y1,
|
||||
accuracy.z.y0=accuracy.z.y0,
|
||||
accuracy.z.y1=accuracy.z.y1,
|
||||
cor.y.xi=cor.y.xi,
|
||||
fnr=fnr,
|
||||
fnr.y0=fnr.y0,
|
||||
fnr.y1=fnr.y1,
|
||||
fpr=fpr,
|
||||
fpr.y0=fpr.y0,
|
||||
fpr.y1=fpr.y1,
|
||||
cor.resid.w_pred=cor.resid.w_pred
|
||||
))
|
||||
|
||||
result <- append(result, list(cor.xz=cor(df$x,df$z)))
|
||||
(model.true <- lm(y ~ x + z, data=df))
|
||||
true.ci.Bxy <- confint(model.true)['x',]
|
||||
true.ci.Bzy <- confint(model.true)['z',]
|
||||
|
||||
result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
|
||||
Bzy.est.true=coef(model.true)['z'],
|
||||
Bxy.ci.upper.true = true.ci.Bxy[2],
|
||||
Bxy.ci.lower.true = true.ci.Bxy[1],
|
||||
Bzy.ci.upper.true = true.ci.Bzy[2],
|
||||
Bzy.ci.lower.true = true.ci.Bzy[1]))
|
||||
|
||||
## mle_result <- list(
|
||||
## Bxy.est.naive = NULL,
|
||||
## Bxy.ci.upper.naive = NULL,
|
||||
## Bxy.ci.lower.naive = NULL,
|
||||
## Bzy.est.naive = NULL,
|
||||
## Bzy.ci.upper.naive = NULL,
|
||||
## Bzy.ci.lower.naive = NULL,
|
||||
## Bxy.est.feasible = NULL,
|
||||
## Bxy.ci.upper.feasible = NULL,
|
||||
## Bxy.ci.lower.feasible = NULL,
|
||||
## Bzy.est.feasible = NULL,
|
||||
## Bzy.ci.upper.feasible = NULL,
|
||||
## Bzy.ci.lower.feasible = NULL,
|
||||
## Bxy.est.mle = NULL,
|
||||
## Bxy.ci.upper.mle = NULL,
|
||||
## Bxy.ci.lower.mle = NULL,
|
||||
## Bzy.est.mle = NULL,
|
||||
## Bzy.ci.upper.mle = NULL,
|
||||
## Bzy.ci.lower.mle = NULL
|
||||
## )
|
||||
## tryCatch({
|
||||
mod <- glm_fixit(..., family=family, data=data, data2=data2)
|
||||
|
||||
coef_corrected <- coef(mod, which_model = "corrected")
|
||||
ci_corrected <- confint(mod, which_model = "corrected")
|
||||
|
||||
coef_feasible <- coef(mod, which_model = "feasible")
|
||||
ci_feasible <- confint(mod, which_model = "feasible")
|
||||
|
||||
coef_naive <- coef(mod, which_model = "naive")
|
||||
ci_naive <- confint(mod, which_model = "naive")
|
||||
|
||||
mle_result <- list(
|
||||
Bxy.est.mle = coef_corrected["x"],
|
||||
Bxy.ci.upper.mle = ci_corrected["x", 2],
|
||||
Bxy.ci.lower.mle = ci_corrected["x", 1],
|
||||
Bzy.est.mle = coef_corrected["z"],
|
||||
Bzy.ci.upper.mle = ci_corrected["z", 2],
|
||||
Bzy.ci.lower.mle = ci_corrected["z", 1],
|
||||
Bxy.est.feasible = coef_feasible["x"],
|
||||
Bxy.ci.upper.feasible = ci_feasible["x", 2],
|
||||
Bxy.ci.lower.feasible = ci_feasible["x",1],
|
||||
Bzy.est.feasible = coef_feasible["z"],
|
||||
Bzy.ci.upper.feasible = ci_feasible["z",2],
|
||||
Bzy.ci.lower.feasible = ci_feasible["z",1],
|
||||
Bxy.est.naive = coef_naive["w"],
|
||||
Bxy.ci.upper.naive = ci_naive["w",2],
|
||||
Bxy.ci.lower.naive = ci_naive["w",1],
|
||||
Bzy.est.naive = coef_naive["a"],
|
||||
Bzy.ci.upper.naive = ci_naive["a",2],
|
||||
Bzy.ci.lower.naive = ci_naive["a",1]
|
||||
)
|
||||
## print(mle_result)
|
||||
## },
|
||||
## error=function(e) {result[['error']] <- as.character(e)
|
||||
## })
|
||||
result <- append(result, mle_result)
|
||||
return(result)
|
||||
}
|
12
multiple_iv_simulations/skx_1.sbatch
Normal file
12
multiple_iv_simulations/skx_1.sbatch
Normal file
@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
#SBATCH --account=ECS24013
|
||||
#SBATCH --partition=skx
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --chdir=/scratch/10114/nathante/critical_mass
|
||||
#SBATCH --error="sbatch_log/%x_%A_%a.err"
|
||||
#SBATCH --output="sbatch_log/%x_%A_%a.out"
|
||||
#SBATCH --nice
|
||||
#SBATCH --time=48:00:00
|
||||
|
||||
source ~/.bashrc
|
||||
"$@"
|
Loading…
Reference in New Issue
Block a user