From d9bc8f8f6151fe767be92a1a303de6044b7f9764 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 5 Jun 2025 13:40:35 -0700 Subject: [PATCH] add simulation to test multiple IV. --- .../01_indep_differential.R | 132 ++++++++++++++++++ multiple_iv_simulations/Makefile | 41 ++++++ multiple_iv_simulations/my_pylauncher.py | 18 +++ multiple_iv_simulations/simulation_base.R | 118 ++++++++++++++++ multiple_iv_simulations/skx_1.sbatch | 12 ++ 5 files changed, 321 insertions(+) create mode 100644 multiple_iv_simulations/01_indep_differential.R create mode 100644 multiple_iv_simulations/Makefile create mode 100755 multiple_iv_simulations/my_pylauncher.py create mode 100644 multiple_iv_simulations/simulation_base.R create mode 100644 multiple_iv_simulations/skx_1.sbatch diff --git a/multiple_iv_simulations/01_indep_differential.R b/multiple_iv_simulations/01_indep_differential.R new file mode 100644 index 0000000..345cf3c --- /dev/null +++ b/multiple_iv_simulations/01_indep_differential.R @@ -0,0 +1,132 @@ +library(argparser) +library(mecor) +library(ggplot2) +library(data.table) +library(filelock) +library(arrow) +library(Amelia) +library(Zelig) +library(predictionError) +options(amelia.parallel="no", + amelia.ncpus=1) +setDTthreads(40) + +source("simulation_base.R") + +simulate_data <- function(N, + m, + B0, + Bxy, + Bzx, + Bzy, + seed, + y_explained_variance = 0.025, + prediction_accuracy = 0.73, + y_bias = -0.8, + z_bias = 0, + x_bias = 0, + Px = 0.5, + Pz = 0.5, + accuracy_imbalance_difference = 0.3) { + set.seed(seed) + # make w and y dependent + z <- rbinom(N, 1, plogis(qlogis(Pz))) + x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px))) + + y.var.epsilon <- (var(Bzy * z) + var(Bxy * x) + 2 * cov(Bzy * z, Bxy * x)) * ((1 - y_explained_variance) / y_explained_variance) + y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon)) + y <- Bzy * z + Bxy * x + y.epsilon + + df <- data.table(x = x, y = y, z = z) + + if (m < N) { + df <- df[sample(nrow(df), m), ":="(x.obs = x, z.obs = z)] + } else { + df <- df[, ":="(x.obs = x, z.obs = z)] + } + + resids <- resid(lm(y ~ x + z)) + odds.x1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[x == 1])) + z_bias * qlogis(pnorm(z[x == 1], sd(z))) + odds.x0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[x == 0])) + z_bias * qlogis(pnorm(z[x == 0], sd(z))) + + ## acc.x0 <- p.correct[df[,x==0]] + ## acc.x1 <- p.correct[df[,x==1]] + + df[x == 0, w := plogis(rlogis(.N, odds.x0))] + df[x == 1, w := plogis(rlogis(.N, odds.x1))] + df[, w_pred := as.integer(w > 0.5)] + + ## now let's use another classifier for z + odds.z1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[z == 1])) + x_bias * qlogis(pnorm(x[z == 1], sd(x))) + odds.z0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[z == 0])) + x_bias * qlogis(pnorm(x[z == 0], sd(z))) + + df[z == 0, a := plogis(rlogis(.N, odds.z0))] + df[z == 1, a := plogis(rlogis(.N, odds.z1))] + df[, a_pred := as.integer(a > 0.5)] + + + print(mean(df$w_pred == df$x)) + print(mean(df$w_pred == df$x)) + print(mean(df[y >= 0]$w_pred == df[y >= 0]$x)) + print(mean(df[y <= 0]$w_pred == df[y <= 0]$x)) + return(df) +} + +parser <- arg_parser("Simulate data and fit corrected models with two independent variables that are classified") +parser <- add_argument(parser, "--N", default=5000, help="number of observations of w") +parser <- add_argument(parser, "--m", default=500, help="m the number of ground truth observations") +parser <- add_argument(parser, "--seed", default=51, help='seed for the rng') +parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather') +parser <- add_argument(parser, "--y_explained_variance", help='what proportion of the variance of y can be explained?', default=0.1) +parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.75) +parser <- add_argument(parser, "--accuracy_imbalance_difference", help='how much more accurate is the predictive model for one class than the other?', default=0.3) +parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3) +parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3) +parser <- add_argument(parser, "--Bxy", help='Effect of z on y', default=0.3) +parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z") +parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*z*x") +parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5) +parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0) +parser <- add_argument(parser, "--x_bias", help='coefficient of x on the probability a classification is correct', default=0) +parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z") +parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5) +parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad') +args <- parse_args(parser) + +B0 <- 0 +Px <- args$Px +Bxy <- args$Bxy +Bzy <- args$Bzy +Bzx <- args$Bzx + +if(args$m < args$N){ + + df <- simulate_data(args$N, args$m, B0, Bxy, Bzx, Bzy, args$seed, args$y_explained_variance, args$prediction_accuracy, y_bias=args$y_bias) + + ## df.pc <- df[,.(x,y,z,w_pred,w)] + ## # df.pc <- df.pc[,err:=x-w_pred] + ## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05) + ## plot(pc.df) + + result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, confint_method=args$confint_method, error='') + + research_data <- df[is.na(x.obs), .(w = w_pred, a = a_pred, y = y)] |> as.data.frame() + + val_data <- df[!is.na(x.obs), .(w = w_pred, a = a_pred, z = z.obs, y = y, x = x.obs)] |> as.data.frame() + + outline <- run_simulation(y ~ x || w + z || + a, w ~ x + z + y, a ~ x + z + y, data=research_data,data2=val_data, result = result) + + + outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE) + if(file.exists(args$outfile)){ + logdata <- read_feather(args$outfile) + logdata <- rbind(logdata,as.data.table(outline), fill=TRUE) + } else { + logdata <- as.data.table(outline) + } + + print(outline) + write_feather(logdata, args$outfile) + unlock(outfile_lock) +} diff --git a/multiple_iv_simulations/Makefile b/multiple_iv_simulations/Makefile new file mode 100644 index 0000000..397fe7e --- /dev/null +++ b/multiple_iv_simulations/Makefile @@ -0,0 +1,41 @@ +.ONESHELL: +SHELL=bash + +Ns=[1000, 5000, 10000] +ms=[100, 200, 400] +seeds=[$(shell seq -s, 1 500)] +explained_variances=[0.1] + +all:main +main:remembr.RDS + +srun=sbatch --wait --verbose run_job.sbatch +skx_pylauncher_limit=40 + +joblists:example_1_jobs example_2_jobs example_3_jobs + +grid_sweep_script=../simulations/grid_sweep.py + +example_1_jobs: 01_indep_differential.R simulation_base.R ${grid_sweep_script} skx_1.sbatch ../misclassificationmodels + ${srun} ${grid_sweep_script} --command "Rscript 01_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"], "y_explained_variance":${explained_variances}, "Bzx":[1]}' --outfile example_1_jobs + +example_1.feather: example_1_jobs my_pylauncher.py + sbatch -J "multiple iv measerr correction" skx_1.sbatch my_pylauncher.py $< --cores 1 + +remembr.RDS:example_1.feather +# rm -f remembr.RDS +# ${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1" + + +clean_main: + rm -f example_1_jobs + rm -f example_1.feather +# +clean_all: clean_main + rm *.feather + rm -f remembr.RDS + rm -f remembr*.RDS + rm -f robustness*.RDS + rm -f example_*_jobs + rm -f robustness_*_jobs_* + diff --git a/multiple_iv_simulations/my_pylauncher.py b/multiple_iv_simulations/my_pylauncher.py new file mode 100755 index 0000000..02fb5ed --- /dev/null +++ b/multiple_iv_simulations/my_pylauncher.py @@ -0,0 +1,18 @@ +#!/scratch/projects/compilers/intel24.0/oneapi/intelpython/python3.9/bin/python3 +#^ always use TACC's python to start pylauncher. + +import pylauncher +import argparse +import sys + +parser = argparse.ArgumentParser() +parser.add_argument('jobs', help='list of commands to be run') +parser.add_argument('--cores', help='cores to use per job', default=1) +parser.add_argument('--timeout', help='timeout length (seconds)', default=2) +parser.add_argument('--queuestate', help='pylauncher queuestate (for resuming jobs)',default=None) + +args = parser.parse_args() +if args.queuestate is not None: + pylauncher.ResumeClassicLauncher(args.queuestate, cores=args.cores, timeout=args.timeout) +else: + pylauncher.ClassicLauncher(args.jobs, cores=args.cores, timeout=args.timeout) diff --git a/multiple_iv_simulations/simulation_base.R b/multiple_iv_simulations/simulation_base.R new file mode 100644 index 0000000..9241827 --- /dev/null +++ b/multiple_iv_simulations/simulation_base.R @@ -0,0 +1,118 @@ +options(amelia.parallel="no", + amelia.ncpus=1) +library(matrixStats) # for numerically stable logsumexps +source("../misclassificationmodels/R/likelihoods.R") +source("../misclassificationmodels/R/glm_fixit.R") + +## ... are the parts of the formular. +run_simulation <- function(..., family=gaussian(), + proxy_family = binomial(link='logit'), + truth_family = binomial(link='logit'), + data, + data2, + result + ){ + + accuracy <- df[,mean(w_pred==x)] + accuracy.x.y0 <- df[y<=0,mean(w_pred==x)] + accuracy.x.y1 <- df[y>=0,mean(w_pred==x)] + accuracy.z.y0 <- df[y<=0,mean(a_pred==z)] + accuracy.z.y1 <- df[y>=0,mean(a_pred==z)] + cor.y.xi <- cor(df$x - df$w_pred, df$y) + cor.y.zi <- cor(df$z - df$a_pred, df$y) + + fnr <- df[w_pred==0,mean(w_pred!=x)] + fnr.y0 <- df[(w_pred==0) & (y<=0),mean(w_pred!=x)] + fnr.y1 <- df[(w_pred==0) & (y>=0),mean(w_pred!=x)] + + fpr <- df[w_pred==1,mean(w_pred!=x)] + fpr.y0 <- df[(w_pred==1) & (y<=0),mean(w_pred!=x)] + fpr.y1 <- df[(w_pred==1) & (y>=0),mean(w_pred!=x)] + cor.resid.w_pred <- cor(resid(lm(y~x+z,df)),df$w_pred) + + result <- append(result, list(accuracy=accuracy, + accuracy.x.y0=accuracy.x.y0, + accuracy.x.y1=accuracy.x.y1, + accuracy.z.y0=accuracy.z.y0, + accuracy.z.y1=accuracy.z.y1, + cor.y.xi=cor.y.xi, + fnr=fnr, + fnr.y0=fnr.y0, + fnr.y1=fnr.y1, + fpr=fpr, + fpr.y0=fpr.y0, + fpr.y1=fpr.y1, + cor.resid.w_pred=cor.resid.w_pred + )) + + result <- append(result, list(cor.xz=cor(df$x,df$z))) + (model.true <- lm(y ~ x + z, data=df)) + true.ci.Bxy <- confint(model.true)['x',] + true.ci.Bzy <- confint(model.true)['z',] + + result <- append(result, list(Bxy.est.true=coef(model.true)['x'], + Bzy.est.true=coef(model.true)['z'], + Bxy.ci.upper.true = true.ci.Bxy[2], + Bxy.ci.lower.true = true.ci.Bxy[1], + Bzy.ci.upper.true = true.ci.Bzy[2], + Bzy.ci.lower.true = true.ci.Bzy[1])) + + ## mle_result <- list( + ## Bxy.est.naive = NULL, + ## Bxy.ci.upper.naive = NULL, + ## Bxy.ci.lower.naive = NULL, + ## Bzy.est.naive = NULL, + ## Bzy.ci.upper.naive = NULL, + ## Bzy.ci.lower.naive = NULL, + ## Bxy.est.feasible = NULL, + ## Bxy.ci.upper.feasible = NULL, + ## Bxy.ci.lower.feasible = NULL, + ## Bzy.est.feasible = NULL, + ## Bzy.ci.upper.feasible = NULL, + ## Bzy.ci.lower.feasible = NULL, + ## Bxy.est.mle = NULL, + ## Bxy.ci.upper.mle = NULL, + ## Bxy.ci.lower.mle = NULL, + ## Bzy.est.mle = NULL, + ## Bzy.ci.upper.mle = NULL, + ## Bzy.ci.lower.mle = NULL + ## ) + ## tryCatch({ + mod <- glm_fixit(..., family=family, data=data, data2=data2) + + coef_corrected <- coef(mod, which_model = "corrected") + ci_corrected <- confint(mod, which_model = "corrected") + + coef_feasible <- coef(mod, which_model = "feasible") + ci_feasible <- confint(mod, which_model = "feasible") + + coef_naive <- coef(mod, which_model = "naive") + ci_naive <- confint(mod, which_model = "naive") + + mle_result <- list( + Bxy.est.mle = coef_corrected["x"], + Bxy.ci.upper.mle = ci_corrected["x", 2], + Bxy.ci.lower.mle = ci_corrected["x", 1], + Bzy.est.mle = coef_corrected["z"], + Bzy.ci.upper.mle = ci_corrected["z", 2], + Bzy.ci.lower.mle = ci_corrected["z", 1], + Bxy.est.feasible = coef_feasible["x"], + Bxy.ci.upper.feasible = ci_feasible["x", 2], + Bxy.ci.lower.feasible = ci_feasible["x",1], + Bzy.est.feasible = coef_feasible["z"], + Bzy.ci.upper.feasible = ci_feasible["z",2], + Bzy.ci.lower.feasible = ci_feasible["z",1], + Bxy.est.naive = coef_naive["w"], + Bxy.ci.upper.naive = ci_naive["w",2], + Bxy.ci.lower.naive = ci_naive["w",1], + Bzy.est.naive = coef_naive["a"], + Bzy.ci.upper.naive = ci_naive["a",2], + Bzy.ci.lower.naive = ci_naive["a",1] + ) + ## print(mle_result) + ## }, + ## error=function(e) {result[['error']] <- as.character(e) + ## }) + result <- append(result, mle_result) + return(result) +} diff --git a/multiple_iv_simulations/skx_1.sbatch b/multiple_iv_simulations/skx_1.sbatch new file mode 100644 index 0000000..f12dceb --- /dev/null +++ b/multiple_iv_simulations/skx_1.sbatch @@ -0,0 +1,12 @@ +#!/bin/bash +#SBATCH --account=ECS24013 +#SBATCH --partition=skx +#SBATCH --nodes=1 +#SBATCH --chdir=/scratch/10114/nathante/critical_mass +#SBATCH --error="sbatch_log/%x_%A_%a.err" +#SBATCH --output="sbatch_log/%x_%A_%a.out" +#SBATCH --nice +#SBATCH --time=48:00:00 + +source ~/.bashrc +"$@"