add simulation to test multiple IVs.

This commit is contained in:
Nathan TeBlunthuis 2025-06-05 13:40:35 -07:00
parent 33d758c2dd
commit d9bc8f8f61
5 changed files with 321 additions and 0 deletions

@@ -0,0 +1,132 @@
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
amelia.ncpus=1)
setDTthreads(40)
source("simulation_base.R")
simulate_data <- function(N,
m,
B0,
Bxy,
Bzx,
Bzy,
seed,
y_explained_variance = 0.025,
prediction_accuracy = 0.73,
y_bias = -0.8,
z_bias = 0,
x_bias = 0,
Px = 0.5,
Pz = 0.5,
accuracy_imbalance_difference = 0.3) {
set.seed(seed)
# simulate the true variables; the classifiers constructed below have errors that depend on y
z <- rbinom(N, 1, plogis(qlogis(Pz)))
x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
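# scale the noise so that x and z together explain `y_explained_variance` of Var(y):
# Var(epsilon) = Var(Bzy*z + Bxy*x) * (1 - R^2) / R^2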
y.var.epsilon <- (var(Bzy * z) + var(Bxy * x) + 2 * cov(Bzy * z, Bxy * x)) * ((1 - y_explained_variance) / y_explained_variance)
y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
y <- Bzy * z + Bxy * x + y.epsilon
df <- data.table(x = x, y = y, z = z)
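# only a random subsample of m rows receives ground-truth labels (x.obs, z.obs);
# the remaining rows are treated as unlabeled "research" data further below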
if (m < N) {
df <- df[sample(nrow(df), m), ":="(x.obs = x, z.obs = z)]
} else {
df <- df[, ":="(x.obs = x, z.obs = z)]
}
resids <- resid(lm(y ~ x + z))
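# w is a noisy classifier of x: the log-odds that a case is scored positive (w_pred = 1)
# start at +/- qlogis(prediction_accuracy) depending on x and are shifted by the outcome
# residual (y_bias) and by z (z_bias), so errors can depend on y and z (differential misclassification)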
odds.x1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[x == 1])) + z_bias * qlogis(pnorm(z[x == 1], sd(z)))
odds.x0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[x == 0])) + z_bias * qlogis(pnorm(z[x == 0], sd(z)))
## acc.x0 <- p.correct[df[,x==0]]
## acc.x1 <- p.correct[df[,x==1]]
df[x == 0, w := plogis(rlogis(.N, odds.x0))]
df[x == 1, w := plogis(rlogis(.N, odds.x1))]
df[, w_pred := as.integer(w > 0.5)]
## now let's use another classifier for z
odds.z1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[z == 1])) + x_bias * qlogis(pnorm(x[z == 1], sd(x)))
odds.z0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[z == 0])) + x_bias * qlogis(pnorm(x[z == 0], sd(x)))
df[z == 0, a := plogis(rlogis(.N, odds.z0))]
df[z == 1, a := plogis(rlogis(.N, odds.z1))]
df[, a_pred := as.integer(a > 0.5)]
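# quick diagnostics: overall accuracy of w_pred, plus its accuracy conditional on the sign of y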
print(mean(df$w_pred == df$x))
print(mean(df[y >= 0]$w_pred == df[y >= 0]$x))
print(mean(df[y <= 0]$w_pred == df[y <= 0]$x))
return(df)
}
parser <- arg_parser("Simulate data and fit corrected models with two independent variables that are classified")
parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
parser <- add_argument(parser, "--m", default=500, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=51, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
parser <- add_argument(parser, "--y_explained_variance", help='what proportion of the variance of y can be explained?', default=0.1)
parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.75)
parser <- add_argument(parser, "--accuracy_imbalance_difference", help='how much more accurate is the predictive model for one class than the other?', default=0.3)
parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
parser <- add_argument(parser, "--Bxy", help='Effect of z on y', default=0.3)
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*z*x")
parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
parser <- add_argument(parser, "--x_bias", help='coefficient of x on the probability a classification is correct', default=0)
parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5)
parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad')
args <- parse_args(parser)
B0 <- 0
Px <- args$Px
Bxy <- args$Bxy
Bzy <- args$Bzy
Bzx <- args$Bzx
if(args$m < args$N){
df <- simulate_data(args$N, args$m, B0, Bxy, Bzx, Bzy, args$seed, args$y_explained_variance, args$prediction_accuracy, y_bias=args$y_bias, z_bias=args$z_bias, x_bias=args$x_bias)
## df.pc <- df[,.(x,y,z,w_pred,w)]
## # df.pc <- df.pc[,err:=x-w_pred]
## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
## plot(pc.df)
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, confint_method=args$confint_method, error='')
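## rows without ground truth form the research data (only the proxies w, a and the outcome y);
## rows with ground truth form the validation data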
research_data <- df[is.na(x.obs), .(w = w_pred, a = a_pred, y = y)] |> as.data.frame()
val_data <- df[!is.na(x.obs), .(w = w_pred, a = a_pred, z = z.obs, y = y, x = x.obs)] |> as.data.frame()
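## a minimal reading of the call below: the multi-part formula y ~ x || w + z || a appears to
## link each error-prone variable (x, z) to its proxy (w, a), and w ~ x + z + y, a ~ x + z + y
## are the proxy (measurement) models handed to glm_fixit() via simulation_base.R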
outline <- run_simulation(y ~ x || w + z || a,
                          w ~ x + z + y,
                          a ~ x + z + y,
                          data = research_data, data2 = val_data, result = result)
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
if(file.exists(args$outfile)){
logdata <- read_feather(args$outfile)
logdata <- rbind(logdata,as.data.table(outline), fill=TRUE)
} else {
logdata <- as.data.table(outline)
}
print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)
}

@@ -0,0 +1,41 @@
.ONESHELL:
SHELL=bash
Ns=[1000, 5000, 10000]
ms=[100, 200, 400]
seeds=[$(shell seq -s, 1 500)]
explained_variances=[0.1]
all:main
main:remembr.RDS
srun=sbatch --wait --verbose run_job.sbatch
skx_pylauncher_limit=40
joblists:example_1_jobs example_2_jobs example_3_jobs
grid_sweep_script=../simulations/grid_sweep.py
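# pipeline: grid_sweep.py writes one "Rscript 01_indep_differential.R ..." command per
# parameter combination to example_1_jobs; my_pylauncher.py then runs that job list under
# SLURM, and each run appends a row to example_1.feather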
example_1_jobs: 01_indep_differential.R simulation_base.R ${grid_sweep_script} skx_1.sbatch ../misclassificationmodels
${srun} ${grid_sweep_script} --command "Rscript 01_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"], "y_explained_variance":${explained_variances}, "Bzx":[1]}' --outfile example_1_jobs
example_1.feather: example_1_jobs my_pylauncher.py
sbatch -J "multiple iv measerr correction" skx_1.sbatch my_pylauncher.py $< --cores 1
remembr.RDS:example_1.feather
# rm -f remembr.RDS
# ${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
clean_main:
rm -f example_1_jobs
rm -f example_1.feather
#
clean_all: clean_main
rm -f *.feather
rm -f remembr.RDS
rm -f remembr*.RDS
rm -f robustness*.RDS
rm -f example_*_jobs
rm -f robustness_*_jobs_*

@@ -0,0 +1,18 @@
#!/scratch/projects/compilers/intel24.0/oneapi/intelpython/python3.9/bin/python3
#^ always use TACC's python to start pylauncher.
import pylauncher
import argparse
import sys
parser = argparse.ArgumentParser()
parser.add_argument('jobs', help='list of commands to be run')
parser.add_argument('--cores', help='cores to use per job', default=1, type=int)
parser.add_argument('--timeout', help='timeout length (seconds)', default=2, type=int)
parser.add_argument('--queuestate', help='pylauncher queuestate (for resuming jobs)',default=None)
args = parser.parse_args()
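# ClassicLauncher reads the job file (one shell command per line) and schedules the commands
# across the allocated cores; --queuestate resumes a previous, partially finished launch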
if args.queuestate is not None:
pylauncher.ResumeClassicLauncher(args.queuestate, cores=args.cores, timeout=args.timeout)
else:
pylauncher.ClassicLauncher(args.jobs, cores=args.cores, timeout=args.timeout)

@@ -0,0 +1,118 @@
options(amelia.parallel="no",
amelia.ncpus=1)
library(matrixStats) # for numerically stable logsumexps
source("../misclassificationmodels/R/likelihoods.R")
source("../misclassificationmodels/R/glm_fixit.R")
## `...` holds the model formulas that are passed straight through to glm_fixit().
run_simulation <- function(..., family=gaussian(),
proxy_family = binomial(link='logit'),
truth_family = binomial(link='logit'),
data,
data2,
result
){
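## note: the accuracy and error-rate diagnostics below read `df` (the full simulated data)
## from the calling environment, while `data`/`data2` are the research and validation
## subsets passed on to glm_fixit()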
accuracy <- df[,mean(w_pred==x)]
accuracy.x.y0 <- df[y<=0,mean(w_pred==x)]
accuracy.x.y1 <- df[y>=0,mean(w_pred==x)]
accuracy.z.y0 <- df[y<=0,mean(a_pred==z)]
accuracy.z.y1 <- df[y>=0,mean(a_pred==z)]
cor.y.xi <- cor(df$x - df$w_pred, df$y)
cor.y.zi <- cor(df$z - df$a_pred, df$y)
fnr <- df[w_pred==0,mean(w_pred!=x)]
fnr.y0 <- df[(w_pred==0) & (y<=0),mean(w_pred!=x)]
fnr.y1 <- df[(w_pred==0) & (y>=0),mean(w_pred!=x)]
fpr <- df[w_pred==1,mean(w_pred!=x)]
fpr.y0 <- df[(w_pred==1) & (y<=0),mean(w_pred!=x)]
fpr.y1 <- df[(w_pred==1) & (y>=0),mean(w_pred!=x)]
cor.resid.w_pred <- cor(resid(lm(y~x+z,df)),df$w_pred)
result <- append(result, list(accuracy=accuracy,
accuracy.x.y0=accuracy.x.y0,
accuracy.x.y1=accuracy.x.y1,
accuracy.z.y0=accuracy.z.y0,
accuracy.z.y1=accuracy.z.y1,
cor.y.xi=cor.y.xi,
cor.y.zi=cor.y.zi,
fnr=fnr,
fnr.y0=fnr.y0,
fnr.y1=fnr.y1,
fpr=fpr,
fpr.y0=fpr.y0,
fpr.y1=fpr.y1,
cor.resid.w_pred=cor.resid.w_pred
))
result <- append(result, list(cor.xz=cor(df$x,df$z)))
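## benchmark: OLS on the fully observed simulated data gives the "true" estimates of Bxy and Bzy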
(model.true <- lm(y ~ x + z, data=df))
true.ci.Bxy <- confint(model.true)['x',]
true.ci.Bzy <- confint(model.true)['z',]
result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
Bzy.est.true=coef(model.true)['z'],
Bxy.ci.upper.true = true.ci.Bxy[2],
Bxy.ci.lower.true = true.ci.Bxy[1],
Bzy.ci.upper.true = true.ci.Bzy[2],
Bzy.ci.lower.true = true.ci.Bzy[1]))
## mle_result <- list(
## Bxy.est.naive = NULL,
## Bxy.ci.upper.naive = NULL,
## Bxy.ci.lower.naive = NULL,
## Bzy.est.naive = NULL,
## Bzy.ci.upper.naive = NULL,
## Bzy.ci.lower.naive = NULL,
## Bxy.est.feasible = NULL,
## Bxy.ci.upper.feasible = NULL,
## Bxy.ci.lower.feasible = NULL,
## Bzy.est.feasible = NULL,
## Bzy.ci.upper.feasible = NULL,
## Bzy.ci.lower.feasible = NULL,
## Bxy.est.mle = NULL,
## Bxy.ci.upper.mle = NULL,
## Bxy.ci.lower.mle = NULL,
## Bzy.est.mle = NULL,
## Bzy.ci.upper.mle = NULL,
## Bzy.ci.lower.mle = NULL
## )
## tryCatch({
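## glm_fixit() (from ../misclassificationmodels) returns three fits: "naive" (proxies w, a used
## in place of x, z), "feasible" (presumably the ground-truth subset), and the measurement-error
## "corrected" model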
mod <- glm_fixit(..., family=family, data=data, data2=data2)
coef_corrected <- coef(mod, which_model = "corrected")
ci_corrected <- confint(mod, which_model = "corrected")
coef_feasible <- coef(mod, which_model = "feasible")
ci_feasible <- confint(mod, which_model = "feasible")
coef_naive <- coef(mod, which_model = "naive")
ci_naive <- confint(mod, which_model = "naive")
mle_result <- list(
Bxy.est.mle = coef_corrected["x"],
Bxy.ci.upper.mle = ci_corrected["x", 2],
Bxy.ci.lower.mle = ci_corrected["x", 1],
Bzy.est.mle = coef_corrected["z"],
Bzy.ci.upper.mle = ci_corrected["z", 2],
Bzy.ci.lower.mle = ci_corrected["z", 1],
Bxy.est.feasible = coef_feasible["x"],
Bxy.ci.upper.feasible = ci_feasible["x", 2],
Bxy.ci.lower.feasible = ci_feasible["x",1],
Bzy.est.feasible = coef_feasible["z"],
Bzy.ci.upper.feasible = ci_feasible["z",2],
Bzy.ci.lower.feasible = ci_feasible["z",1],
Bxy.est.naive = coef_naive["w"],
Bxy.ci.upper.naive = ci_naive["w",2],
Bxy.ci.lower.naive = ci_naive["w",1],
Bzy.est.naive = coef_naive["a"],
Bzy.ci.upper.naive = ci_naive["a",2],
Bzy.ci.lower.naive = ci_naive["a",1]
)
## print(mle_result)
## },
## error=function(e) {result[['error']] <- as.character(e)
## })
result <- append(result, mle_result)
return(result)
}

@@ -0,0 +1,12 @@
#!/bin/bash
#SBATCH --account=ECS24013
#SBATCH --partition=skx
#SBATCH --nodes=1
#SBATCH --chdir=/scratch/10114/nathante/critical_mass
#SBATCH --error="sbatch_log/%x_%A_%a.err"
#SBATCH --output="sbatch_log/%x_%A_%a.out"
#SBATCH --nice
#SBATCH --time=48:00:00
source ~/.bashrc
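# run whatever command was passed as arguments to this batch script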
"$@"