add simulation to test multiple IV.

2025-06-05 13:40:35 -07:00
parent 33d758c2dd
commit d9bc8f8f61
5 changed files with 321 additions and 0 deletions
--- a/multiple_iv_simulations/01_indep_differential.R
+++ b/multiple_iv_simulations/01_indep_differential.R
@@ -0,0 +1,132 @@
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+setDTthreads(40)
+
+source("simulation_base.R")
+
+simulate_data <- function(N,
+                          m,
+                          B0,
+                          Bxy,
+                          Bzx,
+                          Bzy,
+                          seed,
+                          y_explained_variance = 0.025,
+                          prediction_accuracy = 0.73,
+                          y_bias = -0.8,
+                          z_bias = 0,
+                          x_bias = 0,
+                          Px = 0.5,
+                          Pz = 0.5,
+                          accuracy_imbalance_difference = 0.3) {
+  set.seed(seed)
+  # make w and y dependent
+  z <- rbinom(N, 1, plogis(qlogis(Pz)))
+  x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
+
+  y.var.epsilon <- (var(Bzy * z) + var(Bxy * x) + 2 * cov(Bzy * z, Bxy * x)) * ((1 - y_explained_variance) / y_explained_variance)
+  y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
+  y <- Bzy * z + Bxy * x + y.epsilon
+
+  df <- data.table(x = x, y = y, z = z)
+
+  if (m < N) {
+    df <- df[sample(nrow(df), m), ":="(x.obs = x, z.obs = z)]
+  } else {
+    df <- df[, ":="(x.obs = x, z.obs = z)]
+  }
+
+  resids <- resid(lm(y ~ x + z))
+  odds.x1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[x == 1])) + z_bias * qlogis(pnorm(z[x == 1], sd(z)))
+  odds.x0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[x == 0])) + z_bias * qlogis(pnorm(z[x == 0], sd(z)))
+
+  ## acc.x0 <- p.correct[df[,x==0]]
+  ## acc.x1 <- p.correct[df[,x==1]]
+
+  df[x == 0, w := plogis(rlogis(.N, odds.x0))]
+  df[x == 1, w := plogis(rlogis(.N, odds.x1))]
+  df[, w_pred := as.integer(w > 0.5)]
+
+  ## now let's use another classifier for z
+  odds.z1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[z == 1])) + x_bias * qlogis(pnorm(x[z == 1], sd(x)))
+  odds.z0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[z == 0])) + x_bias * qlogis(pnorm(x[z == 0], sd(z)))
+
+  df[z == 0, a := plogis(rlogis(.N, odds.z0))]
+  df[z == 1, a := plogis(rlogis(.N, odds.z1))]
+  df[, a_pred := as.integer(a > 0.5)]
+
+
+  print(mean(df$w_pred == df$x))
+  print(mean(df$w_pred == df$x))
+  print(mean(df[y >= 0]$w_pred == df[y >= 0]$x))
+  print(mean(df[y <= 0]$w_pred == df[y <= 0]$x))
+  return(df)
+}
+
+parser <- arg_parser("Simulate data and fit corrected models with two independent variables that are classified")
+parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=500, help="m the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=51, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
+parser <- add_argument(parser, "--y_explained_variance", help='what proportion of the variance of y can be explained?', default=0.1)
+parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.75)
+parser <- add_argument(parser, "--accuracy_imbalance_difference", help='how much more accurate is the predictive model for one class than the other?', default=0.3)
+parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
+parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
+parser <- add_argument(parser, "--Bxy", help='Effect of z on y', default=0.3)
+parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
+parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*z*x")
+parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
+parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
+parser <- add_argument(parser, "--x_bias", help='coefficient of x on the probability a classification is correct', default=0)
+parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
+parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5)
+parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad')
+args <- parse_args(parser)
+
+B0 <- 0
+Px <- args$Px
+Bxy <- args$Bxy
+Bzy <- args$Bzy
+Bzx <- args$Bzx
+
+if(args$m < args$N){
+
+    df <- simulate_data(args$N, args$m, B0, Bxy, Bzx, Bzy, args$seed, args$y_explained_variance, args$prediction_accuracy, y_bias=args$y_bias)
+
+    ## df.pc <- df[,.(x,y,z,w_pred,w)]
+    ##                                     #    df.pc <- df.pc[,err:=x-w_pred]
+    ## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
+    ## plot(pc.df)
+
+    result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, confint_method=args$confint_method, error='')
+
+  research_data <- df[is.na(x.obs), .(w = w_pred, a = a_pred, y = y)] |> as.data.frame()
+
+  val_data <- df[!is.na(x.obs), .(w = w_pred, a = a_pred, z = z.obs, y = y, x = x.obs)] |> as.data.frame()
+
+  outline <- run_simulation(y ~ x || w + z ||
+    a, w ~ x + z + y, a ~ x + z + y, data=research_data,data2=val_data, result = result)
+    
+   
+ outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
+    if(file.exists(args$outfile)){
+        logdata <- read_feather(args$outfile)
+        logdata <- rbind(logdata,as.data.table(outline), fill=TRUE)
+    } else {
+        logdata <- as.data.table(outline)
+    }
+
+    print(outline)
+    write_feather(logdata, args$outfile)
+    unlock(outfile_lock)
+}
--- a/multiple_iv_simulations/Makefile
+++ b/multiple_iv_simulations/Makefile
@@ -0,0 +1,41 @@
+.ONESHELL:
+SHELL=bash
+
+Ns=[1000, 5000, 10000]
+ms=[100, 200, 400]
+seeds=[$(shell seq -s, 1 500)]
+explained_variances=[0.1]
+
+all:main
+main:remembr.RDS 
+
+srun=sbatch --wait --verbose run_job.sbatch
+skx_pylauncher_limit=40
+
+joblists:example_1_jobs example_2_jobs example_3_jobs
+
+grid_sweep_script=../simulations/grid_sweep.py 
+
+example_1_jobs: 01_indep_differential.R simulation_base.R ${grid_sweep_script} skx_1.sbatch ../misclassificationmodels
+	${srun} ${grid_sweep_script} --command "Rscript 01_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"], "y_explained_variance":${explained_variances}, "Bzx":[1]}' --outfile example_1_jobs
+
+example_1.feather: example_1_jobs my_pylauncher.py
+	sbatch -J "multiple iv measerr correction" skx_1.sbatch my_pylauncher.py $< --cores 1
+
+remembr.RDS:example_1.feather 
+# 	rm -f remembr.RDS
+# 	${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
+
+
+clean_main:
+	rm -f example_1_jobs
+	rm -f example_1.feather
+#	
+clean_all: clean_main
+	rm *.feather
+	rm -f remembr.RDS
+	rm -f remembr*.RDS
+	rm -f robustness*.RDS
+	rm -f example_*_jobs
+	rm -f robustness_*_jobs_*
+
--- a/multiple_iv_simulations/my_pylauncher.py
+++ b/multiple_iv_simulations/my_pylauncher.py
@@ -0,0 +1,18 @@
+#!/scratch/projects/compilers/intel24.0/oneapi/intelpython/python3.9/bin/python3
+#^ always use TACC's python to start pylauncher. 
+
+import pylauncher 
+import argparse
+import sys
+
+parser = argparse.ArgumentParser()
+parser.add_argument('jobs', help='list of commands to be run')
+parser.add_argument('--cores', help='cores to use per job', default=1)
+parser.add_argument('--timeout', help='timeout length (seconds)', default=2)
+parser.add_argument('--queuestate', help='pylauncher queuestate (for resuming jobs)',default=None)
+
+args = parser.parse_args()
+if args.queuestate is not None:
+    pylauncher.ResumeClassicLauncher(args.queuestate, cores=args.cores, timeout=args.timeout)
+else:
+    pylauncher.ClassicLauncher(args.jobs, cores=args.cores, timeout=args.timeout)
--- a/multiple_iv_simulations/simulation_base.R
+++ b/multiple_iv_simulations/simulation_base.R
@@ -0,0 +1,118 @@
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+library(matrixStats) # for numerically stable logsumexps
+source("../misclassificationmodels/R/likelihoods.R")
+source("../misclassificationmodels/R/glm_fixit.R")
+
+## ... are the parts of the formular.
+run_simulation <-  function(..., family=gaussian(),
+                      proxy_family = binomial(link='logit'),
+                      truth_family = binomial(link='logit'),
+                      data,
+                      data2,
+                      result
+                      ){
+
+    accuracy <- df[,mean(w_pred==x)]
+    accuracy.x.y0 <- df[y<=0,mean(w_pred==x)]
+    accuracy.x.y1 <- df[y>=0,mean(w_pred==x)]
+    accuracy.z.y0 <- df[y<=0,mean(a_pred==z)]
+    accuracy.z.y1 <- df[y>=0,mean(a_pred==z)]
+    cor.y.xi <- cor(df$x - df$w_pred, df$y)
+    cor.y.zi <- cor(df$z - df$a_pred, df$y)
+
+    fnr <- df[w_pred==0,mean(w_pred!=x)]
+    fnr.y0 <- df[(w_pred==0) & (y<=0),mean(w_pred!=x)]
+    fnr.y1 <- df[(w_pred==0) & (y>=0),mean(w_pred!=x)]
+
+    fpr <- df[w_pred==1,mean(w_pred!=x)]
+    fpr.y0 <- df[(w_pred==1) & (y<=0),mean(w_pred!=x)]
+    fpr.y1 <- df[(w_pred==1) & (y>=0),mean(w_pred!=x)]
+    cor.resid.w_pred <- cor(resid(lm(y~x+z,df)),df$w_pred)
+
+    result <- append(result, list(accuracy=accuracy,
+                                  accuracy.x.y0=accuracy.x.y0,
+                                  accuracy.x.y1=accuracy.x.y1,
+                                  accuracy.z.y0=accuracy.z.y0,
+                                  accuracy.z.y1=accuracy.z.y1,
+                                  cor.y.xi=cor.y.xi,
+                                  fnr=fnr,
+                                  fnr.y0=fnr.y0,
+                                  fnr.y1=fnr.y1,
+                                  fpr=fpr,
+                                  fpr.y0=fpr.y0,
+                                  fpr.y1=fpr.y1,
+                                  cor.resid.w_pred=cor.resid.w_pred
+                                  ))
+
+    result <- append(result, list(cor.xz=cor(df$x,df$z)))
+    (model.true <- lm(y ~ x + z, data=df))
+    true.ci.Bxy <- confint(model.true)['x',]
+    true.ci.Bzy <- confint(model.true)['z',]
+
+    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+                                  Bzy.est.true=coef(model.true)['z'],
+                                  Bxy.ci.upper.true = true.ci.Bxy[2],
+                                  Bxy.ci.lower.true = true.ci.Bxy[1],
+                                  Bzy.ci.upper.true = true.ci.Bzy[2],
+                                  Bzy.ci.lower.true = true.ci.Bzy[1]))
+                                  
+  ## mle_result <- list(
+  ##   Bxy.est.naive = NULL,
+  ##   Bxy.ci.upper.naive = NULL,
+  ##   Bxy.ci.lower.naive = NULL,
+  ##   Bzy.est.naive = NULL,
+  ##   Bzy.ci.upper.naive = NULL,
+  ##   Bzy.ci.lower.naive = NULL,
+  ##   Bxy.est.feasible = NULL,
+  ##   Bxy.ci.upper.feasible = NULL,
+  ##   Bxy.ci.lower.feasible = NULL,
+  ##   Bzy.est.feasible = NULL,
+  ##   Bzy.ci.upper.feasible = NULL,
+  ##   Bzy.ci.lower.feasible = NULL,
+  ##   Bxy.est.mle = NULL,
+  ##   Bxy.ci.upper.mle = NULL,
+  ##   Bxy.ci.lower.mle = NULL,
+  ##   Bzy.est.mle = NULL,
+  ##   Bzy.ci.upper.mle = NULL,
+  ##   Bzy.ci.lower.mle = NULL
+  ## )
+  ## tryCatch({
+    mod <- glm_fixit(..., family=family, data=data, data2=data2)
+  
+    coef_corrected <- coef(mod, which_model = "corrected")
+    ci_corrected <- confint(mod, which_model = "corrected")
+    
+    coef_feasible <- coef(mod, which_model = "feasible")
+    ci_feasible <- confint(mod, which_model = "feasible")
+    
+    coef_naive <- coef(mod, which_model = "naive")
+    ci_naive <- confint(mod, which_model = "naive")
+    
+    mle_result <- list(
+      Bxy.est.mle = coef_corrected["x"],
+      Bxy.ci.upper.mle = ci_corrected["x", 2],
+      Bxy.ci.lower.mle = ci_corrected["x", 1],
+      Bzy.est.mle = coef_corrected["z"],
+      Bzy.ci.upper.mle = ci_corrected["z", 2],
+      Bzy.ci.lower.mle = ci_corrected["z", 1],
+      Bxy.est.feasible = coef_feasible["x"],
+      Bxy.ci.upper.feasible = ci_feasible["x", 2],
+      Bxy.ci.lower.feasible = ci_feasible["x",1],
+      Bzy.est.feasible = coef_feasible["z"],
+      Bzy.ci.upper.feasible = ci_feasible["z",2],
+      Bzy.ci.lower.feasible = ci_feasible["z",1],
+      Bxy.est.naive = coef_naive["w"],
+      Bxy.ci.upper.naive = ci_naive["w",2],
+      Bxy.ci.lower.naive = ci_naive["w",1],
+      Bzy.est.naive = coef_naive["a"],
+      Bzy.ci.upper.naive = ci_naive["a",2],
+      Bzy.ci.lower.naive = ci_naive["a",1]
+    )
+  ##     print(mle_result)
+  ## },
+  ## error=function(e) {result[['error']] <- as.character(e)
+  ## })
+  result <- append(result, mle_result)
+  return(result)
+}
--- a/multiple_iv_simulations/skx_1.sbatch
+++ b/multiple_iv_simulations/skx_1.sbatch
@@ -0,0 +1,12 @@
+#!/bin/bash
+#SBATCH --account=ECS24013
+#SBATCH --partition=skx
+#SBATCH --nodes=1
+#SBATCH --chdir=/scratch/10114/nathante/critical_mass
+#SBATCH --error="sbatch_log/%x_%A_%a.err"
+#SBATCH --output="sbatch_log/%x_%A_%a.out"
+#SBATCH --nice
+#SBATCH --time=48:00:00
+
+source ~/.bashrc
+"$@"