add simulation to test multiple IVs.

This commit is contained in:
Nathan TeBlunthuis 2025-06-05 13:40:35 -07:00
parent 33d758c2dd
commit d9bc8f8f61
5 changed files with 321 additions and 0 deletions

@@ -0,0 +1,132 @@
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
library(predictionError)
options(amelia.parallel="no",
amelia.ncpus=1)
setDTthreads(40)
source("simulation_base.R")
simulate_data <- function(N,
m,
B0,
Bxy,
Bzx,
Bzy,
seed,
y_explained_variance = 0.025,
prediction_accuracy = 0.73,
y_bias = -0.8,
z_bias = 0,
x_bias = 0,
Px = 0.5,
Pz = 0.5,
accuracy_imbalance_difference = 0.3) {
set.seed(seed)
# simulate the true variables; the classifiers constructed below have errors that depend on y
z <- rbinom(N, 1, plogis(qlogis(Pz)))
x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
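# scale the noise so that x and z together explain `y_explained_variance` of Var(y):
# Var(epsilon) = Var(Bzy*z + Bxy*x) * (1 - R^2) / R^2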
y.var.epsilon <- (var(Bzy * z) + var(Bxy * x) + 2 * cov(Bzy * z, Bxy * x)) * ((1 - y_explained_variance) / y_explained_variance)
y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
y <- Bzy * z + Bxy * x + y.epsilon
df <- data.table(x = x, y = y, z = z)
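# only a random subsample of m rows receives ground-truth labels (x.obs, z.obs);
# the remaining rows are treated as unlabeled "research" data further below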
if (m < N) {
df <- df[sample(nrow(df), m), ":="(x.obs = x, z.obs = z)]
} else {
df <- df[, ":="(x.obs = x, z.obs = z)]
}
resids <- resid(lm(y ~ x + z))
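# w is a noisy classifier of x: the log-odds that a case is scored positive (w_pred = 1)
# start at +/- qlogis(prediction_accuracy) depending on x and are shifted by the outcome
# residual (y_bias) and by z (z_bias), so errors can depend on y and z (differential misclassification)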
odds.x1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[x == 1])) + z_bias * qlogis(pnorm(z[x == 1], sd(z)))
odds.x0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[x == 0])) + z_bias * qlogis(pnorm(z[x == 0], sd(z)))
## acc.x0 <- p.correct[df[,x==0]]
## acc.x1 <- p.correct[df[,x==1]]
df[x == 0, w := plogis(rlogis(.N, odds.x0))]
df[x == 1, w := plogis(rlogis(.N, odds.x1))]
df[, w_pred := as.integer(w > 0.5)]
## now let's use another classifier for z
odds.z1 <- qlogis(prediction_accuracy) + y_bias * qlogis(pnorm(resids[z == 1])) + x_bias * qlogis(pnorm(x[z == 1], sd(x)))
odds.z0 <- qlogis(prediction_accuracy, lower.tail = F) + y_bias * qlogis(pnorm(resids[z == 0])) + x_bias * qlogis(pnorm(x[z == 0], sd(x)))
df[z == 0, a := plogis(rlogis(.N, odds.z0))]
df[z == 1, a := plogis(rlogis(.N, odds.z1))]
df[, a_pred := as.integer(a > 0.5)]
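# quick diagnostics: overall accuracy of w_pred, plus its accuracy conditional on the sign of y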
print(mean(df$w_pred == df$x))
print(mean(df[y >= 0]$w_pred == df[y >= 0]$x))
print(mean(df[y <= 0]$w_pred == df[y <= 0]$x))
return(df)
}
parser <- arg_parser("Simulate data and fit corrected models with two independent variables that are classified")
parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
parser <- add_argument(parser, "--m", default=500, help="m the number of ground truth observations")
parser <- add_argument(parser, "--seed", default=51, help='seed for the rng')
parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
parser <- add_argument(parser, "--y_explained_variance", help='what proportion of the variance of y can be explained?', default=0.1)
parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.75)
parser <- add_argument(parser, "--accuracy_imbalance_difference", help='how much more accurate is the predictive model for one class than the other?', default=0.3)
parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
parser <- add_argument(parser, "--Bxy", help='Effect of z on y', default=0.3)
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*z*x")
parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
parser <- add_argument(parser, "--x_bias", help='coefficient of x on the probability a classification is correct', default=0)
parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5)
parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad')
args <- parse_args(parser)
B0 <- 0
Px <- args$Px
Bxy <- args$Bxy
Bzy <- args$Bzy
Bzx <- args$Bzx
if(args$m < args$N){
df <- simulate_data(args$N, args$m, B0, Bxy, Bzx, Bzy, args$seed, args$y_explained_variance, args$prediction_accuracy, y_bias=args$y_bias, z_bias=args$z_bias, x_bias=args$x_bias)
## df.pc <- df[,.(x,y,z,w_pred,w)]
## # df.pc <- df.pc[,err:=x-w_pred]
## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
## plot(pc.df)
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, confint_method=args$confint_method, error='')
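## rows without ground truth form the research data (only the proxies w, a and the outcome y);
## rows with ground truth form the validation data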
research_data <- df[is.na(x.obs), .(w = w_pred, a = a_pred, y = y)] |> as.data.frame()
val_data <- df[!is.na(x.obs), .(w = w_pred, a = a_pred, z = z.obs, y = y, x = x.obs)] |> as.data.frame()
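## a minimal reading of the call below: the multi-part formula y ~ x || w + z || a appears to
## link each error-prone variable (x, z) to its proxy (w, a), and w ~ x + z + y, a ~ x + z + y
## are the proxy (measurement) models handed to glm_fixit() via simulation_base.R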
outline <- run_simulation(y ~ x || w + z || a,
                          w ~ x + z + y,
                          a ~ x + z + y,
                          data = research_data, data2 = val_data, result = result)
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
if(file.exists(args$outfile)){
logdata <- read_feather(args$outfile)
logdata <- rbind(logdata,as.data.table(outline), fill=TRUE)
} else {
logdata <- as.data.table(outline)
}
print(outline)
write_feather(logdata, args$outfile)
unlock(outfile_lock)
}

@@ -0,0 +1,41 @@
.ONESHELL:
SHELL=bash
Ns=[1000, 5000, 10000]
ms=[100, 200, 400]
seeds=[$(shell seq -s, 1 500)]
explained_variances=[0.1]
all:main
main:remembr.RDS
srun=sbatch --wait --verbose run_job.sbatch
skx_pylauncher_limit=40
joblists:example_1_jobs example_2_jobs example_3_jobs
grid_sweep_script=../simulations/grid_sweep.py
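# pipeline: grid_sweep.py writes one "Rscript 01_indep_differential.R ..." command per
# parameter combination to example_1_jobs; my_pylauncher.py then runs that job list under
# SLURM, and each run appends a row to example_1.feather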
example_1_jobs: 01_indep_differential.R simulation_base.R ${grid_sweep_script} skx_1.sbatch ../misclassificationmodels
${srun} ${grid_sweep_script} --command "Rscript 01_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"], "y_explained_variance":${explained_variances}, "Bzx":[1]}' --outfile example_1_jobs
example_1.feather: example_1_jobs my_pylauncher.py
sbatch -J "multiple iv measerr correction" skx_1.sbatch my_pylauncher.py $< --cores 1
remembr.RDS:example_1.feather
# rm -f remembr.RDS
# ${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
clean_main:
rm -f example_1_jobs
rm -f example_1.feather
#
clean_all: clean_main
rm -f *.feather
rm -f remembr.RDS
rm -f remembr*.RDS
rm -f robustness*.RDS
rm -f example_*_jobs
rm -f robustness_*_jobs_*

@@ -0,0 +1,18 @@
#!/scratch/projects/compilers/intel24.0/oneapi/intelpython/python3.9/bin/python3
#^ always use TACC's python to start pylauncher.
import pylauncher
import argparse
import sys
parser = argparse.ArgumentParser()
parser.add_argument('jobs', help='list of commands to be run')
parser.add_argument('--cores', help='cores to use per job', default=1, type=int)
parser.add_argument('--timeout', help='timeout length (seconds)', default=2, type=int)
parser.add_argument('--queuestate', help='pylauncher queuestate (for resuming jobs)',default=None)
args = parser.parse_args()
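# ClassicLauncher reads the job file (one shell command per line) and schedules the commands
# across the allocated cores; --queuestate resumes a previous, partially finished launch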
if args.queuestate is not None:
pylauncher.ResumeClassicLauncher(args.queuestate, cores=args.cores, timeout=args.timeout)
else:
pylauncher.ClassicLauncher(args.jobs, cores=args.cores, timeout=args.timeout)

@@ -0,0 +1,118 @@
options(amelia.parallel="no",
amelia.ncpus=1)
library(matrixStats) # for numerically stable logsumexps
source("../misclassificationmodels/R/likelihoods.R")
source("../misclassificationmodels/R/glm_fixit.R")
## `...` holds the model formulas that are passed straight through to glm_fixit().
run_simulation <- function(..., family=gaussian(),
proxy_family = binomial(link='logit'),
truth_family = binomial(link='logit'),
data,
data2,
result
){
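## note: the accuracy and error-rate diagnostics below read `df` (the full simulated data)
## from the calling environment, while `data`/`data2` are the research and validation
## subsets passed on to glm_fixit()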
accuracy <- df[,mean(w_pred==x)]
accuracy.x.y0 <- df[y<=0,mean(w_pred==x)]
accuracy.x.y1 <- df[y>=0,mean(w_pred==x)]
accuracy.z.y0 <- df[y<=0,mean(a_pred==z)]
accuracy.z.y1 <- df[y>=0,mean(a_pred==z)]
cor.y.xi <- cor(df$x - df$w_pred, df$y)
cor.y.zi <- cor(df$z - df$a_pred, df$y)
fnr <- df[w_pred==0,mean(w_pred!=x)]
fnr.y0 <- df[(w_pred==0) & (y<=0),mean(w_pred!=x)]
fnr.y1 <- df[(w_pred==0) & (y>=0),mean(w_pred!=x)]
fpr <- df[w_pred==1,mean(w_pred!=x)]
fpr.y0 <- df[(w_pred==1) & (y<=0),mean(w_pred!=x)]
fpr.y1 <- df[(w_pred==1) & (y>=0),mean(w_pred!=x)]
cor.resid.w_pred <- cor(resid(lm(y~x+z,df)),df$w_pred)
result <- append(result, list(accuracy=accuracy,
accuracy.x.y0=accuracy.x.y0,
accuracy.x.y1=accuracy.x.y1,
accuracy.z.y0=accuracy.z.y0,
accuracy.z.y1=accuracy.z.y1,
cor.y.xi=cor.y.xi,
cor.y.zi=cor.y.zi,
fnr=fnr,
fnr.y0=fnr.y0,
fnr.y1=fnr.y1,
fpr=fpr,
fpr.y0=fpr.y0,
fpr.y1=fpr.y1,
cor.resid.w_pred=cor.resid.w_pred
))
result <- append(result, list(cor.xz=cor(df$x,df$z)))
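## benchmark: OLS on the fully observed simulated data gives the "true" estimates of Bxy and Bzy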
(model.true <- lm(y ~ x + z, data=df))
true.ci.Bxy <- confint(model.true)['x',]
true.ci.Bzy <- confint(model.true)['z',]
result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
Bzy.est.true=coef(model.true)['z'],
Bxy.ci.upper.true = true.ci.Bxy[2],
Bxy.ci.lower.true = true.ci.Bxy[1],
Bzy.ci.upper.true = true.ci.Bzy[2],
Bzy.ci.lower.true = true.ci.Bzy[1]))
## mle_result <- list(
## Bxy.est.naive = NULL,
## Bxy.ci.upper.naive = NULL,
## Bxy.ci.lower.naive = NULL,
## Bzy.est.naive = NULL,
## Bzy.ci.upper.naive = NULL,
## Bzy.ci.lower.naive = NULL,
## Bxy.est.feasible = NULL,
## Bxy.ci.upper.feasible = NULL,
## Bxy.ci.lower.feasible = NULL,
## Bzy.est.feasible = NULL,
## Bzy.ci.upper.feasible = NULL,
## Bzy.ci.lower.feasible = NULL,
## Bxy.est.mle = NULL,
## Bxy.ci.upper.mle = NULL,
## Bxy.ci.lower.mle = NULL,
## Bzy.est.mle = NULL,
## Bzy.ci.upper.mle = NULL,
## Bzy.ci.lower.mle = NULL
## )
## tryCatch({
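## glm_fixit() (from ../misclassificationmodels) returns three fits: "naive" (proxies w, a used
## in place of x, z), "feasible" (presumably the ground-truth subset), and the measurement-error
## "corrected" model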
mod <- glm_fixit(..., family=family, data=data, data2=data2)
coef_corrected <- coef(mod, which_model = "corrected")
ci_corrected <- confint(mod, which_model = "corrected")
coef_feasible <- coef(mod, which_model = "feasible")
ci_feasible <- confint(mod, which_model = "feasible")
coef_naive <- coef(mod, which_model = "naive")
ci_naive <- confint(mod, which_model = "naive")
mle_result <- list(
Bxy.est.mle = coef_corrected["x"],
Bxy.ci.upper.mle = ci_corrected["x", 2],
Bxy.ci.lower.mle = ci_corrected["x", 1],
Bzy.est.mle = coef_corrected["z"],
Bzy.ci.upper.mle = ci_corrected["z", 2],
Bzy.ci.lower.mle = ci_corrected["z", 1],
Bxy.est.feasible = coef_feasible["x"],
Bxy.ci.upper.feasible = ci_feasible["x", 2],
Bxy.ci.lower.feasible = ci_feasible["x",1],
Bzy.est.feasible = coef_feasible["z"],
Bzy.ci.upper.feasible = ci_feasible["z",2],
Bzy.ci.lower.feasible = ci_feasible["z",1],
Bxy.est.naive = coef_naive["w"],
Bxy.ci.upper.naive = ci_naive["w",2],
Bxy.ci.lower.naive = ci_naive["w",1],
Bzy.est.naive = coef_naive["a"],
Bzy.ci.upper.naive = ci_naive["a",2],
Bzy.ci.lower.naive = ci_naive["a",1]
)
## print(mle_result)
## },
## error=function(e) {result[['error']] <- as.character(e)
## })
result <- append(result, mle_result)
return(result)
}

@@ -0,0 +1,12 @@
#!/bin/bash
#SBATCH --account=ECS24013
#SBATCH --partition=skx
#SBATCH --nodes=1
#SBATCH --chdir=/scratch/10114/nathante/critical_mass
#SBATCH --error="sbatch_log/%x_%A_%a.err"
#SBATCH --output="sbatch_log/%x_%A_%a.out"
#SBATCH --nice
#SBATCH --time=48:00:00
source ~/.bashrc
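# run whatever command was passed as arguments to this batch script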
"$@"