Compare commits
16 Commits
synced/mas
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 214551f74c | |||
| bb6f5e4731 | |||
| d9d3e47a44 | |||
| 6b410335f8 | |||
| ce6d12d4e6 | |||
| c1dbbfd0dd | |||
| 69948cae1e | |||
| acb119418a | |||
| b8d2048cc5 | |||
| c45ea9dfeb | |||
| c066f900d3 | |||
| fa05dbab6b | |||
| d8bc08f18f | |||
| 8ac33c14d7 | |||
| 82fe7b0f48 | |||
| 3d1964b806 |
6
.gitmodules
vendored
6
.gitmodules
vendored
@@ -1,3 +1,9 @@
|
|||||||
[submodule "paper"]
|
[submodule "paper"]
|
||||||
path = paper
|
path = paper
|
||||||
url = git@github.com:chainsawriot/measure.git
|
url = git@github.com:chainsawriot/measure.git
|
||||||
|
[submodule "overleaf"]
|
||||||
|
path = overleaf
|
||||||
|
url = https://git.overleaf.com/62a956eb9b9254783cc84c82
|
||||||
|
[submodule "misclassificationmodels"]
|
||||||
|
path = misclassificationmodels
|
||||||
|
url = https://github.com/chainsawriot/misclassificationmodels.git
|
||||||
|
|||||||
3
Makefile
Normal file
3
Makefile
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
all:
|
||||||
|
+$(MAKE) -C simulations
|
||||||
|
+$(MAKE) -C civil_comments
|
||||||
89
civil_comments/01_dv_example.R
Normal file
89
civil_comments/01_dv_example.R
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
source('load_perspective_data.R')
|
||||||
|
source("../simulations/measerr_methods.R")
|
||||||
|
source("../simulations/RemembR/R/RemembeR.R")
|
||||||
|
|
||||||
|
change.remember.file("dv_perspective_example.RDS")
|
||||||
|
remember(accuracies, "civil_comments_accuracies")
|
||||||
|
remember(f1s, "civil_comments_f1s")
|
||||||
|
remember(positive_cases, "civil_comments_positive_cases")
|
||||||
|
remember(proportions_cases, "civil_comments_proportions_cases")
|
||||||
|
remember(cortab, "civil_comments_cortab")
|
||||||
|
remember(nrow(df), 'n.annotated.comments')
|
||||||
|
# for reproducibility
|
||||||
|
set.seed(111)
|
||||||
|
|
||||||
|
## another simple enough example: is P(toxic | funny and white) > P(toxic | funny nand white)? Or, are funny comments more toxic when people disclose that they are white?
|
||||||
|
|
||||||
|
compare_dv_models <-function(pred_formula, outcome_formula, proxy_formula, df, sample.prop, sample.size, remember_prefix){
|
||||||
|
if(is.null(sample.prop)){
|
||||||
|
sample.prop <- sample.size / nrow(df)
|
||||||
|
}
|
||||||
|
if(is.null(sample.size)){
|
||||||
|
sample.size <- nrow(df) * sample.prop
|
||||||
|
}
|
||||||
|
|
||||||
|
pred_model <- glm(pred_formula, df, family=binomial(link='logit'))
|
||||||
|
|
||||||
|
remember(sample.size, paste0(remember_prefix, "sample.size"))
|
||||||
|
remember(sample.prop, paste0(remember_prefix, "sample.prop"))
|
||||||
|
remember(pred_formula, paste0(remember_prefix, "pred_formula"))
|
||||||
|
remember(outcome_formula, paste0(remember_prefix, 'outcome_formula'))
|
||||||
|
remember(proxy_formula, paste0(remember_prefix, 'proxy_formula'))
|
||||||
|
|
||||||
|
remember(coef(pred_model), paste0(remember_prefix, "coef_pred_model"))
|
||||||
|
remember(diag(vcov((pred_model))), paste0(remember_prefix, "se_pred_model"))
|
||||||
|
|
||||||
|
coder_model <- glm(outcome_formula, df, family=binomial(link='logit'))
|
||||||
|
remember(coef(coder_model), paste0(remember_prefix, "coef_coder_model"))
|
||||||
|
remember(diag(vcov((coder_model))), paste0(remember_prefix, "se_coder_model"))
|
||||||
|
|
||||||
|
df_measerr_method <- copy(df)[sample(1:.N, sample.size), toxicity_coded_1 := toxicity_coded]
|
||||||
|
df_measerr_method <- df_measerr_method[,toxicity_coded := toxicity_coded_1]
|
||||||
|
sample_model <- glm(outcome_formula, df_measerr_method, family=binomial(link='logit'))
|
||||||
|
remember(coef(sample_model), paste0(remember_prefix, "coef_sample_model"))
|
||||||
|
remember(diag(vcov((sample_model))), paste0(remember_prefix, "se_sample_model"))
|
||||||
|
|
||||||
|
measerr_model <- measerr_mle_dv(df_measerr_method, outcome_formula, outcome_family=binomial(link='logit'), proxy_formula=proxy_formula, proxy_family=binomial(link='logit'))
|
||||||
|
|
||||||
|
inv_hessian = solve(measerr_model$hessian)
|
||||||
|
stderr = diag(inv_hessian)
|
||||||
|
remember(stderr, paste0(remember_prefix, "measerr_model_stderr"))
|
||||||
|
remember(measerr_model$par, paste0(remember_prefix, "measerr_model_par"))
|
||||||
|
}
|
||||||
|
|
||||||
|
print("running first example")
|
||||||
|
|
||||||
|
pred_formula = toxicity_pred ~ likes + race_disclosed
|
||||||
|
outcome_formula = toxicity_coded ~ likes + race_disclosed
|
||||||
|
proxy_formula = toxicity_pred ~ toxicity_coded*race_disclosed*likes
|
||||||
|
|
||||||
|
compare_dv_models(pred_formula = pred_formula,
|
||||||
|
outcome_formula = outcome_formula,
|
||||||
|
proxy_formula = proxy_formula,
|
||||||
|
df=df,
|
||||||
|
sample.prop=0.01,
|
||||||
|
sample.size=NULL,
|
||||||
|
remember_prefix='cc_ex_tox.likes.race_disclosed')
|
||||||
|
|
||||||
|
|
||||||
|
print("running second example")
|
||||||
|
|
||||||
|
compare_dv_models(pred_formula = pred_formula,
|
||||||
|
outcome_formula = outcome_formula,
|
||||||
|
proxy_formula = proxy_formula,
|
||||||
|
df=df,
|
||||||
|
sample.size=10000,
|
||||||
|
sample.prop=NULL,
|
||||||
|
remember_prefix='cc_ex_tox.likes.race_disclosed.medsamp')
|
||||||
|
|
||||||
|
|
||||||
|
print("running third example")
|
||||||
|
|
||||||
|
compare_dv_models(pred_formula = pred_formula,
|
||||||
|
outcome_formula = outcome_formula,
|
||||||
|
proxy_formula = proxy_formula,
|
||||||
|
df=df,
|
||||||
|
sample.prop=0.05,
|
||||||
|
sample.size=NULL,
|
||||||
|
remember_prefix='cc_ex_tox.likes.race_disclosed.largesamp')
|
||||||
|
|
||||||
107
civil_comments/02_iv_example.R
Normal file
107
civil_comments/02_iv_example.R
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
source("../simulations/RemembR/R/RemembeR.R")
|
||||||
|
change.remember.file("iv_perspective_example.RDS")
|
||||||
|
|
||||||
|
source('load_perspective_data.R')
|
||||||
|
source("../simulations/measerr_methods.R")
|
||||||
|
|
||||||
|
remember(accuracies, "civil_comments_accuracies")
|
||||||
|
remember(f1s, "civil_comments_f1s")
|
||||||
|
remember(positive_cases, "civil_comments_positive_cases")
|
||||||
|
remember(proportions_cases, "civil_comments_proportions_cases")
|
||||||
|
remember(cortab, "civil_comments_cortab")
|
||||||
|
remember(nrow(df), 'n.annotated.comments')
|
||||||
|
# for reproducibility
|
||||||
|
set.seed(1)
|
||||||
|
|
||||||
|
## another simple enough example: is P(toxic | funny and white) > P(toxic | funny nand white)? Or, are funny comments more toxic when people disclose that they are white?
|
||||||
|
|
||||||
|
compare_iv_models <-function(pred_formula, outcome_formula, proxy_formula, truth_formula, df, sample.prop, sample.size, remember_prefix){
|
||||||
|
|
||||||
|
if(is.null(sample.prop)){
|
||||||
|
sample.prop <- sample.size / nrow(df)
|
||||||
|
}
|
||||||
|
if(is.null(sample.size)){
|
||||||
|
sample.size <- nrow(df) * sample.prop
|
||||||
|
}
|
||||||
|
|
||||||
|
remember(sample.size, paste0(remember_prefix, "sample.size"))
|
||||||
|
remember(sample.prop, paste0(remember_prefix, "sample.prop"))
|
||||||
|
remember(pred_formula, paste0(remember_prefix, "pred_formula"))
|
||||||
|
remember(outcome_formula, paste0(remember_prefix, 'outcome_formula'))
|
||||||
|
remember(proxy_formula, paste0(remember_prefix, 'proxy_formula'))
|
||||||
|
remember(truth_formula, paste0(remember_prefix, 'truth_formula'))
|
||||||
|
|
||||||
|
pred_model <- glm(pred_formula, df, family=binomial(link='logit'))
|
||||||
|
remember(coef(pred_model), paste0(remember_prefix, "coef_pred_model"))
|
||||||
|
remember(diag(vcov((pred_model))), paste0(remember_prefix, "se_pred_model"))
|
||||||
|
|
||||||
|
coder_model <- glm(outcome_formula, df, family=binomial(link='logit'))
|
||||||
|
remember(coef(coder_model), paste0(remember_prefix, "coef_coder_model"))
|
||||||
|
remember(diag(vcov((coder_model))), paste0(remember_prefix, "se_coder_model"))
|
||||||
|
|
||||||
|
df_measerr_method <- copy(df)[sample(1:.N, sample.size), toxicity_coded_1 := toxicity_coded]
|
||||||
|
df_measerr_method <- df_measerr_method[,toxicity_coded := toxicity_coded_1]
|
||||||
|
sample_model <- glm(outcome_formula, df_measerr_method, family=binomial(link='logit'))
|
||||||
|
remember(coef(sample_model), paste0(remember_prefix, "coef_sample_model"))
|
||||||
|
remember(diag(vcov((sample_model))), paste0(remember_prefix, "se_sample_model"))
|
||||||
|
|
||||||
|
measerr_model <- measerr_mle(df_measerr_method, outcome_formula, outcome_family=binomial(link='logit'), proxy_formula=proxy_formula, proxy_family=binomial(link='logit'),truth_formula=truth_formula, truth_family=binomial(link='logit'))
|
||||||
|
|
||||||
|
inv_hessian = solve(measerr_model$hessian)
|
||||||
|
stderr = diag(inv_hessian)
|
||||||
|
remember(stderr, paste0(remember_prefix, "measerr_model_stderr"))
|
||||||
|
remember(measerr_model$par, paste0(remember_prefix, "measerr_model_par"))
|
||||||
|
}
|
||||||
|
|
||||||
|
## print("running first iv example")
|
||||||
|
|
||||||
|
## sample.prop <- 0.05
|
||||||
|
|
||||||
|
## compare_iv_models(white ~ toxicity_pred*funny,
|
||||||
|
## outcome_formula = white ~ toxicity_coded*funny,
|
||||||
|
## proxy_formula = toxicity_pred ~ toxicity_coded*funny*white,
|
||||||
|
## truth_formula = toxicity_coded ~ 1,
|
||||||
|
## df=df,
|
||||||
|
## sample.prop=sample.prop,
|
||||||
|
## remember_prefix='cc_ex_tox.funny.white')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
pred_formula <- race_disclosed ~ likes * toxicity_pred
|
||||||
|
outcome_formula <- race_disclosed ~ likes * toxicity_coded
|
||||||
|
proxy_formula <- toxicity_pred ~ toxicity_coded * race_disclosed * likes
|
||||||
|
truth_formula <- toxicity_coded ~ 1
|
||||||
|
|
||||||
|
print("running first example")
|
||||||
|
|
||||||
|
compare_iv_models(pred_formula = pred_formula,
|
||||||
|
outcome_formula = outcome_formula,
|
||||||
|
proxy_formula = proxy_formula,
|
||||||
|
truth_formula = truth_formula,
|
||||||
|
df=df,
|
||||||
|
sample.prop=0.01,
|
||||||
|
sample.size=NULL,
|
||||||
|
remember_prefix='cc_ex_tox.likes.race_disclosed')
|
||||||
|
|
||||||
|
print("running second example")
|
||||||
|
|
||||||
|
compare_iv_models(pred_formula = pred_formula,
|
||||||
|
outcome_formula = outcome_formula,
|
||||||
|
proxy_formula = proxy_formula,
|
||||||
|
truth_formula = truth_formula,
|
||||||
|
df=df,
|
||||||
|
sample.prop=NULL,
|
||||||
|
sample.size=10000,
|
||||||
|
remember_prefix='cc_ex_tox.likes.race_disclosed.medsamp')
|
||||||
|
|
||||||
|
print("running third example")
|
||||||
|
|
||||||
|
compare_iv_models(pred_formula = race_disclosed ~ likes * toxicity_pred,
|
||||||
|
outcome_formula = race_disclosed ~ likes * toxicity_coded,
|
||||||
|
proxy_formula = toxicity_pred ~ toxicity_coded + race_disclosed,
|
||||||
|
truth_formula = toxicity_coded ~ 1,
|
||||||
|
df=df,
|
||||||
|
sample.prop=0.05,
|
||||||
|
sample.size=NULL,
|
||||||
|
remember_prefix='cc_ex_tox.likes.race_disclosed.largesamp')
|
||||||
|
|
||||||
21
civil_comments/03_prob_not_pred.R
Normal file
21
civil_comments/03_prob_not_pred.R
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
source('load_perspective_data.R')
|
||||||
|
source("../simulations/RemembR/R/RemembeR.R")
|
||||||
|
library(xtable)
|
||||||
|
change.remember.file("prob_not_pred.RDS")
|
||||||
|
### to respond to the reviewer show what happens if we don't recode the predictions.
|
||||||
|
|
||||||
|
non_recoded_dv <- lm(toxicity_prob ~ likes * race_disclosed, data=df)
|
||||||
|
remember(coef(non_recoded_dv), "coef_dv")
|
||||||
|
remember(diag(vcov(non_recoded_dv)), "se_dv")
|
||||||
|
remember(xtable(non_recoded_dv),'dv_xtable')
|
||||||
|
|
||||||
|
non_recoded_iv <- glm(race_disclosed ~ likes * toxicity_prob, data=df, family='binomial')
|
||||||
|
remember(coef(non_recoded_iv), "coef_iv")
|
||||||
|
remember(diag(vcov(non_recoded_iv)), "se_iv")
|
||||||
|
remember(xtable(non_recoded_iv),'iv_xtable')
|
||||||
|
|
||||||
|
remember(extract(non_recoded_iv,include.aic=F,include.bic=F,include.nobs=F,include.deviance=F,include.loglik=F),'non_recoded_iv')
|
||||||
|
remember(extract(non_recoded_dv,include.rsquared=F,include.adjrs=F,include.nobs=F),'non_recoded_dv')
|
||||||
|
tr <- texreg(list(r$non_recoded_iv, r$non_recoded_dv),custom.model.names=c("Example 1","Example 2"),custom.coef.map=list("(Intercept)"="Intercept","race_disclosedTRUE"="Identity Disclosure","toxicity_prob"="Toxicity Score","likes"="Likes","likes:race_disclosedTRUE"="Likes:Identity Disclosure","likes:toxicity_prob"="Likes:Toxicity Score"),single.row=T,dcolumn=T)
|
||||||
|
print(tr)
|
||||||
|
remember(tr, 'texregobj')
|
||||||
14
civil_comments/Makefile
Normal file
14
civil_comments/Makefile
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
qall: iv_perspective_example.RDS dv_perspective_example.RDS
|
||||||
|
|
||||||
|
srun_1core=srun -A comdata -p compute-bigmem --mem=4G --time=12:00:00 -c 1 --pty /usr/bin/bash -l
|
||||||
|
srun=srun -A comdata -p compute-bigmem --mem=4G --time=12:00:00 --pty /usr/bin/bash -l
|
||||||
|
|
||||||
|
perspective_scores.csv: perspective_json_to_csv.sh perspective_results.json
|
||||||
|
$(srun_1core) ./$^ $@
|
||||||
|
|
||||||
|
iv_perspective_example.RDS: 02_iv_example.R perspective_scores.csv
|
||||||
|
$(srun) Rscript $<
|
||||||
|
|
||||||
|
dv_perspective_example.RDS: 01_dv_example.R perspective_scores.csv
|
||||||
|
$(srun) Rscript $<
|
||||||
|
|
||||||
72
civil_comments/design_example.R
Normal file
72
civil_comments/design_example.R
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
set.seed(1111)
|
||||||
|
source('load_perspective_data.R')
|
||||||
|
## how accurate are the classifiers?
|
||||||
|
|
||||||
|
## the API claims that these scores are "probabilities"
|
||||||
|
## say we care about the model of the classification, not the probability
|
||||||
|
|
||||||
|
## toxicity error is weakly correlated pearson's R = 0.1 with both "white" and "black".
|
||||||
|
|
||||||
|
## compare regressions with "white" or "black" as the outcome and "toxicity_coded" or "toxicity_pred" as a predictor.
|
||||||
|
## here's a great example with presumambly non-differential error: about what identities is toxicity found humorous?
|
||||||
|
## a bunch of stars reappear when you used the ground-truth data instead of the predictions.
|
||||||
|
## pro/con of this example: will have to implement family='poisson'.
|
||||||
|
## shouldn't be that bad though haha.
|
||||||
|
cortab['toxicity_error',]
|
||||||
|
cortab['toxicity_error','funny']
|
||||||
|
cortab['toxicity_coded',]
|
||||||
|
cortab['identity_error',]
|
||||||
|
cortab['white',]
|
||||||
|
|
||||||
|
## here's a simple example, is P(white | toxic and mentally ill) > P(white | toxic or mentally ill). Are people who discuss their mental illness in a toxic way more likely to be white compared to those who just talk about their mental illness or are toxic?
|
||||||
|
summary(glm(white ~ toxicity_coded*psychiatric_or_mental_illness, data = df, family=binomial(link='logit')))
|
||||||
|
|
||||||
|
summary(glm(white ~ toxicity_pred*psychiatric_or_mental_illness, data = df, family=binomial(link='logit')))
|
||||||
|
|
||||||
|
summary(glm(white ~ toxicity_coded*male, data = df, family=binomial(link='logit')))
|
||||||
|
|
||||||
|
summary(glm(white ~ toxicity_pred*male, data = df, family=binomial(link='logit')))
|
||||||
|
|
||||||
|
summary(glm(toxicity_coded ~ white*psychiatric_or_mental_illness, data = df, family=binomial(link='logit')))
|
||||||
|
|
||||||
|
summary(glm(toxicity_pred ~ white*psychiatric_or_mental_illness, data = df, family=binomial(link='logit')))
|
||||||
|
|
||||||
|
|
||||||
|
## another simple enough example: is P(toxic | funny and white) > P(toxic | funny nand white)? Or, are funny comments more toxic when people disclose that they are white?
|
||||||
|
|
||||||
|
summary(glm(toxicity_pred ~ funny*white, data=df, family=binomial(link='logit')))
|
||||||
|
summary(glm(toxicity_coded ~ funny*white, data=df, family=binomial(link='logit')))
|
||||||
|
|
||||||
|
source("../simulations/measerr_methods.R")
|
||||||
|
|
||||||
|
saved_model_file <- "measerr_model_tox.eq.funny.cross.white.RDS"
|
||||||
|
overwrite_model <- TRUE
|
||||||
|
|
||||||
|
# it works so far with a 20% and 15% sample. Smaller is better. let's try a 10% sample again. It didn't work out. We'll go forward with a 15% sample.
|
||||||
|
df_measerr_method <- copy(df)[sample(1:.N, 0.05 * .N), toxicity_coded_1 := toxicity_coded]
|
||||||
|
df_measerr_method <- df_measerr_method[,toxicity_coded := toxicity_coded_1]
|
||||||
|
summary(glm(toxicity_coded ~ funny*white, data=df_measerr_method[!is.na(toxicity_coded)], family=binomial(link='logit')))
|
||||||
|
|
||||||
|
if(!file.exists(saved_model_file) || (overwrite_model == TRUE)){
|
||||||
|
measerr_model <- measerr_mle_dv(df_measerr_method,toxicity_coded ~ funny*white,outcome_family=binomial(link='logit'), proxy_formula=toxicity_pred ~ toxicity_coded*funny*white)
|
||||||
|
saveRDS(measerr_model, saved_model_file)
|
||||||
|
} else {
|
||||||
|
measerr_model <- readRDS(saved_model_file)
|
||||||
|
}
|
||||||
|
|
||||||
|
inv_hessian <- solve(measerr_model$hessian)
|
||||||
|
se <- diag(inv_hessian)
|
||||||
|
|
||||||
|
lm2 <- glm.nb(funny ~ (male + female + transgender + other_gender + heterosexual + bisexual + other_sexual_orientation + christian + jewish + hindu + buddhist + atheist + other_religion + asian + latino + other_race_or_ethnicity + physical_disability + intellectual_or_learning_disability + white + black + psychiatric_or_mental_illness)*toxicity_pred, data = df)
|
||||||
|
m3 <- glm.nb(funny ~ (male + female + transgender + other_gender + heterosexual + bisexual + other_sexual_orientation + christian + jewish + hindu + buddhist + atheist + other_religion + asian + latino + other_race_or_ethnicity + physical_disability + intellectual_or_learning_disability + white + black + psychiatric_or_mental_illness)*toxicity, data = df)
|
||||||
|
|
||||||
|
|
||||||
|
glm(white ~ disagree, data = df, family=binomial(link='logit'))
|
||||||
|
|
||||||
|
## example with differential error
|
||||||
|
|
||||||
|
glm(white ~ toxicity_coded + toxicity_error, data=df,family=binomial(link='logit'))
|
||||||
|
|
||||||
|
glm(toxicity_coded ~ white, data = df, family=binomial(link='logit'))
|
||||||
|
|
||||||
|
glm(toxicity_pred ~ white, data = df, family=binomial(link='logit'))
|
||||||
136
civil_comments/load_perspective_data.R
Normal file
136
civil_comments/load_perspective_data.R
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
library(data.table)
|
||||||
|
library(MASS)
|
||||||
|
|
||||||
|
set.seed(1111)
|
||||||
|
|
||||||
|
scores <- fread("perspective_scores.csv")
|
||||||
|
scores <- scores[,id:=as.character(id)]
|
||||||
|
|
||||||
|
df <- fread("all_data.csv")
|
||||||
|
|
||||||
|
# only use the data that has identity annotations
|
||||||
|
df <- df[identity_annotator_count > 0]
|
||||||
|
|
||||||
|
(df[!(df$id %in% scores$id)])
|
||||||
|
|
||||||
|
df <- df[scores,on='id',nomatch=NULL]
|
||||||
|
|
||||||
|
df[, ":="(identity_attack_pred = identity_attack_prob >=0.5,
|
||||||
|
insult_pred = insult_prob >= 0.5,
|
||||||
|
profanity_pred = profanity_prob >= 0.5,
|
||||||
|
severe_toxicity_pred = severe_toxicity_prob >= 0.5,
|
||||||
|
threat_pred = threat_prob >= 0.5,
|
||||||
|
toxicity_pred = toxicity_prob >= 0.5,
|
||||||
|
identity_attack_coded = identity_attack >= 0.5,
|
||||||
|
insult_coded = insult >= 0.5,
|
||||||
|
profanity_coded = obscene >= 0.5,
|
||||||
|
severe_toxicity_coded = severe_toxicity >= 0.5,
|
||||||
|
threat_coded = threat >= 0.5,
|
||||||
|
toxicity_coded = toxicity >= 0.5
|
||||||
|
)]
|
||||||
|
|
||||||
|
gt.0.5 <- function(v) { v >= 0.5 }
|
||||||
|
dt.apply.any <- function(fun, ...){apply(apply(cbind(...), 2, fun),1,any)}
|
||||||
|
|
||||||
|
df <- df[,":="(gender_disclosed = dt.apply.any(gt.0.5, male, female, transgender, other_gender),
|
||||||
|
sexuality_disclosed = dt.apply.any(gt.0.5, heterosexual, bisexual, other_sexual_orientation),
|
||||||
|
religion_disclosed = dt.apply.any(gt.0.5, christian, jewish, hindu, buddhist, atheist, muslim, other_religion),
|
||||||
|
race_disclosed = dt.apply.any(gt.0.5, white, black, asian, latino, other_race_or_ethnicity),
|
||||||
|
disability_disclosed = dt.apply.any(gt.0.5,physical_disability, intellectual_or_learning_disability, psychiatric_or_mental_illness, other_disability))]
|
||||||
|
|
||||||
|
df <- df[,white:=gt.0.5(white)]
|
||||||
|
|
||||||
|
|
||||||
|
F1 <- function(y, predictions){
|
||||||
|
tp <- sum( (predictions == y) & (predictions==1))
|
||||||
|
fn <- sum( (predictions != y) & (predictions!=1))
|
||||||
|
fp <- sum( (predictions != y) & (predictions==1))
|
||||||
|
precision <- tp / (tp + fp)
|
||||||
|
recall <- tp / (tp + fn)
|
||||||
|
return (2 * precision * recall ) / (precision + recall)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
## toxicity is about 93% accurate, with an f1 of 0.8
|
||||||
|
## identity_attack has high accuracy 97%, but an unfortunant f1 of 0.5.
|
||||||
|
## threat has high accuracy 99%, but a really bad looking f1 of 0.48.
|
||||||
|
accuracies <- df[,.(identity_attack_acc = mean(identity_attack_pred == identity_attack_coded),
|
||||||
|
insult_pred_acc = mean(insult_pred == insult_coded),
|
||||||
|
profanity_acc = mean(profanity_pred == profanity_coded),
|
||||||
|
severe_toxicity_acc = mean(severe_toxicity_pred == severe_toxicity_coded),
|
||||||
|
theat_acc = mean(threat_pred == threat_coded),
|
||||||
|
toxicity_acc = mean(toxicity_pred == toxicity_coded))]
|
||||||
|
|
||||||
|
f1s <- df[,.(identity_attack_f1 = F1(identity_attack_coded,identity_attack_pred),
|
||||||
|
insult_f1 = F1(insult_coded,insult_pred),
|
||||||
|
profanity_f1 = F1(profanity_coded,profanity_pred),
|
||||||
|
severe_toxicity_f1 = F1(severe_toxicity_coded,severe_toxicity_pred),
|
||||||
|
theat_f1 = F1(threat_coded,threat_pred),
|
||||||
|
toxicity_f1 = F1(toxicity_coded,toxicity_pred))]
|
||||||
|
|
||||||
|
positive_cases <- df[,.(identity_attacks = sum(identity_attack_coded),
|
||||||
|
insults = sum(insult_coded),
|
||||||
|
profanities = sum(profanity_coded),
|
||||||
|
severe_toxic_comments = sum(severe_toxicity_coded),
|
||||||
|
threats = sum(threat_coded),
|
||||||
|
toxic_comments = sum(toxicity_coded))]
|
||||||
|
|
||||||
|
## there are 50,000 toxic comments, 13000 identity attacks, 30000 insults, 3000 profanities, 8 severe toxic, and 1000 threats.
|
||||||
|
|
||||||
|
proportions_cases <- df[,.(prop_identity = mean(identity_attack_coded),
|
||||||
|
prop_insults = mean(insult_coded),
|
||||||
|
prop_profanity = mean(profanity_coded),
|
||||||
|
prop_severe = mean(severe_toxicity_coded),
|
||||||
|
prop_threats = mean(threat_coded),
|
||||||
|
prop_toxic = mean(toxicity_coded))]
|
||||||
|
|
||||||
|
## at 11% of comments, "toxicity" seems not so badly skewed. Try toxicity first, and if it doesn't work out try insults.
|
||||||
|
|
||||||
|
## now look for an example where differential error affects an identity, or a reaction.
|
||||||
|
df <- df[,":="(identity_error = identity_attack_coded - identity_attack_pred,
|
||||||
|
insult_error = insult_coded - insult_pred,
|
||||||
|
profanity_error = profanity_coded - profanity_pred,
|
||||||
|
severe_toxic_error = severe_toxicity_coded - severe_toxicity_pred,
|
||||||
|
threat_error = threat_coded - threat_pred,
|
||||||
|
toxicity_error = toxicity_coded - toxicity_pred)]
|
||||||
|
|
||||||
|
## what's correlated with toxicity_error ?
|
||||||
|
df <- df[,approved := rating == "approved"]
|
||||||
|
df <- df[,white := white > 0.5]
|
||||||
|
|
||||||
|
cortab <- cor(df[,.(toxicity_error,
|
||||||
|
identity_error,
|
||||||
|
toxicity_coded,
|
||||||
|
funny,
|
||||||
|
approved,
|
||||||
|
sad,
|
||||||
|
wow,
|
||||||
|
likes,
|
||||||
|
disagree,
|
||||||
|
male,
|
||||||
|
female,
|
||||||
|
transgender,
|
||||||
|
other_gender,
|
||||||
|
heterosexual,
|
||||||
|
bisexual,
|
||||||
|
other_sexual_orientation,
|
||||||
|
christian,
|
||||||
|
jewish,
|
||||||
|
hindu,
|
||||||
|
buddhist,
|
||||||
|
atheist,
|
||||||
|
other_religion,
|
||||||
|
black,
|
||||||
|
white,
|
||||||
|
asian,
|
||||||
|
latino,
|
||||||
|
other_race_or_ethnicity,
|
||||||
|
physical_disability,
|
||||||
|
intellectual_or_learning_disability,
|
||||||
|
psychiatric_or_mental_illness,
|
||||||
|
other_disability,
|
||||||
|
gender_disclosed,
|
||||||
|
sexuality_disclosed,
|
||||||
|
religion_disclosed,
|
||||||
|
race_disclosed,
|
||||||
|
disability_disclosed)])
|
||||||
2
civil_comments/perspective_json_to_csv.jq
Executable file
2
civil_comments/perspective_json_to_csv.jq
Executable file
@@ -0,0 +1,2 @@
|
|||||||
|
#!/usr/bin/bash
|
||||||
|
cat $1 | jq "[.attributeScores.IDENTITY_ATTACK.summaryScore.value, .attributeScores.INSULT.summaryScore.value, .attributeScores.PROFANITY.summaryScore.value,.attributeScores.SEVERE_TOXICITY.summaryScore.value, .attributeScores.THREAT.summaryScore.value, .attributeScores.TOXICITY.summaryScore.value] | @csv" > $2
|
||||||
4
civil_comments/perspective_json_to_csv.sh
Executable file
4
civil_comments/perspective_json_to_csv.sh
Executable file
@@ -0,0 +1,4 @@
|
|||||||
|
#!/usr/bin/bash
|
||||||
|
header=id,identity_attack_prob,insult_prob,profanity_prob,severe_toxicity_prob,threat_prob,toxicity_prob
|
||||||
|
echo "$header" > $2
|
||||||
|
cat $1 | jq -r '[.id, .attributeScores.IDENTITY_ATTACK.summaryScore.value, .attributeScores.INSULT.summaryScore.value, .attributeScores.PROFANITY.summaryScore.value,.attributeScores.SEVERE_TOXICITY.summaryScore.value, .attributeScores.THREAT.summaryScore.value, .attributeScores.TOXICITY.summaryScore.value] | @csv' >> $2
|
||||||
1
misclassificationmodels
Submodule
1
misclassificationmodels
Submodule
Submodule misclassificationmodels added at 1dc39a1af5
1
overleaf
Submodule
1
overleaf
Submodule
Submodule overleaf added at c5e0a01713
@@ -30,11 +30,11 @@ source("simulation_base.R")
|
|||||||
#### how much power do we get from the model in the first place? (sweeping N and m)
|
#### how much power do we get from the model in the first place? (sweeping N and m)
|
||||||
####
|
####
|
||||||
|
|
||||||
simulate_data <- function(N, m, B0=0, Bxy=0.2, Bzy=-0.2, Bzx=0.2, y_explained_variance=0.025, prediction_accuracy=0.73, seed=1){
|
simulate_data <- function(N, m, B0=0, Bxy=0.2, Bzy=-0.2, Bzx=0.2, y_explained_variance=0.025, prediction_accuracy=0.73, Px=0.5, seed=1){
|
||||||
set.seed(seed)
|
set.seed(seed)
|
||||||
z <- rnorm(N,sd=0.5)
|
z <- rnorm(N,sd=0.5)
|
||||||
# x.var.epsilon <- var(Bzx *z) * ((1-zx_explained_variance)/zx_explained_variance)
|
# x.var.epsilon <- var(Bzx *z) * ((1-zx_explained_variance)/zx_explained_variance)
|
||||||
xprime <- Bzx * z #+ x.var.epsilon
|
xprime <- Bzx * z + qlogis(Px)
|
||||||
x <- rbinom(N,1,plogis(xprime))
|
x <- rbinom(N,1,plogis(xprime))
|
||||||
|
|
||||||
y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bxy*x,Bzy*z)) * ((1-y_explained_variance)/y_explained_variance)
|
y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bxy*x,Bzy*z)) * ((1-y_explained_variance)/y_explained_variance)
|
||||||
@@ -78,18 +78,21 @@ parser <- add_argument(parser, "--truth_formula", help='formula for the true var
|
|||||||
parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
|
parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
|
||||||
parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
|
parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
|
||||||
parser <- add_argument(parser, "--Bxy", help='Effect of x on y', default=0.3)
|
parser <- add_argument(parser, "--Bxy", help='Effect of x on y', default=0.3)
|
||||||
|
parser <- add_argument(parser, "--Px", help='Base rate of x', default=0.5)
|
||||||
|
parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad')
|
||||||
|
|
||||||
args <- parse_args(parser)
|
args <- parse_args(parser)
|
||||||
B0 <- 0
|
B0 <- 0
|
||||||
|
Px <- args$Px
|
||||||
Bxy <- args$Bxy
|
Bxy <- args$Bxy
|
||||||
Bzy <- args$Bzy
|
Bzy <- args$Bzy
|
||||||
Bzx <- args$Bzx
|
Bzx <- args$Bzx
|
||||||
|
|
||||||
df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, seed=args$seed + 500, y_explained_variance = args$y_explained_variance, prediction_accuracy=args$prediction_accuracy)
|
df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, Px, seed=args$seed + 500, y_explained_variance = args$y_explained_variance, prediction_accuracy=args$prediction_accuracy)
|
||||||
|
|
||||||
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=Bzx, 'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
|
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, confint_method=args$confint_method,error='')
|
||||||
|
|
||||||
outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula))
|
outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula),confint_method=args$confint_method)
|
||||||
|
|
||||||
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
||||||
if(file.exists(args$outfile)){
|
if(file.exists(args$outfile)){
|
||||||
|
|||||||
@@ -31,11 +31,11 @@ source("simulation_base.R")
|
|||||||
|
|
||||||
## one way to do it is by adding correlation to x.obs and y that isn't in w.
|
## one way to do it is by adding correlation to x.obs and y that isn't in w.
|
||||||
## in other words, the model is missing an important feature of x.obs that's related to y.
|
## in other words, the model is missing an important feature of x.obs that's related to y.
|
||||||
simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,accuracy_imbalance_difference=0.3){
|
simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,Px=0.5,accuracy_imbalance_difference=0.3){
|
||||||
set.seed(seed)
|
set.seed(seed)
|
||||||
# make w and y dependent
|
# make w and y dependent
|
||||||
z <- rnorm(N,sd=0.5)
|
z <- rnorm(N,sd=0.5)
|
||||||
x <- rbinom(N, 1, plogis(Bzx * z))
|
x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
|
||||||
|
|
||||||
y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bzy*z,Bxy*x)) * ((1-y_explained_variance)/y_explained_variance)
|
y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bzy*z,Bxy*x)) * ((1-y_explained_variance)/y_explained_variance)
|
||||||
y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
|
y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
|
||||||
@@ -104,9 +104,10 @@ simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.
|
|||||||
## print(mean(df[z==1]$x == df[z==1]$w_pred))
|
## print(mean(df[z==1]$x == df[z==1]$w_pred))
|
||||||
## print(mean(df$w_pred == df$x))
|
## print(mean(df$w_pred == df$x))
|
||||||
|
|
||||||
|
|
||||||
resids <- resid(lm(y~x + z))
|
resids <- resid(lm(y~x + z))
|
||||||
odds.x1 <- qlogis(prediction_accuracy) + y_bias*qlogis(pnorm(resids[x==1])) + z_bias * qlogis(pnorm(z,sd(z)))
|
odds.x1 <- qlogis(prediction_accuracy) + y_bias*qlogis(pnorm(resids[x==1])) + z_bias * qlogis(pnorm(z[x==1],sd(z)))
|
||||||
odds.x0 <- qlogis(prediction_accuracy,lower.tail=F) + y_bias*qlogis(pnorm(resids[x==0])) + z_bias * qlogis(pnorm(z,sd(z)))
|
odds.x0 <- qlogis(prediction_accuracy,lower.tail=F) + y_bias*qlogis(pnorm(resids[x==0])) + z_bias * qlogis(pnorm(z[x==0],sd(z)))
|
||||||
|
|
||||||
## acc.x0 <- p.correct[df[,x==0]]
|
## acc.x0 <- p.correct[df[,x==0]]
|
||||||
## acc.x1 <- p.correct[df[,x==1]]
|
## acc.x1 <- p.correct[df[,x==1]]
|
||||||
@@ -139,10 +140,12 @@ parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy va
|
|||||||
parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
|
parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
|
||||||
parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
|
parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
|
||||||
parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
|
parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
|
||||||
|
parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5)
|
||||||
|
parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad')
|
||||||
args <- parse_args(parser)
|
args <- parse_args(parser)
|
||||||
|
|
||||||
B0 <- 0
|
B0 <- 0
|
||||||
|
Px <- args$Px
|
||||||
Bxy <- args$Bxy
|
Bxy <- args$Bxy
|
||||||
Bzy <- args$Bzy
|
Bzy <- args$Bzy
|
||||||
Bzx <- args$Bzx
|
Bzx <- args$Bzx
|
||||||
@@ -156,9 +159,9 @@ if(args$m < args$N){
|
|||||||
## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
|
## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
|
||||||
## plot(pc.df)
|
## plot(pc.df)
|
||||||
|
|
||||||
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, Bzx=args$Bzx, 'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, error='')
|
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, confint_method=args$confint_method, error='')
|
||||||
|
|
||||||
outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula))
|
outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula),confint_method=args$confint_method)
|
||||||
|
|
||||||
|
|
||||||
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
||||||
|
|||||||
@@ -73,15 +73,17 @@ parser <- add_argument(parser, "--y_explained_variance", help='what proportion o
|
|||||||
parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.72)
|
parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.72)
|
||||||
## parser <- add_argument(parser, "--x_bias_y1", help='how is the classifier biased when y = 1?', default=-0.75)
|
## parser <- add_argument(parser, "--x_bias_y1", help='how is the classifier biased when y = 1?', default=-0.75)
|
||||||
## parser <- add_argument(parser, "--x_bias_y0", help='how is the classifier biased when y = 0 ?', default=0.75)
|
## parser <- add_argument(parser, "--x_bias_y0", help='how is the classifier biased when y = 0 ?', default=0.75)
|
||||||
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.01)
|
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.3)
|
||||||
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.01)
|
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.3)
|
||||||
parser <- add_argument(parser, "--Bzx", help='coeffficient of z on x', default=-0.5)
|
parser <- add_argument(parser, "--Bzx", help='coeffficient of z on x', default=-0.5)
|
||||||
|
parser <- add_argument(parser, "--B0", help='Base rate of y', default=0.5)
|
||||||
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
|
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
|
||||||
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y")
|
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y")
|
||||||
|
parser <- add_argument(parser, "--confint_method", help='method for getting confidence intervals', default="quad")
|
||||||
|
|
||||||
args <- parse_args(parser)
|
args <- parse_args(parser)
|
||||||
|
|
||||||
B0 <- 0
|
B0 <- args$B0
|
||||||
Bxy <- args$Bxy
|
Bxy <- args$Bxy
|
||||||
Bzy <- args$Bzy
|
Bzy <- args$Bzy
|
||||||
Bzx <- args$Bzx
|
Bzx <- args$Bzx
|
||||||
@@ -90,9 +92,9 @@ if(args$m < args$N){
|
|||||||
df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, args$seed, args$prediction_accuracy)
|
df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, Bzx, args$seed, args$prediction_accuracy)
|
||||||
|
|
||||||
# result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'x_bias_y0'=args$x_bias_y0,'x_bias_y1'=args$x_bias_y1,'outcome_formula' = args$outcome_formula, 'proxy_formula' = args$proxy_formula)
|
# result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'x_bias_y0'=args$x_bias_y0,'x_bias_y1'=args$x_bias_y1,'outcome_formula' = args$outcome_formula, 'proxy_formula' = args$proxy_formula)
|
||||||
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bzy'=Bzy, 'Bzx'=Bzx,'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'outcome_formula' = args$outcome_formula, 'proxy_formula' = args$proxy_formula)
|
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bzy'=Bzy, 'Bzx'=Bzx,'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'outcome_formula' = args$outcome_formula, 'proxy_formula' = args$proxy_formula, 'confint_method'=args$confint_method)
|
||||||
|
|
||||||
outline <- run_simulation_depvar(df, result, outcome_formula = as.formula(args$outcome_formula), proxy_formula = as.formula(args$proxy_formula))
|
outline <- run_simulation_depvar(df, result, outcome_formula = as.formula(args$outcome_formula), proxy_formula = as.formula(args$proxy_formula), confint_method=args$confint_method)
|
||||||
|
|
||||||
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
||||||
|
|
||||||
|
|||||||
@@ -31,14 +31,14 @@ source("simulation_base.R")
|
|||||||
|
|
||||||
## one way to do it is by adding correlation to x.obs and y that isn't in w.
|
## one way to do it is by adding correlation to x.obs and y that isn't in w.
|
||||||
## in other words, the model is missing an important feature of x.obs that's related to y.
|
## in other words, the model is missing an important feature of x.obs that's related to y.
|
||||||
simulate_data <- function(N, m, B0, Bxy, Bzy, seed, prediction_accuracy=0.73, x_bias=-0.75){
|
simulate_data <- function(N, m, B0, Bxy, Bzy, Py, seed, prediction_accuracy=0.73, x_bias=-0.75){
|
||||||
set.seed(seed)
|
set.seed(seed)
|
||||||
|
|
||||||
# make w and y dependent
|
# make w and y dependent
|
||||||
z <- rbinom(N, 1, 0.5)
|
z <- rbinom(N, 1, 0.5)
|
||||||
x <- rbinom(N, 1, 0.5)
|
x <- rbinom(N, 1, 0.5)
|
||||||
|
|
||||||
ystar <- Bzy * z + Bxy * x + B0
|
ystar <- Bzy * z + Bxy * x + B0 + qlogix(Py)
|
||||||
y <- rbinom(N,1,plogis(ystar))
|
y <- rbinom(N,1,plogis(ystar))
|
||||||
|
|
||||||
# glm(y ~ x + z, family="binomial")
|
# glm(y ~ x + z, family="binomial")
|
||||||
@@ -77,6 +77,7 @@ parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is th
|
|||||||
parser <- add_argument(parser, "--x_bias", help='how is the classifier biased?', default=0.75)
|
parser <- add_argument(parser, "--x_bias", help='how is the classifier biased?', default=0.75)
|
||||||
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.3)
|
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.3)
|
||||||
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.3)
|
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.3)
|
||||||
|
parser <- add_argument(parser, "--Py", help='Base rate of y', default=0.5)
|
||||||
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
|
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
|
||||||
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*x")
|
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*x")
|
||||||
|
|
||||||
|
|||||||
185
simulations/03_indep_differential_nonnorm.R
Normal file
185
simulations/03_indep_differential_nonnorm.R
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
### EXAMPLE 1: demonstrates how measurement error can lead to a type sign error in a covariate
|
||||||
|
### What kind of data invalidates fong + tyler?
|
||||||
|
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
|
||||||
|
### Even when you include the proxy variable in the regression.
|
||||||
|
### But with some ground truth and multiple imputation, you can fix it.
|
||||||
|
|
||||||
|
library(argparser)
|
||||||
|
library(mecor)
|
||||||
|
library(ggplot2)
|
||||||
|
library(data.table)
|
||||||
|
library(filelock)
|
||||||
|
library(arrow)
|
||||||
|
library(Amelia)
|
||||||
|
library(Zelig)
|
||||||
|
library(predictionError)
|
||||||
|
options(amelia.parallel="no",
|
||||||
|
amelia.ncpus=1)
|
||||||
|
setDTthreads(40)
|
||||||
|
|
||||||
|
source("simulation_base.R")
|
||||||
|
|
||||||
|
## SETUP:
|
||||||
|
### we want to estimate x -> y; x is MAR
|
||||||
|
### we have x -> k; k -> w; x -> w is used to predict x via the model w.
|
||||||
|
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
|
||||||
|
### The labels x are binary, but the model provides a continuous predictor
|
||||||
|
|
||||||
|
### simulation:
|
||||||
|
#### how much power do we get from the model in the first place? (sweeping N and m)
|
||||||
|
####
|
||||||
|
|
||||||
|
## one way to do it is by adding correlation to x.obs and y that isn't in w.
|
||||||
|
## in other words, the model is missing an important feature of x.obs that's related to y.
|
||||||
|
simulate_data <- function(N, m, B0, Bxy, Bzx, Bzy, seed, y_explained_variance=0.025, prediction_accuracy=0.73, y_bias=-0.8,z_bias=0,Px=0.5,accuracy_imbalance_difference=0.3,sd_y_mixin=1){
|
||||||
|
set.seed(seed)
|
||||||
|
# make w and y dependent
|
||||||
|
z <- rnorm(N,sd=0.5)
|
||||||
|
x <- rbinom(N, 1, plogis(Bzx * z + qlogis(Px)))
|
||||||
|
## following Fong + Tyler: mix y with a Bernoulli(0.15) × |N (0, 20)| to make a skewed non-normal distribution
|
||||||
|
y.var.epsilon <- (var(Bzy * z) + var(Bxy *x) + 2*cov(Bzy*z,Bxy*x)) * ((1-y_explained_variance)/y_explained_variance)
|
||||||
|
y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
|
||||||
|
y <- Bzy * z + Bxy * x + y.epsilon + rbinom(N,1,0.15) * rnorm(N,0,sd_y_mixin)
|
||||||
|
|
||||||
|
df <- data.table(x=x,y=y,z=z)
|
||||||
|
|
||||||
|
if(m < N){
|
||||||
|
df <- df[sample(nrow(df), m), x.obs := x]
|
||||||
|
} else {
|
||||||
|
df <- df[, x.obs := x]
|
||||||
|
}
|
||||||
|
|
||||||
|
## probablity of an error is correlated with y
|
||||||
|
## pz <- mean(z)
|
||||||
|
## accuracy_imbalance_ratio <- (prediction_accuracy + accuracy_imbalance_difference/2) / (prediction_accuracy - accuracy_imbalance_difference/2)
|
||||||
|
|
||||||
|
## # this works because of conditional probability
|
||||||
|
## accuracy_z0 <- prediction_accuracy / (pz*(accuracy_imbalance_ratio) + (1-pz))
|
||||||
|
## accuracy_z1 <- accuracy_imbalance_ratio * accuracy_z0
|
||||||
|
|
||||||
|
## z0x0 <- df[(z==0) & (x==0)]$x
|
||||||
|
## z0x1 <- df[(z==0) & (x==1)]$x
|
||||||
|
## z1x0 <- df[(z==1) & (x==0)]$x
|
||||||
|
## z1x1 <- df[(z==1) & (x==1)]$x
|
||||||
|
|
||||||
|
## yz0x0 <- df[(z==0) & (x==0)]$y
|
||||||
|
## yz0x1 <- df[(z==0) & (x==1)]$y
|
||||||
|
## yz1x0 <- df[(z==1) & (x==0)]$y
|
||||||
|
## yz1x1 <- df[(z==1) & (x==1)]$y
|
||||||
|
|
||||||
|
## nz0x0 <- nrow(df[(z==0) & (x==0)])
|
||||||
|
## nz0x1 <- nrow(df[(z==0) & (x==1)])
|
||||||
|
## nz1x0 <- nrow(df[(z==1) & (x==0)])
|
||||||
|
## nz1x1 <- nrow(df[(z==1) & (x==1)])
|
||||||
|
|
||||||
|
## yz1 <- df[z==1]$y
|
||||||
|
## yz1 <- df[z==1]$y
|
||||||
|
|
||||||
|
## # tranform yz0.1 into a logistic distribution with mean accuracy_z0
|
||||||
|
## acc.z0x0 <- plogis(0.5*scale(yz0x0) + qlogis(accuracy_z0))
|
||||||
|
## acc.z0x1 <- plogis(0.5*scale(yz0x1) + qlogis(accuracy_z0))
|
||||||
|
## acc.z1x0 <- plogis(1.5*scale(yz1x0) + qlogis(accuracy_z1))
|
||||||
|
## acc.z1x1 <- plogis(1.5*scale(yz1x1) + qlogis(accuracy_z1))
|
||||||
|
|
||||||
|
## w0z0x0 <- (1-z0x0)**2 + (-1)**(1-z0x0) * acc.z0x0
|
||||||
|
## w0z0x1 <- (1-z0x1)**2 + (-1)**(1-z0x1) * acc.z0x1
|
||||||
|
## w0z1x0 <- (1-z1x0)**2 + (-1)**(1-z1x0) * acc.z1x0
|
||||||
|
## w0z1x1 <- (1-z1x1)**2 + (-1)**(1-z1x1) * acc.z1x1
|
||||||
|
|
||||||
|
## ##perrorz0 <- w0z0*(pyz0)
|
||||||
|
## ##perrorz1 <- w0z1*(pyz1)
|
||||||
|
|
||||||
|
## w0z0x0.noisy.odds <- rlogis(nz0x0,qlogis(w0z0x0))
|
||||||
|
## w0z0x1.noisy.odds <- rlogis(nz0x1,qlogis(w0z0x1))
|
||||||
|
## w0z1x0.noisy.odds <- rlogis(nz1x0,qlogis(w0z1x0))
|
||||||
|
## w0z1x1.noisy.odds <- rlogis(nz1x1,qlogis(w0z1x1))
|
||||||
|
|
||||||
|
## df[(z==0)&(x==0),w:=plogis(w0z0x0.noisy.odds)]
|
||||||
|
## df[(z==0)&(x==1),w:=plogis(w0z0x1.noisy.odds)]
|
||||||
|
## df[(z==1)&(x==0),w:=plogis(w0z1x0.noisy.odds)]
|
||||||
|
## df[(z==1)&(x==1),w:=plogis(w0z1x1.noisy.odds)]
|
||||||
|
|
||||||
|
## df[,w_pred:=as.integer(w > 0.5)]
|
||||||
|
## print(mean(df[z==0]$x == df[z==0]$w_pred))
|
||||||
|
## print(mean(df[z==1]$x == df[z==1]$w_pred))
|
||||||
|
## print(mean(df$w_pred == df$x))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
resids <- resid(lm(y~x + z))
|
||||||
|
odds.x1 <- qlogis(prediction_accuracy) + y_bias*qlogis(pnorm(resids[x==1],log.p=T),log.p=T) + z_bias * qlogis(pnorm(z[x==1],sd(z),log.p=T),log.p=T)
|
||||||
|
odds.x0 <- qlogis(prediction_accuracy,lower.tail=F) + y_bias*qlogis(pnorm(resids[x==0],log.p=T),log.p=T) + z_bias * qlogis(pnorm(z[x==0],sd(z),log.p=T),log.p=T)
|
||||||
|
|
||||||
|
## acc.x0 <- p.correct[df[,x==0]]
|
||||||
|
## acc.x1 <- p.correct[df[,x==1]]
|
||||||
|
|
||||||
|
df[x==0,w:=plogis(rlogis(.N,odds.x0))]
|
||||||
|
df[x==1,w:=plogis(rlogis(.N,odds.x1))]
|
||||||
|
|
||||||
|
print(prediction_accuracy)
|
||||||
|
print(resids[is.na(df$w)])
|
||||||
|
print(odds.x0[is.na(df$w)])
|
||||||
|
print(odds.x1[is.na(df$w)])
|
||||||
|
|
||||||
|
df[,w_pred := as.integer(w > 0.5)]
|
||||||
|
|
||||||
|
|
||||||
|
print(mean(df$w_pred == df$x))
|
||||||
|
print(mean(df[y>=0]$w_pred == df[y>=0]$x))
|
||||||
|
print(mean(df[y<=0]$w_pred == df[y<=0]$x))
|
||||||
|
return(df)
|
||||||
|
}
|
||||||
|
|
||||||
|
parser <- arg_parser("Simulate data and fit corrected models")
|
||||||
|
parser <- add_argument(parser, "--N", default=5000, help="number of observations of w")
|
||||||
|
parser <- add_argument(parser, "--m", default=500, help="m the number of ground truth observations")
|
||||||
|
parser <- add_argument(parser, "--seed", default=51, help='seed for the rng')
|
||||||
|
parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather')
|
||||||
|
parser <- add_argument(parser, "--y_explained_variance", help='what proportion of the variance of y can be explained?', default=0.1)
|
||||||
|
parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.75)
|
||||||
|
parser <- add_argument(parser, "--accuracy_imbalance_difference", help='how much more accurate is the predictive model for one class than the other?', default=0.3)
|
||||||
|
parser <- add_argument(parser, "--Bzx", help='Effect of z on x', default=0.3)
|
||||||
|
parser <- add_argument(parser, "--Bzy", help='Effect of z on y', default=-0.3)
|
||||||
|
parser <- add_argument(parser, "--Bxy", help='Effect of z on y', default=0.3)
|
||||||
|
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
|
||||||
|
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y*z*x")
|
||||||
|
parser <- add_argument(parser, "--y_bias", help='coefficient of y on the probability a classification is correct', default=-0.5)
|
||||||
|
parser <- add_argument(parser, "--z_bias", help='coefficient of z on the probability a classification is correct', default=0)
|
||||||
|
parser <- add_argument(parser, "--truth_formula", help='formula for the true variable', default="x~z")
|
||||||
|
parser <- add_argument(parser, "--Px", help='base rate of x', default=0.5)
|
||||||
|
parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad')
|
||||||
|
parser <- add_argument(parser, "--sd_y_mixin", help='varience of the non-normal part of Y', default=10)
|
||||||
|
args <- parse_args(parser)
|
||||||
|
|
||||||
|
B0 <- 0
|
||||||
|
Px <- args$Px
|
||||||
|
Bxy <- args$Bxy
|
||||||
|
Bzy <- args$Bzy
|
||||||
|
Bzx <- args$Bzx
|
||||||
|
|
||||||
|
if(args$m < args$N){
|
||||||
|
|
||||||
|
df <- simulate_data(args$N, args$m, B0, Bxy, Bzx, Bzy, args$seed, args$y_explained_variance, args$prediction_accuracy, y_bias=args$y_bias, sd_y_mixin=args$sd_y_mixin)
|
||||||
|
|
||||||
|
## df.pc <- df[,.(x,y,z,w_pred,w)]
|
||||||
|
## # df.pc <- df.pc[,err:=x-w_pred]
|
||||||
|
## pc.df <- pc(suffStat=list(C=cor(df.pc),n=nrow(df.pc)),indepTest=gaussCItest,labels=names(df.pc),alpha=0.05)
|
||||||
|
## plot(pc.df)
|
||||||
|
|
||||||
|
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy, 'Bzx'=args$Bzx, 'Bzy'=Bzy, 'Px'=Px, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'accuracy_imbalance_difference'=args$accuracy_imbalance_difference, 'y_bias'=args$y_bias,'outcome_formula'=args$outcome_formula, 'proxy_formula'=args$proxy_formula,truth_formula=args$truth_formula, confint_method=args$confint_method, error='', 'sd_y_mixin'=args$sd_y_mixin)
|
||||||
|
|
||||||
|
outline <- run_simulation(df, result, outcome_formula=as.formula(args$outcome_formula), proxy_formula=as.formula(args$proxy_formula), truth_formula=as.formula(args$truth_formula),confint_method=args$confint_method)
|
||||||
|
|
||||||
|
|
||||||
|
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
||||||
|
if(file.exists(args$outfile)){
|
||||||
|
logdata <- read_feather(args$outfile)
|
||||||
|
logdata <- rbind(logdata,as.data.table(outline), fill=TRUE)
|
||||||
|
} else {
|
||||||
|
logdata <- as.data.table(outline)
|
||||||
|
}
|
||||||
|
|
||||||
|
print(outline)
|
||||||
|
write_feather(logdata, args$outfile)
|
||||||
|
unlock(outfile_lock)
|
||||||
|
}
|
||||||
@@ -31,12 +31,12 @@ source("simulation_base.R")
|
|||||||
|
|
||||||
## one way to do it is by adding correlation to x.obs and y that isn't in w.
|
## one way to do it is by adding correlation to x.obs and y that isn't in w.
|
||||||
## in other words, the model is missing an important feature of x.obs that's related to y.
|
## in other words, the model is missing an important feature of x.obs that's related to y.
|
||||||
simulate_data <- function(N, m, B0, Bxy, Bzy, seed, prediction_accuracy=0.73, z_bias=-0.75){
|
simulate_data <- function(N, m, B0, Bxy, Bzy, Bxz=0, seed=0, prediction_accuracy=0.73, z_bias=-0.75){
|
||||||
set.seed(seed)
|
set.seed(seed)
|
||||||
|
|
||||||
# make w and y dependent
|
# make w and y dependent
|
||||||
z <- rnorm(N,sd=0.5)
|
z <- rnorm(N,sd=0.5)
|
||||||
x <- rbinom(N,1,0.5)
|
x <- rbinom(N,1,plogis(Bxz*z))
|
||||||
|
|
||||||
ystar <- Bzy * z + Bxy * x + B0
|
ystar <- Bzy * z + Bxy * x + B0
|
||||||
y <- rbinom(N,1,plogis(ystar))
|
y <- rbinom(N,1,plogis(ystar))
|
||||||
@@ -70,29 +70,32 @@ parser <- add_argument(parser, "--N", default=1000, help="number of observations
|
|||||||
parser <- add_argument(parser, "--m", default=500, help="m the number of ground truth observations")
|
parser <- add_argument(parser, "--m", default=500, help="m the number of ground truth observations")
|
||||||
parser <- add_argument(parser, "--seed", default=17, help='seed for the rng')
|
parser <- add_argument(parser, "--seed", default=17, help='seed for the rng')
|
||||||
parser <- add_argument(parser, "--outfile", help='output file', default='example_4.feather')
|
parser <- add_argument(parser, "--outfile", help='output file', default='example_4.feather')
|
||||||
parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.79)
|
parser <- add_argument(parser, "--prediction_accuracy", help='how accurate is the predictive model?', default=0.75)
|
||||||
## parser <- add_argument(parser, "--z_bias_y1", help='how is the classifier biased when y = 1?', default=-0.75)
|
## parser <- add_argument(parser, "--z_bias_y1", help='how is the classifier biased when y = 1?', default=-0.75)
|
||||||
## parser <- add_argument(parser, "--z_bias_y0", help='how is the classifier biased when y = 0 ?', default=0.75)
|
## parser <- add_argument(parser, "--z_bias_y0", help='how is the classifier biased when y = 0 ?', default=0.75)
|
||||||
parser <- add_argument(parser, "--z_bias", help='how is the classifier biased?', default=1.5)
|
parser <- add_argument(parser, "--z_bias", help='how is the classifier biased?', default=-0.5)
|
||||||
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.1)
|
parser <- add_argument(parser, "--Bxy", help='coefficient of x on y', default=0.7)
|
||||||
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.1)
|
parser <- add_argument(parser, "--Bzy", help='coeffficient of z on y', default=-0.7)
|
||||||
|
parser <- add_argument(parser, "--Bzx", help='coeffficient of z on y', default=1)
|
||||||
|
parser <- add_argument(parser, "--B0", help='coeffficient of z on y', default=0)
|
||||||
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
|
parser <- add_argument(parser, "--outcome_formula", help='formula for the outcome variable', default="y~x+z")
|
||||||
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y+z")
|
parser <- add_argument(parser, "--proxy_formula", help='formula for the proxy variable', default="w_pred~y+z")
|
||||||
|
parser <- add_argument(parser, "--confint_method", help='method for approximating confidence intervals', default='quad')
|
||||||
args <- parse_args(parser)
|
args <- parse_args(parser)
|
||||||
|
|
||||||
B0 <- 0
|
B0 <- args$B0
|
||||||
Bxy <- args$Bxy
|
Bxy <- args$Bxy
|
||||||
Bzy <- args$Bzy
|
Bzy <- args$Bzy
|
||||||
|
Bzx <- args$Bzx
|
||||||
|
|
||||||
if(args$m < args$N){
|
if(args$m < args$N){
|
||||||
df <- simulate_data(args$N, args$m, B0, Bxy, Bzy, args$seed, args$prediction_accuracy, args$z_bias)
|
df <- simulate_data(args$N, args$m, B0, Bxy, Bzx, Bzy, args$seed, args$prediction_accuracy, args$z_bias)
|
||||||
|
|
||||||
# result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'z_bias_y0'=args$z_bias_y0,'z_bias_y1'=args$z_bias_y1,'outcome_formula' = args$outcome_formula, 'proxy_formula' = args$proxy_formula)
|
# result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bzy'=Bzy,'Bzx'=Bzx, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'z_bias_y0'=args$z_bias_y0,'z_bias_y1'=args$z_bias_y1,'outcome_formula' = args$outcome_formula, 'proxy_formula' = args$proxy_formula)
|
||||||
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bzy'=Bzy, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'z_bias'=args$z_bias,'outcome_formula' = args$outcome_formula, 'proxy_formula' = args$proxy_formula)
|
result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bzy'=Bzy,'Bzx'=Bzx, 'seed'=args$seed, 'y_explained_variance'=args$y_explained_variance, 'prediction_accuracy'=args$prediction_accuracy, 'z_bias'=args$z_bias,'outcome_formula' = args$outcome_formula, 'proxy_formula' = args$proxy_formula, confint_method=args$confint_method)
|
||||||
|
|
||||||
outline <- run_simulation_depvar(df, result, outcome_formula = as.formula(args$outcome_formula), proxy_formula = as.formula(args$proxy_formula))
|
outline <- run_simulation_depvar(df, result, outcome_formula = as.formula(args$outcome_formula), proxy_formula = as.formula(args$proxy_formula),confint_method=args$confint_method)
|
||||||
|
print(outline$error.cor.z)
|
||||||
|
|
||||||
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
|
.ONESHELL:
|
||||||
SHELL=bash
|
SHELL=bash
|
||||||
|
|
||||||
Ns=[1000, 5000, 10000]
|
Ns=[1000, 5000, 10000]
|
||||||
@@ -6,8 +6,9 @@ ms=[100, 200, 400]
|
|||||||
seeds=[$(shell seq -s, 1 500)]
|
seeds=[$(shell seq -s, 1 500)]
|
||||||
explained_variances=[0.1]
|
explained_variances=[0.1]
|
||||||
|
|
||||||
all:remembr.RDS remember_irr.RDS
|
all:main supplement
|
||||||
supplement: remember_robustness_misspec.RDS
|
main:remembr.RDS
|
||||||
|
supplement:robustness_1.RDS robustness_1_dv.RDS robustness_2.RDS robustness_2_dv.RDS robustness_3.RDS robustness_3_dv.RDS robustness_3_proflik.RDS robustness_3_dv_proflik.RDS robustness_4.RDS robustness_4_dv.RDS robustness_5.RDS robustness_5_dv.RDS robustness_6.feather
|
||||||
|
|
||||||
srun=sbatch --wait --verbose run_job.sbatch
|
srun=sbatch --wait --verbose run_job.sbatch
|
||||||
|
|
||||||
@@ -15,7 +16,7 @@ srun=sbatch --wait --verbose run_job.sbatch
|
|||||||
joblists:example_1_jobs example_2_jobs example_3_jobs
|
joblists:example_1_jobs example_2_jobs example_3_jobs
|
||||||
|
|
||||||
# test_true_z_jobs: test_true_z.R simulation_base.R
|
# test_true_z_jobs: test_true_z.R simulation_base.R
|
||||||
# sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript test_true_z.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["test_true_z.feather"], "y_explained_variancevari":${explained_variances}, "Bzx":${Bzx}}' --outfile test_true_z_jobsb
|
# sbatch --wait --verbose run_job.sbatch ./grid_sweep.py --command "Rscript test_true_z.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["test_true_z.feather"], "y_explained_variancevari":${explained_variances}, "Bzx":${Bzx}}' --outfile test_true_z_jobsb
|
||||||
|
|
||||||
# test_true_z.feather: test_true_z_jobs
|
# test_true_z.feather: test_true_z_jobs
|
||||||
# rm -f test_true_z.feather
|
# rm -f test_true_z.feather
|
||||||
@@ -24,57 +25,54 @@ joblists:example_1_jobs example_2_jobs example_3_jobs
|
|||||||
|
|
||||||
|
|
||||||
example_1_jobs: 01_two_covariates.R simulation_base.R grid_sweep.py pl_methods.R
|
example_1_jobs: 01_two_covariates.R simulation_base.R grid_sweep.py pl_methods.R
|
||||||
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 01_two_covariates.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"], "y_explained_variance":${explained_variances}, "Bzx":[1]}' --outfile example_1_jobs
|
${srun} ./grid_sweep.py --command "Rscript 01_two_covariates.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"], "y_explained_variance":${explained_variances}, "Bzx":[1]}' --outfile example_1_jobs
|
||||||
|
|
||||||
example_1.feather: example_1_jobs
|
example_1.feather: example_1_jobs
|
||||||
rm -f example_1.feather
|
rm -f example_1.feather
|
||||||
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_1_jobs
|
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_1_jobs
|
||||||
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_1_jobs
|
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_1_jobs
|
||||||
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_1_jobs
|
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_1_jobs
|
||||||
sbatch --wait --verbose --array=3001-4000 run_simulation.sbatch 0 example_1_jobs
|
sbatch --wait --verbose --array=3001-$(shell cat example_1_jobs | wc -l) run_simulation.sbatch 0 example_1_jobs
|
||||||
sbatch --wait --verbose --array=4001-$(shell cat example_1_jobs | wc -l) run_simulation.sbatch 0 example_1_jobs
|
|
||||||
|
|
||||||
example_2_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py pl_methods.R
|
example_2_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py pl_methods.R
|
||||||
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y*z*x"]}' --outfile example_2_jobs
|
sbatch --wait --verbose run_job.sbatch ./grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+z+x"]}' --outfile example_2_jobs
|
||||||
|
|
||||||
example_2.feather: example_2_jobs
|
example_2.feather: example_2_jobs
|
||||||
rm -f example_2.feather
|
rm -f example_2.feather
|
||||||
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_2_jobs
|
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_2_jobs
|
||||||
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_2_jobs
|
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_2_jobs
|
||||||
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_2_jobs
|
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_2_jobs
|
||||||
sbatch --wait --verbose --array=3001-4000 run_simulation.sbatch 0 example_2_jobs
|
sbatch --wait --verbose --array=3001-$(shell cat example_2_jobs | wc -l) run_simulation.sbatch 0 example_2_jobs
|
||||||
sbatch --wait --verbose --array=4001-$(shell cat example_2_jobs | wc -l) run_simulation.sbatch 0 example_2_jobs
|
|
||||||
|
|
||||||
|
|
||||||
# example_2_B_jobs: example_2_B.R
|
# example_2_B_jobs: example_2_B.R
|
||||||
# sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript example_2_B.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B.feather"]}' --outfile example_2_B_jobs
|
# sbatch --wait --verbose run_job.sbatch python3 ./grid_sweep.py --command "Rscript example_2_B.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B.feather"]}' --outfile example_2_B_jobs
|
||||||
|
|
||||||
# example_2_B.feather: example_2_B_jobs
|
# example_2_B.feather: example_2_B_jobs
|
||||||
# rm -f example_2_B.feather
|
# rm -f example_2_B.feather
|
||||||
# sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_jobs
|
# sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_jobs
|
||||||
|
|
||||||
example_3_jobs: 03_depvar.R simulation_base.R grid_sweep.py pl_methods.R
|
example_3_jobs: 03_depvar.R simulation_base.R grid_sweep.py pl_methods.R
|
||||||
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 03_depvar.R" --arg_dict '{"N":${Ns},"m":${ms}, "Bxy":[0.7],"Bzy":[-0.7],"seed":${seeds}, "outfile":["example_3.feather"], "y_explained_variance":${explained_variances}}' --outfile example_3_jobs
|
sbatch --wait --verbose run_job.sbatch ./grid_sweep.py --command "Rscript 03_depvar.R" --arg_dict '{"N":${Ns},"m":${ms}, "Bxy":[0.7],"Bzy":[-0.7],"Bzx":[1],"seed":${seeds}, "outfile":["example_3.feather"], "y_explained_variance":${explained_variances}}' --outfile example_3_jobs
|
||||||
|
|
||||||
example_3.feather: example_3_jobs
|
example_3.feather: example_3_jobs
|
||||||
rm -f example_3.feather
|
rm -f example_3.feather
|
||||||
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_3_jobs
|
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_3_jobs
|
||||||
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_3_jobs
|
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_3_jobs
|
||||||
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_3_jobs
|
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_3_jobs
|
||||||
sbatch --wait --verbose --array=3001-4000 run_simulation.sbatch 0 example_3_jobs
|
sbatch --wait --verbose --array=3001-$(shell cat example_3_jobs | wc -l) run_simulation.sbatch 0 example_3_jobs
|
||||||
sbatch --wait --verbose --array=4001-$(shell cat example_3_jobs | wc -l) run_simulation.sbatch 0 example_3_jobs
|
|
||||||
|
|
||||||
example_4_jobs: 04_depvar_differential.R simulation_base.R grid_sweep.py pl_methods.R
|
example_4_jobs: 04_depvar_differential.R simulation_base.R grid_sweep.py pl_methods.R
|
||||||
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.5]}' --outfile example_4_jobs
|
sbatch --wait --verbose run_job.sbatch ./grid_sweep.py --command "Rscript 04_depvar_differential.R" --arg_dict '{"N":${Ns},"Bxy":[0.7],"Bzy":[-0.7],"Bzx":[1], "m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[-0.5], "prediction_accuracy":[0.73]}' --outfile example_4_jobs
|
||||||
|
|
||||||
example_4.feather: example_4_jobs
|
example_4.feather: example_4_jobs
|
||||||
rm -f example_4.feather
|
rm -f example_4.feather
|
||||||
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_4_jobs
|
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_4_jobs
|
||||||
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_4_jobs
|
sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 0 example_4_jobs
|
||||||
|
sbatch --wait --verbose --array=2001-3001 run_simulation.sbatch 0 example_4_jobs
|
||||||
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_4_jobs
|
sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 0 example_4_jobs
|
||||||
sbatch --wait --verbose --array=3001-4000 run_simulation.sbatch 0 example_4_jobs
|
sbatch --wait --verbose --array=3001-$(shell cat example_4_jobs | wc -l) run_simulation.sbatch 0 example_4_jobs
|
||||||
sbatch --wait --verbose --array=4001-$(shell cat example_4_jobs | wc -l) run_simulation.sbatch 0 example_4_jobs
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
remembr.RDS:example_1.feather example_2.feather example_3.feather example_4.feather plot_example.R plot_dv_example.R summarize_estimator.R
|
remembr.RDS:example_1.feather example_2.feather example_3.feather example_4.feather plot_example.R plot_dv_example.R summarize_estimator.R
|
||||||
@@ -85,70 +83,397 @@ remembr.RDS:example_1.feather example_2.feather example_3.feather example_4.feat
|
|||||||
${srun} Rscript plot_dv_example.R --infile example_4.feather --name "plot.df.example.4"
|
${srun} Rscript plot_dv_example.R --infile example_4.feather --name "plot.df.example.4"
|
||||||
|
|
||||||
|
|
||||||
irr_Ns = [1000]
|
START=0
|
||||||
irr_ms = [150,300,600]
|
STEP=1000
|
||||||
irr_seeds=${seeds}
|
ONE=1
|
||||||
irr_explained_variances=${explained_variances}
|
|
||||||
irr_coder_accuracy=[0.80]
|
|
||||||
|
|
||||||
example_5_jobs: 05_irr_indep.R irr_simulation_base.R grid_sweep.py pl_methods.R measerr_methods.R
|
robustness_Ns=[1000,5000]
|
||||||
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 05_irr_indep.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_5.feather"], "y_explained_variance":${irr_explained_variances}, "coder_accuracy":${irr_coder_accuracy}}' --outfile example_5_jobs
|
robustness_ms=[100,200]
|
||||||
|
|
||||||
example_5.feather:example_5_jobs
|
#in robustness 1 / example 2 misclassification is correlated with Y.
|
||||||
rm -f example_5.feather
|
robustness_1_jobs_p1: 02_indep_differential.R simulation_base.R grid_sweep.py
|
||||||
sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_5_jobs
|
sbatch --wait --verbose run_job.sbatch ./grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":[1000],"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_1.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3,0],"Bxy":[0.3],"Bzx":[1,0], "outcome_formula":["y~x+z"], "z_bias":[0, 0.5], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~1"]}' --outfile robustness_1_jobs_p1
|
||||||
sbatch --wait --verbose --array=1001-$(shell cat example_5_jobs | wc -l) run_simulation.sbatch 1000 example_5_jobs
|
|
||||||
# sbatch --wait --verbose --array=2001-3000 run_simulation.sbatch 2000 example_5_jobs
|
robustness_1_jobs_p2: 02_indep_differential.R simulation_base.R grid_sweep.py
|
||||||
# sbatch --wait --verbose --array=3001-4000 run_simulation.sbatch 3000 example_5_jobs
|
sbatch --wait --verbose run_job.sbatch ./grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":[5000],"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_1.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3,0],"Bxy":[0.3],"Bzx":[1,0], "outcome_formula":["y~x+z"], "z_bias":[0, 0.5], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~1"]}' --outfile robustness_1_jobs_p2
|
||||||
# sbatch --wait --verbose --array=2001-$(shell cat example_5_jobs | wc -l) run_simulation.sbatch 4000 example_5_jobs
|
|
||||||
|
robustness_1.feather: robustness_1_jobs_p1 robustness_1_jobs_p2
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_1_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_1_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_1_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_1_jobs_p2;)
|
||||||
|
|
||||||
|
robustness_1.RDS: robustness_1.feather summarize_estimator.R
|
||||||
|
rm -f robustness_1.RDS
|
||||||
|
${srun} Rscript plot_example.R --infile $< --name "robustness_1" --remember-file $@
|
||||||
|
|
||||||
|
# when Bzy is 0 and zbias is not zero, we have the case where P(W|Y,X,Z) has an omitted variable that is conditionanlly independent from Y. Note that X and Z are independent in this scenario.
|
||||||
|
robustness_1_dv_jobs_p1: simulation_base.R 04_depvar_differential.R grid_sweep.py
|
||||||
|
${srun} ./grid_sweep.py --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":[1000],"Bzx":[1], "Bxy":[0.7,0],"Bzy":[-0.7,0],"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_1_dv.feather"], "proxy_formula":["w_pred~y"],"z_bias":[-0.5]}' --outfile robustness_1_dv_jobs_p1
|
||||||
|
|
||||||
|
robustness_1_dv_jobs_p2: simulation_base.R 04_depvar_differential.R grid_sweep.py
|
||||||
|
${srun} ./grid_sweep.py --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":[5000],"Bzx":[1], "Bxy":[0.7,0],"Bzy":[-0.7,0],"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_1_dv.feather"], "proxy_formula":["w_pred~y"],"z_bias":[-0.5]}' --outfile robustness_1_dv_jobs_p2
|
||||||
|
|
||||||
|
robustness_1_dv.feather: robustness_1_dv_jobs_p1 robustness_1_dv_jobs_p2
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_1_dv_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_1_dv_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_1_dv_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_1_dv_jobs_p2;)
|
||||||
|
|
||||||
|
robustness_1_dv.RDS: robustness_1_dv.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript plot_dv_example.R --infile $< --name "robustness_1_dv" --remember-file $@
|
||||||
|
|
||||||
|
|
||||||
|
robustness_2_jobs_p1: grid_sweep.py 01_two_covariates.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.60,0.65]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_2_jobs_p2: grid_sweep.py 01_two_covariates.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.70,0.75]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_2_jobs_p3: grid_sweep.py 01_two_covariates.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.80,0.85]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_2_jobs_p4: grid_sweep.py 01_two_covariates.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_2.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_2.feather: robustness_2_jobs_p1 robustness_2_jobs_p2 robustness_2_jobs_p3 robustness_2_jobs_p4
|
||||||
|
rm $@
|
||||||
|
$(eval END_1!=cat robustness_2_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_2_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_2_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
$(eval END_4!=cat robustness_2_jobs_p4 | wc -l)
|
||||||
|
$(eval ITEMS_4!=seq $(START) $(STEP) $(END_4))
|
||||||
|
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p2;)
|
||||||
|
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p3;)
|
||||||
|
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_jobs_p4;)
|
||||||
|
|
||||||
|
robustness_2.RDS: plot_example.R robustness_2.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_2" --remember-file $@
|
||||||
|
|
||||||
|
robustness_2_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.60,0.65]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_2_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.70,0.75]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_2_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.80,0.85]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_2_dv_jobs_p4: grid_sweep.py 03_depvar.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_2_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.90,0.95]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_2_dv.feather: robustness_2_dv_jobs_p1 robustness_2_dv_jobs_p2 robustness_2_dv_jobs_p3 robustness_2_dv_jobs_p4
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_2_dv_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_2_dv_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_2_dv_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
$(eval END_4!=cat robustness_2_dv_jobs_p4 | wc -l)
|
||||||
|
$(eval ITEMS_4!=seq $(START) $(STEP) $(END_4))
|
||||||
|
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p2;)
|
||||||
|
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p3;)
|
||||||
|
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_2_dv_jobs_p4;)
|
||||||
|
|
||||||
|
robustness_2_dv.RDS: plot_dv_example.R robustness_2_dv.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_2_dv" --remember-file $@
|
||||||
|
|
||||||
|
|
||||||
|
robustness_3_proflik_jobs: grid_sweep.py 01_two_covariates.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":[1000],"m":[100], "seed":${seeds}, "outfile":["robustness_3_proflik.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1],"Px":[0.5,0.6,0.7,0.8,0.9,0.95], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85], "confint_method":['spline']}' --outfile $@
|
||||||
|
|
||||||
|
robustness_3_proflik.feather: robustness_3_proflik_jobs
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_3_proflik_jobs | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_proflik_jobs;)
|
||||||
|
|
||||||
|
robustness_3_proflik.RDS: plot_example.R robustness_3_proflik.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_3_proflik" --remember-file $@
|
||||||
|
|
||||||
|
|
||||||
|
robustness_3_jobs_p1: grid_sweep.py 01_two_covariates.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1],"Px":[0.5,0.6], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_3_jobs_p2: grid_sweep.py 01_two_covariates.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1],"Px":[0.7,0.8], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_3_jobs_p3: grid_sweep.py 01_two_covariates.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 01_two_covariates.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_3.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1],"Px":[0.9,0.95], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~z"], "prediction_accuracy":[0.85]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_3.feather: robustness_3_jobs_p1 robustness_3_jobs_p2 robustness_3_jobs_p3
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_3_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_3_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_3_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p2;)
|
||||||
|
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_jobs_p3;)
|
||||||
|
|
||||||
|
robustness_3.RDS: plot_example.R robustness_3.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_3" --remember-file $@
|
||||||
|
|
||||||
|
robustness_3_dv_proflik_jobs: grid_sweep.py 03_depvar.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_depvar.R' --arg_dict '{"N":[1000],"m":[100], "seed":${seeds}, "outfile":["robustness_3_dv_proflik.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1],"B0":[0,0.405,0.846,1.386,2.197,2.944], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85],"confint_method":['spline']}' --outfile $@
|
||||||
|
|
||||||
|
robustness_3_dv_proflik.feather: robustness_3_dv_proflik_jobs
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_3_dv_proflik_jobs | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_proflik_jobs;)
|
||||||
|
|
||||||
|
robustness_3_dv_proflik.RDS: plot_dv_example.R robustness_3_dv_proflik.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_3_dv_proflik" --remember-file $@
|
||||||
|
|
||||||
|
|
||||||
|
robustness_3_dv_jobs_p1: grid_sweep.py 03_depvar.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_3_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1],"B0":[0,0.405], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_3_dv_jobs_p2: grid_sweep.py 03_depvar.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_3_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1],"B0":[0.847,1.386], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
|
||||||
|
|
||||||
|
|
||||||
|
robustness_3_dv_jobs_p3: grid_sweep.py 03_depvar.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_depvar.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_3_dv.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "B0":[2.197,2.944], "outcome_formula":["y~x+z"], "prediction_accuracy":[0.85]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_3_dv.feather: robustness_3_dv_jobs_p1 robustness_3_dv_jobs_p2 robustness_3_dv_jobs_p3
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_3_dv_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_3_dv_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_3_dv_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p2;)
|
||||||
|
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_3_dv_jobs_p3;)
|
||||||
|
|
||||||
|
|
||||||
|
robustness_3_dv.RDS: plot_dv_example.R robustness_3_dv.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_3_dv" --remember-file $@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# example_6_jobs: 06_irr_dv.R irr_dv_simulation_base.R grid_sweep.py pl_methods.R
|
robustness_4_jobs_p1: grid_sweep.py 02_indep_differential.R simulation_base.R
|
||||||
# sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 06_irr_dv.R" --arg_dict '{"N":${irr_Ns},"m":${irr_ms}, "seed":${irr_seeds}, "outfile":["example_6.feather"], "y_explained_variance":${irr_explained_variances},"coder_accuracy":${irr_coder_accuracy}}' --outfile example_6_jobs
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"], "y_bias":[-2.944,-2.197]}' --outfile $@
|
||||||
|
|
||||||
# example_6.feather:example_6_jobs
|
robustness_4_jobs_p2: grid_sweep.py 02_indep_differential.R simulation_base.R
|
||||||
# rm -f example_6.feather
|
rm -f $@
|
||||||
# sbatch --wait --verbose --array=1-1000 run_simulation.sbatch 0 example_6_jobs
|
${srun} ./$< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"], "y_bias":[-1.386,-0.846]}' --outfile $@
|
||||||
# sbatch --wait --verbose --array=1001-2000 run_simulation.sbatch 1000 example_6_jobs
|
|
||||||
# sbatch --wait --verbose --array=2001-$(shell cat example_6_jobs | wc -l) run_simulation.sbatch 2000 example_6_jobs
|
robustness_4_jobs_p3: grid_sweep.py 02_indep_differential.R simulation_base.R
|
||||||
|
rm -f ./$@
|
||||||
|
${srun} ./$< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"], "y_bias":[-0.405,-0.25]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_4_jobs_p4: grid_sweep.py 02_indep_differential.R simulation_base.R
|
||||||
|
rm -f ./$@
|
||||||
|
${srun} ./$< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_4.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"], "y_bias":[0,-0.1]}' --outfile $@
|
||||||
|
|
||||||
|
|
||||||
remember_irr.RDS: example_5.feather plot_irr_example.R plot_irr_dv_example.R summarize_estimator.R
|
robustness_4.feather: robustness_4_jobs_p1 robustness_4_jobs_p2 robustness_4_jobs_p3
|
||||||
rm -f remember_irr.RDS
|
rm -f $@
|
||||||
sbatch --wait --verbose run_job.sbatch Rscript plot_irr_example.R --infile example_5.feather --name "plot.df.example.5"
|
$(eval END_1!=cat robustness_4_jobs_p1 | wc -l)
|
||||||
# sbatch --wait --verbose run_job.sbatch Rscript plot_irr_dv_example.R --infile example_6.feather --name "plot.df.example.6"
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_4_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_4_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
$(eval END_4!=cat robustness_4_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_4))
|
||||||
|
|
||||||
|
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p1;)
|
||||||
robustness_1_jobs: 02_indep_differential.R simulation_base.R grid_sweep.py
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p2;)
|
||||||
sbatch --wait --verbose run_job.sbatch grid_sweep.py --command "Rscript 02_indep_differential.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["robustness_1.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x"], "truth_formula":["x~1"]}' --outfile robustness_1_jobs
|
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p3;)
|
||||||
|
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_jobs_p3;)
|
||||||
|
|
||||||
|
|
||||||
|
robustness_4.RDS: plot_example.R robustness_4.feather summarize_estimator.R
|
||||||
robustness_1.feather: robustness_1_jobs
|
rm -f $@
|
||||||
rm -f robustness_1.feather
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_4" --remember-file $@
|
||||||
sbatch --wait --verbose --array=1-$(shell cat robustness_1_jobs | wc -l) run_simulation.sbatch 0 robustness_1_jobs
|
|
||||||
|
|
||||||
robustness_1_dv_jobs: simulation_base.R 04_depvar_differential.R grid_sweep.py
|
|
||||||
${srun} bash -c "source ~/.bashrc && grid_sweep.py --command 'Rscript 04_depvar_differential.R' --arg_dict \"{'N':${Ns},'m':${ms}, 'seed':${seeds}, 'outfile':['robustness_1_dv.feather'], 'y_explained_variance':${explained_variances}, 'proxy_formula':['w_pred~y']}\" --outfile robustness_1_dv_jobs"
|
|
||||||
|
|
||||||
|
|
||||||
robustness_1_dv.feather: robustness_1_dv_jobs
|
# '{"N":${robustness_Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.5]}' --example_4_jobs
|
||||||
rm -f robustness_1_dv.feather
|
|
||||||
sbatch --wait --verbose --array=1-$(shell cat example_3_jobs | wc -l) run_simulation.sbatch 0 robustness_1_dv_jobs
|
robustness_4_dv_jobs_p1: grid_sweep.py 04_depvar_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_4_dv.feather"], "Bzy":[-0.7],"Bxy":[0.7],"Bzx":[1], "outcome_formula":["y~x+z"], "z_bias":[0,0.1]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_4_dv_jobs_p2: grid_sweep.py 04_depvar_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_4_dv.feather"], "Bzy":[-0.7],"Bxy":[0.7],"Bzx":[1], "outcome_formula":["y~x+z"], "z_bias":[0.25,0.405]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_4_dv_jobs_p3: grid_sweep.py 04_depvar_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_4_dv.feather"], "Bzy":[-0.7],"Bxy":[0.7],"Bzx":[1],"outcome_formula":["y~x+z"],"z_bias":[0.846,1.386]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_4_dv_jobs_p4: grid_sweep.py 04_depvar_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_4_dv.feather"],"Bzy":[-0.7],"Bxy":[0.7],"Bzx":[1], "outcome_formula":["y~x+z"], "z_bias":[2.197,2.944]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_4_dv.feather: robustness_4_dv_jobs_p1 robustness_4_dv_jobs_p2 robustness_4_dv_jobs_p3 robustness_4_dv_jobs_p4
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_4_dv_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_4_dv_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_4_dv_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
$(eval END_4!=cat robustness_4_dv_jobs_p4 | wc -l)
|
||||||
|
$(eval ITEMS_4!=seq $(START) $(STEP) $(END_4))
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p2;)
|
||||||
|
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p3;)
|
||||||
|
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_4_dv_jobs_p4;)
|
||||||
|
|
||||||
|
|
||||||
remember_robustness_misspec.RDS: robustness_1.feather robustness_1_dv.feather
|
robustness_4_dv.RDS: plot_dv_example.R robustness_4_dv.feather summarize_estimator.R
|
||||||
rm -f remember_robustness_misspec.RDS
|
rm -f $@
|
||||||
sbatch --wait --verbose run_job.sbatch Rscript plot_example.R --infile robustness_1.feather --name "plot.df.robustness.1" --remember-file "remember_robustness_misspec.RDS"
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_4_dv" --remember-file $@
|
||||||
sbatch --wait --verbose run_job.sbatch Rscript plot_dv_example.R --infile robustness_1_dv.feather --name "plot.df.robustness.1.dv" --remember-file "remember_robustness_mispec.RDS"
|
|
||||||
|
|
||||||
|
|
||||||
clean:
|
robustness_5_jobs_p1: grid_sweep.py 02_indep_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_5.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[1.386,2.197], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_5_jobs_p2: grid_sweep.py 02_indep_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_5.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0.405,0.846], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_5_jobs_p3: grid_sweep.py 02_indep_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_5.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[0,0.25], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_5_jobs_p4: grid_sweep.py 02_indep_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 02_indep_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_5.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3],"Bzx":[2.944], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"]}' --outfile $@
|
||||||
|
|
||||||
|
|
||||||
|
robustness_5.feather: robustness_5_jobs_p1 robustness_5_jobs_p2 robustness_5_jobs_p3
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_5_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_5_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_5_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
$(eval END_4!=cat robustness_5_jobs_p4 | wc -l)
|
||||||
|
$(eval ITEMS_4!=seq $(START) $(STEP) $(END_4))
|
||||||
|
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_5_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_5_jobs_p2;)
|
||||||
|
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_5_jobs_p3;)
|
||||||
|
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_5_jobs_p4;)
|
||||||
|
|
||||||
|
robustness_5.RDS: plot_example.R robustness_5.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_5" --remember-file $@
|
||||||
|
|
||||||
|
|
||||||
|
# '{"N":${robustness_Ns},"Bxy":[0.7],"Bzy":[-0.7],"m":${ms}, "seed":${seeds}, "outfile":["example_4.feather"], "z_bias":[0.5]}' --example_4_jobs
|
||||||
|
|
||||||
|
robustness_5_dv_jobs_p1: grid_sweep.py 04_depvar_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_5_dv.feather"], "Bzy":[-0.7],"Bxy":[0.7],"Bzx":[0,0.25], "outcome_formula":["y~x+z"], "z_bias":[-0.5]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_5_dv_jobs_p2: grid_sweep.py 04_depvar_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_5_dv.feather"], "Bzy":[-0.7],"Bxy":[0.7],"Bzx":[0.405,0.846], "outcome_formula":["y~x+z"], "z_bias":[-0.5]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_5_dv_jobs_p3: grid_sweep.py 04_depvar_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_5_dv.feather"], "Bzy":[-0.7],"Bxy":[0.7],"Bzx":[1.386,2.197],"outcome_formula":["y~x+z"], "z_bias":[-0.5]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_5_dv_jobs_p4: grid_sweep.py 04_depvar_differential.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 04_depvar_differential.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_5_dv.feather"],"Bzy":[-0.7],"Bxy":[0.7],"Bzx":[2.944], "outcome_formula":["y~x+z"], "z_bias":[-0.5]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_5_dv.feather: robustness_5_dv_jobs_p1 robustness_5_dv_jobs_p2 robustness_5_dv_jobs_p3 robustness_5_dv_jobs_p4
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_5_dv_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_5_dv_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_5_dv_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
$(eval END_4!=cat robustness_5_dv_jobs_p4 | wc -l)
|
||||||
|
$(eval ITEMS_4!=seq $(START) $(STEP) $(END_4))
|
||||||
|
|
||||||
|
|
||||||
|
$(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_5_dv_jobs_p1;)
|
||||||
|
$(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_5_dv_jobs_p2;)
|
||||||
|
$(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_5_dv_jobs_p3;)
|
||||||
|
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_5_dv_jobs_p4;)
|
||||||
|
|
||||||
|
|
||||||
|
robustness_5_dv.RDS: plot_dv_example.R robustness_5_dv.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_5_dv" --remember-file $@
|
||||||
|
|
||||||
|
|
||||||
|
clean_main:
|
||||||
|
rm -f remembr.RDS
|
||||||
|
rm -f example_1_jobs
|
||||||
|
rm -f example_2_jobs
|
||||||
|
rm -f example_3_jobs
|
||||||
|
rm -f example_4_jobs
|
||||||
|
rm -f example_1.feather
|
||||||
|
rm -f example_2.feather
|
||||||
|
rm -f example_3.feather
|
||||||
|
rm -f example_4.feather
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
clean_all:
|
||||||
rm *.feather
|
rm *.feather
|
||||||
rm -f remembr.RDS
|
rm -f remembr.RDS
|
||||||
|
rm -f remembr*.RDS
|
||||||
|
rm -f robustness*.RDS
|
||||||
rm -f example_*_jobs
|
rm -f example_*_jobs
|
||||||
|
rm -f robustness_*_jobs_*
|
||||||
# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_jobs
|
# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_jobs
|
||||||
|
|
||||||
# example_2_B_mecor_jobs:
|
# example_2_B_mecor_jobs:
|
||||||
@@ -159,6 +484,44 @@ clean:
|
|||||||
# sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_mecor_jobs
|
# sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_mecor_jobs
|
||||||
# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_mecor_jobs
|
# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_mecor_jobs
|
||||||
|
|
||||||
|
robustness_6_jobs_p1: grid_sweep.py 03_indep_differential_nonnorm.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_indep_differential_nonnorm.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_6.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"],"sd_y_mixin":[0,1,2.5]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_6_jobs_p2: grid_sweep.py 03_indep_differential_nonnorm.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_indep_differential_nonnorm.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_6.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"],"sd_y_mixin":[5,10]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_6_jobs_p3: grid_sweep.py 03_indep_differential_nonnorm.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_indep_differential_nonnorm.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_6.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"],"sd_y_mixin":[0,1,2.5],"y_bias":[0]}' --outfile $@
|
||||||
|
|
||||||
|
robustness_6_jobs_p4: grid_sweep.py 03_indep_differential_nonnorm.R simulation_base.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} ./$< --command 'Rscript 03_indep_differential_nonnorm.R' --arg_dict '{"N":${robustness_Ns},"m":${robustness_ms}, "seed":${seeds}, "outfile":["robustness_6.feather"],"y_explained_variance":${explained_variances}, "Bzy":[-0.3],"Bxy":[0.3], "outcome_formula":["y~x+z"], "proxy_formula":["w_pred~y+x+z"], "truth_formula":["x~z"],"sd_y_mixin":[5,10],"y_bias":[0]}' --outfile $@
|
||||||
|
|
||||||
|
|
||||||
|
robustness_6.feather: robustness_6_jobs_p1 robustness_6_jobs_p2 robustness_6_jobs_p3 robustness_6_jobs_p4
|
||||||
|
rm -f $@
|
||||||
|
$(eval END_1!=cat robustness_6_jobs_p1 | wc -l)
|
||||||
|
$(eval ITEMS_1!=seq $(START) $(STEP) $(END_1))
|
||||||
|
$(eval END_2!=cat robustness_6_jobs_p2 | wc -l)
|
||||||
|
$(eval ITEMS_2!=seq $(START) $(STEP) $(END_2))
|
||||||
|
$(eval END_3!=cat robustness_6_jobs_p3 | wc -l)
|
||||||
|
$(eval ITEMS_3!=seq $(START) $(STEP) $(END_3))
|
||||||
|
$(eval END_4!=cat robustness_6_jobs_p4 | wc -l)
|
||||||
|
$(eval ITEMS_4!=seq $(START) $(STEP) $(END_4))
|
||||||
|
|
||||||
|
|
||||||
|
# $(foreach item,$(ITEMS_1),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_6_jobs_p1;)
|
||||||
|
# $(foreach item,$(ITEMS_2),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_6_jobs_p2;)
|
||||||
|
# $(foreach item,$(ITEMS_3),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_6_jobs_p3;)
|
||||||
|
$(foreach item,$(ITEMS_4),sbatch --wait --verbose --array=$(shell expr $(item) + $(ONE))-$(shell expr $(item) + $(STEP)) run_simulation.sbatch 0 robustness_6_jobs_p4;)
|
||||||
|
|
||||||
|
|
||||||
|
robustness_6.RDS: plot_example.R robustness_6.feather summarize_estimator.R
|
||||||
|
rm -f $@
|
||||||
|
${srun} Rscript $< --infile $(word 2, $^) --name "robustness_6" --remember-file $@
|
||||||
|
|
||||||
|
|
||||||
.PHONY: supplement
|
.PHONY: supplement
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from itertools import product
|
|||||||
import pyRemembeR
|
import pyRemembeR
|
||||||
|
|
||||||
def main(command, arg_dict, outfile, remember_file='remember_grid_sweep.RDS'):
|
def main(command, arg_dict, outfile, remember_file='remember_grid_sweep.RDS'):
|
||||||
|
print(remember_file)
|
||||||
remember = pyRemembeR.remember.Remember()
|
remember = pyRemembeR.remember.Remember()
|
||||||
remember.set_file(remember_file)
|
remember.set_file(remember_file)
|
||||||
remember[outfile] = arg_dict
|
remember[outfile] = arg_dict
|
||||||
|
|||||||
@@ -15,18 +15,38 @@ library(bbmle)
|
|||||||
|
|
||||||
### ideal formulas for example 2
|
### ideal formulas for example 2
|
||||||
# test.fit.2 <- measerr_mle(df, y ~ x + z, gaussian(), w_pred ~ x + z + y + y:x, binomial(link='logit'), x ~ z)
|
# test.fit.2 <- measerr_mle(df, y ~ x + z, gaussian(), w_pred ~ x + z + y + y:x, binomial(link='logit'), x ~ z)
|
||||||
|
likelihood.logistic <- function(model.params, outcome, model.matrix){
|
||||||
|
ll <- vector(mode='numeric', length=length(outcome))
|
||||||
|
ll[outcome == 1] <- plogis(model.params %*% t(model.matrix[outcome==1,]), log=TRUE)
|
||||||
|
ll[outcome == 0] <- plogis(model.params %*% t(model.matrix[outcome==0,]), log=TRUE, lower.tail=FALSE)
|
||||||
|
return(ll)
|
||||||
|
}
|
||||||
|
|
||||||
## outcome_formula <- y ~ x + z; proxy_formula <- w_pred ~ y + x + z + x:z + x:y + z:y
|
## outcome_formula <- y ~ x + z; proxy_formula <- w_pred ~ y + x + z + x:z + x:y + z:y
|
||||||
measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='logit'), proxy_formula, proxy_family=binomial(link='logit'),method='optim'){
|
measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='logit'), proxy_formula, proxy_family=binomial(link='logit'),maxit=1e6, method='optim',optim_method='L-BFGS-B'){
|
||||||
|
df.obs <- model.frame(outcome_formula, df)
|
||||||
|
proxy.model.matrix <- model.matrix(proxy_formula, df)
|
||||||
|
proxy.variable <- all.vars(proxy_formula)[1]
|
||||||
|
|
||||||
|
df.proxy.obs <- model.frame(proxy_formula,df)
|
||||||
|
proxy.obs <- with(df.proxy.obs, eval(parse(text=proxy.variable)))
|
||||||
|
|
||||||
|
response.var <- all.vars(outcome_formula)[1]
|
||||||
|
y.obs <- with(df.obs,eval(parse(text=response.var)))
|
||||||
|
outcome.model.matrix <- model.matrix(outcome_formula, df.obs)
|
||||||
|
|
||||||
|
df.unobs <- df[is.na(df[[response.var]])]
|
||||||
|
df.unobs.y1 <- copy(df.unobs)
|
||||||
|
df.unobs.y1[[response.var]] <- 1
|
||||||
|
df.unobs.y0 <- copy(df.unobs)
|
||||||
|
df.unobs.y0[[response.var]] <- 0
|
||||||
|
|
||||||
|
outcome.model.matrix.y1 <- model.matrix(outcome_formula, df.unobs.y1)
|
||||||
|
proxy.model.matrix.y1 <- model.matrix(proxy_formula, df.unobs.y1)
|
||||||
|
proxy.model.matrix.y0 <- model.matrix(proxy_formula, df.unobs.y0)
|
||||||
|
proxy.unobs <- with(df.unobs, eval(parse(text=proxy.variable)))
|
||||||
|
|
||||||
nll <- function(params){
|
nll <- function(params){
|
||||||
df.obs <- model.frame(outcome_formula, df)
|
|
||||||
proxy.variable <- all.vars(proxy_formula)[1]
|
|
||||||
proxy.model.matrix <- model.matrix(proxy_formula, df)
|
|
||||||
response.var <- all.vars(outcome_formula)[1]
|
|
||||||
y.obs <- with(df.obs,eval(parse(text=response.var)))
|
|
||||||
outcome.model.matrix <- model.matrix(outcome_formula, df.obs)
|
|
||||||
|
|
||||||
param.idx <- 1
|
param.idx <- 1
|
||||||
n.outcome.model.covars <- dim(outcome.model.matrix)[2]
|
n.outcome.model.covars <- dim(outcome.model.matrix)[2]
|
||||||
@@ -39,12 +59,9 @@ measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='lo
|
|||||||
ll.y.obs[y.obs==0] <- plogis(outcome.params %*% t(outcome.model.matrix[y.obs==0,]),log=TRUE,lower.tail=FALSE)
|
ll.y.obs[y.obs==0] <- plogis(outcome.params %*% t(outcome.model.matrix[y.obs==0,]),log=TRUE,lower.tail=FALSE)
|
||||||
}
|
}
|
||||||
|
|
||||||
df.obs <- model.frame(proxy_formula,df)
|
|
||||||
n.proxy.model.covars <- dim(proxy.model.matrix)[2]
|
n.proxy.model.covars <- dim(proxy.model.matrix)[2]
|
||||||
proxy.params <- params[param.idx:(n.proxy.model.covars+param.idx-1)]
|
proxy.params <- params[param.idx:(n.proxy.model.covars+param.idx-1)]
|
||||||
|
|
||||||
param.idx <- param.idx + n.proxy.model.covars
|
param.idx <- param.idx + n.proxy.model.covars
|
||||||
proxy.obs <- with(df.obs, eval(parse(text=proxy.variable)))
|
|
||||||
|
|
||||||
if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){
|
if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){
|
||||||
ll.w.obs <- vector(mode='numeric',length=dim(proxy.model.matrix)[1])
|
ll.w.obs <- vector(mode='numeric',length=dim(proxy.model.matrix)[1])
|
||||||
@@ -54,14 +71,7 @@ measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='lo
|
|||||||
|
|
||||||
ll.obs <- sum(ll.y.obs + ll.w.obs)
|
ll.obs <- sum(ll.y.obs + ll.w.obs)
|
||||||
|
|
||||||
df.unobs <- df[is.na(df[[response.var]])]
|
|
||||||
df.unobs.y1 <- copy(df.unobs)
|
|
||||||
df.unobs.y1[[response.var]] <- 1
|
|
||||||
df.unobs.y0 <- copy(df.unobs)
|
|
||||||
df.unobs.y0[[response.var]] <- 0
|
|
||||||
|
|
||||||
## integrate out y
|
## integrate out y
|
||||||
outcome.model.matrix.y1 <- model.matrix(outcome_formula, df.unobs.y1)
|
|
||||||
|
|
||||||
if((outcome_family$family == "binomial") & (outcome_family$link == 'logit')){
|
if((outcome_family$family == "binomial") & (outcome_family$link == 'logit')){
|
||||||
ll.y.unobs.1 <- vector(mode='numeric', length=dim(outcome.model.matrix.y1)[1])
|
ll.y.unobs.1 <- vector(mode='numeric', length=dim(outcome.model.matrix.y1)[1])
|
||||||
@@ -70,10 +80,6 @@ measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='lo
|
|||||||
ll.y.unobs.0 <- plogis(outcome.params %*% t(outcome.model.matrix.y1),log=TRUE,lower.tail=FALSE)
|
ll.y.unobs.0 <- plogis(outcome.params %*% t(outcome.model.matrix.y1),log=TRUE,lower.tail=FALSE)
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy.model.matrix.y1 <- model.matrix(proxy_formula, df.unobs.y1)
|
|
||||||
proxy.model.matrix.y0 <- model.matrix(proxy_formula, df.unobs.y0)
|
|
||||||
proxy.unobs <- with(df.unobs, eval(parse(text=proxy.variable)))
|
|
||||||
|
|
||||||
if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){
|
if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){
|
||||||
ll.w.unobs.1 <- vector(mode='numeric',length=dim(proxy.model.matrix.y1)[1])
|
ll.w.unobs.1 <- vector(mode='numeric',length=dim(proxy.model.matrix.y1)[1])
|
||||||
ll.w.unobs.0 <- vector(mode='numeric',length=dim(proxy.model.matrix.y0)[1])
|
ll.w.unobs.0 <- vector(mode='numeric',length=dim(proxy.model.matrix.y0)[1])
|
||||||
@@ -100,7 +106,7 @@ measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='lo
|
|||||||
names(start) <- params
|
names(start) <- params
|
||||||
|
|
||||||
if(method=='optim'){
|
if(method=='optim'){
|
||||||
fit <- optim(start, fn = nll, lower=lower, method='L-BFGS-B', hessian=TRUE, control=list(maxit=1e6))
|
fit <- optim(start, fn = nll, lower=lower, method=optim_method, hessian=TRUE, control=list(maxit=maxit))
|
||||||
} else {
|
} else {
|
||||||
quoted.names <- gsub("[\\(\\)]",'',names(start))
|
quoted.names <- gsub("[\\(\\)]",'',names(start))
|
||||||
print(quoted.names)
|
print(quoted.names)
|
||||||
@@ -109,13 +115,13 @@ measerr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='lo
|
|||||||
measerr_mle_nll <- eval(parse(text=text))
|
measerr_mle_nll <- eval(parse(text=text))
|
||||||
names(start) <- quoted.names
|
names(start) <- quoted.names
|
||||||
names(lower) <- quoted.names
|
names(lower) <- quoted.names
|
||||||
fit <- mle2(minuslogl=measerr_mle_nll, start=start, lower=lower, parnames=params,control=list(maxit=1e6),method='L-BFGS-B')
|
fit <- mle2(minuslogl=measerr_mle_nll, start=start, lower=lower, parnames=params,control=list(maxit=maxit),method=optim_method)
|
||||||
}
|
}
|
||||||
return(fit)
|
return(fit)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_formula, proxy_family=binomial(link='logit'), truth_formula, truth_family=binomial(link='logit'),method='optim'){
|
measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_formula, proxy_family=binomial(link='logit'), truth_formula, truth_family=binomial(link='logit'),method='optim', maxit=1e6, optim_method='L-BFGS-B'){
|
||||||
|
|
||||||
df.obs <- model.frame(outcome_formula, df)
|
df.obs <- model.frame(outcome_formula, df)
|
||||||
response.var <- all.vars(outcome_formula)[1]
|
response.var <- all.vars(outcome_formula)[1]
|
||||||
@@ -125,6 +131,31 @@ measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_fo
|
|||||||
proxy.model.matrix <- model.matrix(proxy_formula, df)
|
proxy.model.matrix <- model.matrix(proxy_formula, df)
|
||||||
y.obs <- with(df.obs,eval(parse(text=response.var)))
|
y.obs <- with(df.obs,eval(parse(text=response.var)))
|
||||||
|
|
||||||
|
df.proxy.obs <- model.frame(proxy_formula,df)
|
||||||
|
proxy.obs <- with(df.proxy.obs, eval(parse(text=proxy.variable)))
|
||||||
|
n.proxy.model.covars <- dim(proxy.model.matrix)[2]
|
||||||
|
|
||||||
|
df.truth.obs <- model.frame(truth_formula, df)
|
||||||
|
truth.obs <- with(df.truth.obs, eval(parse(text=truth.variable)))
|
||||||
|
truth.model.matrix <- model.matrix(truth_formula,df.truth.obs)
|
||||||
|
n.truth.model.covars <- dim(truth.model.matrix)[2]
|
||||||
|
|
||||||
|
df.unobs <- df[is.na(eval(parse(text=truth.variable)))]
|
||||||
|
df.unobs.x1 <- copy(df.unobs)
|
||||||
|
df.unobs.x1[,truth.variable] <- 1
|
||||||
|
df.unobs.x0 <- copy(df.unobs)
|
||||||
|
df.unobs.x0[,truth.variable] <- 0
|
||||||
|
outcome.unobs <- with(df.unobs, eval(parse(text=response.var)))
|
||||||
|
|
||||||
|
outcome.model.matrix.x0 <- model.matrix(outcome_formula, df.unobs.x0)
|
||||||
|
outcome.model.matrix.x1 <- model.matrix(outcome_formula, df.unobs.x1)
|
||||||
|
|
||||||
|
proxy.model.matrix.x0 <- model.matrix(proxy_formula, df.unobs.x0)
|
||||||
|
proxy.model.matrix.x1 <- model.matrix(proxy_formula, df.unobs.x1)
|
||||||
|
proxy.unobs <- df.unobs[[proxy.variable]]
|
||||||
|
|
||||||
|
truth.model.matrix.unobs <- model.matrix(truth_formula, df.unobs.x0)
|
||||||
|
|
||||||
measerr_mle_nll <- function(params){
|
measerr_mle_nll <- function(params){
|
||||||
param.idx <- 1
|
param.idx <- 1
|
||||||
n.outcome.model.covars <- dim(outcome.model.matrix)[2]
|
n.outcome.model.covars <- dim(outcome.model.matrix)[2]
|
||||||
@@ -137,82 +168,48 @@ measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_fo
|
|||||||
param.idx <- param.idx + 1
|
param.idx <- param.idx + 1
|
||||||
# outcome_formula likelihood using linear regression
|
# outcome_formula likelihood using linear regression
|
||||||
ll.y.obs <- dnorm(y.obs, outcome.params %*% t(outcome.model.matrix),sd=sigma.y, log=TRUE)
|
ll.y.obs <- dnorm(y.obs, outcome.params %*% t(outcome.model.matrix),sd=sigma.y, log=TRUE)
|
||||||
}
|
} else if( (outcome_family$family == "binomial") & (outcome_family$link == "logit") )
|
||||||
|
ll.y.obs <- likelihood.logistic(outcome.params, y.obs, outcome.model.matrix)
|
||||||
|
|
||||||
|
|
||||||
df.obs <- model.frame(proxy_formula,df)
|
|
||||||
n.proxy.model.covars <- dim(proxy.model.matrix)[2]
|
|
||||||
proxy.params <- params[param.idx:(n.proxy.model.covars+param.idx-1)]
|
proxy.params <- params[param.idx:(n.proxy.model.covars+param.idx-1)]
|
||||||
param.idx <- param.idx + n.proxy.model.covars
|
param.idx <- param.idx + n.proxy.model.covars
|
||||||
proxy.obs <- with(df.obs, eval(parse(text=proxy.variable)))
|
|
||||||
|
|
||||||
if( (proxy_family$family=="binomial") & (proxy_family$link=='logit')){
|
if( (proxy_family$family=="binomial") & (proxy_family$link=='logit'))
|
||||||
ll.w.obs <- vector(mode='numeric',length=dim(proxy.model.matrix)[1])
|
ll.w.obs <- likelihood.logistic(proxy.params, proxy.obs, proxy.model.matrix)
|
||||||
|
|
||||||
# proxy_formula likelihood using logistic regression
|
|
||||||
ll.w.obs[proxy.obs==1] <- plogis(proxy.params %*% t(proxy.model.matrix[proxy.obs==1,]),log=TRUE)
|
|
||||||
ll.w.obs[proxy.obs==0] <- plogis(proxy.params %*% t(proxy.model.matrix[proxy.obs==0,]),log=TRUE, lower.tail=FALSE)
|
|
||||||
}
|
|
||||||
|
|
||||||
df.obs <- model.frame(truth_formula, df)
|
|
||||||
|
|
||||||
truth.obs <- with(df.obs, eval(parse(text=truth.variable)))
|
|
||||||
truth.model.matrix <- model.matrix(truth_formula,df)
|
|
||||||
n.truth.model.covars <- dim(truth.model.matrix)[2]
|
|
||||||
|
|
||||||
truth.params <- params[param.idx:(n.truth.model.covars + param.idx - 1)]
|
truth.params <- params[param.idx:(n.truth.model.covars + param.idx - 1)]
|
||||||
|
|
||||||
if( (truth_family$family=="binomial") & (truth_family$link=='logit')){
|
if( (truth_family$family=="binomial") & (truth_family$link=='logit'))
|
||||||
ll.x.obs <- vector(mode='numeric',length=dim(truth.model.matrix)[1])
|
ll.x.obs <- likelihood.logistic(truth.params, truth.obs, truth.model.matrix)
|
||||||
|
|
||||||
# truth_formula likelihood using logistic regression
|
# add the three likelihoods
|
||||||
ll.x.obs[truth.obs==1] <- plogis(truth.params %*% t(truth.model.matrix[truth.obs==1,]),log=TRUE)
|
|
||||||
ll.x.obs[truth.obs==0] <- plogis(truth.params %*% t(truth.model.matrix[truth.obs==0,]),log=TRUE, lower.tail=FALSE)
|
|
||||||
}
|
|
||||||
|
|
||||||
# add the three likelihoods
|
|
||||||
ll.obs <- sum(ll.y.obs + ll.w.obs + ll.x.obs)
|
ll.obs <- sum(ll.y.obs + ll.w.obs + ll.x.obs)
|
||||||
|
|
||||||
## likelihood for the predicted data
|
## likelihood for the predicted data
|
||||||
## integrate out the "truth" variable.
|
## integrate out the "truth" variable.
|
||||||
|
|
||||||
if(truth_family$family=='binomial'){
|
if(truth_family$family=='binomial'){
|
||||||
df.unobs <- df[is.na(eval(parse(text=truth.variable)))]
|
|
||||||
df.unobs.x1 <- copy(df.unobs)
|
|
||||||
df.unobs.x1[,'x'] <- 1
|
|
||||||
df.unobs.x0 <- copy(df.unobs)
|
|
||||||
df.unobs.x0[,'x'] <- 0
|
|
||||||
outcome.unobs <- with(df.unobs, eval(parse(text=response.var)))
|
|
||||||
|
|
||||||
outcome.model.matrix.x0 <- model.matrix(outcome_formula, df.unobs.x0)
|
|
||||||
outcome.model.matrix.x1 <- model.matrix(outcome_formula, df.unobs.x1)
|
|
||||||
if(outcome_family$family=="gaussian"){
|
if(outcome_family$family=="gaussian"){
|
||||||
|
|
||||||
# likelihood of outcome
|
# likelihood of outcome
|
||||||
ll.y.x0 <- dnorm(outcome.unobs, outcome.params %*% t(outcome.model.matrix.x0), sd=sigma.y, log=TRUE)
|
ll.y.x0 <- dnorm(outcome.unobs, outcome.params %*% t(outcome.model.matrix.x0), sd=sigma.y, log=TRUE)
|
||||||
ll.y.x1 <- dnorm(outcome.unobs, outcome.params %*% t(outcome.model.matrix.x1), sd=sigma.y, log=TRUE)
|
ll.y.x1 <- dnorm(outcome.unobs, outcome.params %*% t(outcome.model.matrix.x1), sd=sigma.y, log=TRUE)
|
||||||
|
} else if( (outcome_family$family == "binomial") & (outcome_family$link == "logit") ){
|
||||||
|
ll.y.x1 <- likelihood.logistic(outcome.params, outcome.unobs, outcome.model.matrix.x1)
|
||||||
|
ll.y.x0 <- likelihood.logistic(outcome.params, outcome.unobs, outcome.model.matrix.x0)
|
||||||
}
|
}
|
||||||
|
|
||||||
if( (proxy_family$family=='binomial') & (proxy_family$link=='logit')){
|
if( (proxy_family$family=='binomial') & (proxy_family$link=='logit')){
|
||||||
|
|
||||||
proxy.model.matrix.x0 <- model.matrix(proxy_formula, df.unobs.x0)
|
ll.w.x0 <- likelihood.logistic(proxy.params, proxy.unobs, proxy.model.matrix.x0)
|
||||||
proxy.model.matrix.x1 <- model.matrix(proxy_formula, df.unobs.x1)
|
ll.w.x1 <- likelihood.logistic(proxy.params, proxy.unobs, proxy.model.matrix.x1)
|
||||||
proxy.unobs <- df.unobs[[proxy.variable]]
|
|
||||||
ll.w.x0 <- vector(mode='numeric', length=dim(df.unobs)[1])
|
|
||||||
ll.w.x1 <- vector(mode='numeric', length=dim(df.unobs)[1])
|
|
||||||
|
|
||||||
# likelihood of proxy
|
|
||||||
ll.w.x0[proxy.unobs==1] <- plogis(proxy.params %*% t(proxy.model.matrix.x0[proxy.unobs==1,]), log=TRUE)
|
|
||||||
ll.w.x1[proxy.unobs==1] <- plogis(proxy.params %*% t(proxy.model.matrix.x1[proxy.unobs==1,]), log=TRUE)
|
|
||||||
|
|
||||||
ll.w.x0[proxy.unobs==0] <- plogis(proxy.params %*% t(proxy.model.matrix.x0[proxy.unobs==0,]), log=TRUE,lower.tail=FALSE)
|
|
||||||
ll.w.x1[proxy.unobs==0] <- plogis(proxy.params %*% t(proxy.model.matrix.x1[proxy.unobs==0,]), log=TRUE,lower.tail=FALSE)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(truth_family$link=='logit'){
|
if(truth_family$link=='logit'){
|
||||||
truth.model.matrix <- model.matrix(truth_formula, df.unobs.x0)
|
|
||||||
# likelihood of truth
|
# likelihood of truth
|
||||||
ll.x.x1 <- plogis(truth.params %*% t(truth.model.matrix), log=TRUE)
|
ll.x.x1 <- plogis(truth.params %*% t(truth.model.matrix.unobs), log=TRUE)
|
||||||
ll.x.x0 <- plogis(truth.params %*% t(truth.model.matrix), log=TRUE, lower.tail=FALSE)
|
ll.x.x0 <- plogis(truth.params %*% t(truth.model.matrix.unobs), log=TRUE, lower.tail=FALSE)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -243,7 +240,7 @@ measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_fo
|
|||||||
names(start) <- params
|
names(start) <- params
|
||||||
|
|
||||||
if(method=='optim'){
|
if(method=='optim'){
|
||||||
fit <- optim(start, fn = measerr_mle_nll, lower=lower, method='L-BFGS-B', hessian=TRUE, control=list(maxit=1e6))
|
fit <- optim(start, fn = measerr_mle_nll, lower=lower, method=optim_method, hessian=TRUE, control=list(maxit=maxit))
|
||||||
} else { # method='mle2'
|
} else { # method='mle2'
|
||||||
|
|
||||||
quoted.names <- gsub("[\\(\\)]",'',names(start))
|
quoted.names <- gsub("[\\(\\)]",'',names(start))
|
||||||
@@ -253,7 +250,7 @@ measerr_mle <- function(df, outcome_formula, outcome_family=gaussian(), proxy_fo
|
|||||||
measerr_mle_nll_mle <- eval(parse(text=text))
|
measerr_mle_nll_mle <- eval(parse(text=text))
|
||||||
names(start) <- quoted.names
|
names(start) <- quoted.names
|
||||||
names(lower) <- quoted.names
|
names(lower) <- quoted.names
|
||||||
fit <- mle2(minuslogl=measerr_mle_nll_mle, start=start, lower=lower, parnames=params,control=list(maxit=1e6),method='L-BFGS-B')
|
fit <- mle2(minuslogl=measerr_mle_nll_mle, start=start, lower=lower, parnames=params,control=list(maxit=maxit),method=optim_method)
|
||||||
}
|
}
|
||||||
|
|
||||||
return(fit)
|
return(fit)
|
||||||
@@ -431,7 +428,7 @@ measerr_irr_mle <- function(df, outcome_formula, outcome_family=gaussian(), code
|
|||||||
## Experimental, and does not work.
|
## Experimental, and does not work.
|
||||||
measerr_irr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='logit'), coder_formulas=list(y.obs.0~y+w_pred+y.obs.1,y.obs.1~y+w_pred+y.obs.0), proxy_formula=w_pred~y, proxy_family=binomial(link='logit'),method='optim'){
|
measerr_irr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link='logit'), coder_formulas=list(y.obs.0~y+w_pred+y.obs.1,y.obs.1~y+w_pred+y.obs.0), proxy_formula=w_pred~y, proxy_family=binomial(link='logit'),method='optim'){
|
||||||
integrate.grid <- expand.grid(replicate(1 + length(coder_formulas), c(0,1), simplify=FALSE))
|
integrate.grid <- expand.grid(replicate(1 + length(coder_formulas), c(0,1), simplify=FALSE))
|
||||||
print(integrate.grid)
|
# print(integrate.grid)
|
||||||
|
|
||||||
|
|
||||||
outcome.model.matrix <- model.matrix(outcome_formula, df)
|
outcome.model.matrix <- model.matrix(outcome_formula, df)
|
||||||
@@ -527,8 +524,8 @@ measerr_irr_mle_dv <- function(df, outcome_formula, outcome_family=binomial(link
|
|||||||
|
|
||||||
## likelihood of observed data
|
## likelihood of observed data
|
||||||
target <- -1 * sum(lls)
|
target <- -1 * sum(lls)
|
||||||
print(target)
|
# print(target)
|
||||||
print(params)
|
# print(params)
|
||||||
return(target)
|
return(target)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
86
simulations/pl_methods.R
Normal file
86
simulations/pl_methods.R
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
library(stats4)
|
||||||
|
library(bbmle)
|
||||||
|
library(matrixStats)
|
||||||
|
|
||||||
|
zhang.mle.dv <- function(df){
|
||||||
|
df.obs <- df[!is.na(y.obs)]
|
||||||
|
df.unobs <- df[is.na(y.obs)]
|
||||||
|
|
||||||
|
fp <- df.obs[(w_pred==1) & (y.obs != w_pred),.N]
|
||||||
|
tn <- df.obs[(w_pred == 0) & (y.obs == w_pred),.N]
|
||||||
|
fpr <- fp / (fp+tn)
|
||||||
|
|
||||||
|
fn <- df.obs[(w_pred==0) & (y.obs != w_pred), .N]
|
||||||
|
tp <- df.obs[(w_pred==1) & (y.obs == w_pred),.N]
|
||||||
|
fnr <- fn / (fn+tp)
|
||||||
|
|
||||||
|
nll <- function(B0=0, Bxy=0, Bzy=0){
|
||||||
|
|
||||||
|
|
||||||
|
## observed case
|
||||||
|
ll.y.obs <- vector(mode='numeric', length=nrow(df.obs))
|
||||||
|
ll.y.obs[df.obs$y.obs==1] <- with(df.obs[y.obs==1], plogis(B0 + Bxy * x + Bzy * z,log=T))
|
||||||
|
ll.y.obs[df.obs$y.obs==0] <- with(df.obs[y.obs==0], plogis(B0 + Bxy * x + Bzy * z,log=T,lower.tail=FALSE))
|
||||||
|
|
||||||
|
ll <- sum(ll.y.obs)
|
||||||
|
|
||||||
|
pi.y.1 <- with(df.unobs,plogis(B0 + Bxy * x + Bzy*z, log=T))
|
||||||
|
#pi.y.0 <- with(df.unobs,plogis(B0 + Bxy * x + Bzy*z, log=T,lower.tail=FALSE))
|
||||||
|
|
||||||
|
lls <- with(df.unobs, colLogSumExps(rbind(w_pred * colLogSumExps(rbind(log(fpr), log(1 - fnr - fpr)+pi.y.1)),
|
||||||
|
(1-w_pred) * (log(1-fpr) - exp(log(1-fnr-fpr)+pi.y.1)))))
|
||||||
|
|
||||||
|
ll <- ll + sum(lls)
|
||||||
|
# print(paste0(B0,Bxy,Bzy))
|
||||||
|
# print(ll)
|
||||||
|
return(-ll)
|
||||||
|
}
|
||||||
|
mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6),method='L-BFGS-B',lower=c(B0=-Inf, Bxy=-Inf, Bzy=-Inf),
|
||||||
|
upper=c(B0=Inf, Bxy=Inf, Bzy=Inf))
|
||||||
|
return(mlefit)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
## model from Zhang's arxiv paper, with predictions for y
|
||||||
|
## Zhang got this model from Hausman 1998
|
||||||
|
zhang.mle.iv <- function(df){
|
||||||
|
df.obs <- df[!is.na(x.obs)]
|
||||||
|
df.unobs <- df[is.na(x.obs)]
|
||||||
|
|
||||||
|
tn <- df.obs[(w_pred == 0) & (x.obs == w_pred),.N]
|
||||||
|
fn <- df.obs[(w_pred==0) & (x.obs==1), .N]
|
||||||
|
npv <- tn / (tn + fn)
|
||||||
|
|
||||||
|
|
||||||
|
tp <- df.obs[(w_pred==1) & (x.obs == w_pred),.N]
|
||||||
|
fp <- df.obs[(w_pred==1) & (x.obs == 0),.N]
|
||||||
|
ppv <- tp / (tp + fp)
|
||||||
|
|
||||||
|
nll <- function(B0=0, Bxy=0, Bzy=0, sigma_y=9){
|
||||||
|
|
||||||
|
## fpr = 1 - TNR
|
||||||
|
### Problem: accounting for uncertainty in ppv / npv
|
||||||
|
|
||||||
|
## fnr = 1 - TPR
|
||||||
|
ll.y.obs <- with(df.obs, dnorm(y, B0 + Bxy * x + Bzy * z, sd=sigma_y,log=T))
|
||||||
|
|
||||||
|
ll <- sum(ll.y.obs)
|
||||||
|
# unobserved case; integrate out x
|
||||||
|
ll.x.1 <- with(df.unobs, dnorm(y, B0 + Bxy + Bzy * z, sd = sigma_y, log=T))
|
||||||
|
ll.x.0 <- with(df.unobs, dnorm(y, B0 + Bzy * z, sd = sigma_y,log=T))
|
||||||
|
|
||||||
|
## case x == 1
|
||||||
|
lls.x.1 <- colLogSumExps(rbind(log(ppv) + ll.x.1, log(1-ppv) + ll.x.0))
|
||||||
|
|
||||||
|
## case x == 0
|
||||||
|
lls.x.0 <- colLogSumExps(rbind(log(1-npv) + ll.x.1, log(npv) + ll.x.0))
|
||||||
|
|
||||||
|
lls <- colLogSumExps(rbind(df.unobs$w_pred * lls.x.1, (1-df.unobs$w_pred) * lls.x.0))
|
||||||
|
|
||||||
|
ll <- ll + sum(lls)
|
||||||
|
|
||||||
|
}
|
||||||
|
mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6), lower=list(sigma_y=0.00001, B0=-Inf, Bxy=-Inf, Bzy=-Inf),
|
||||||
|
upper=list(sigma_y=Inf, B0=Inf, Bxy=Inf, Bzy=Inf),method='L-BFGS-B')
|
||||||
|
return(mlefit)
|
||||||
|
}
|
||||||
@@ -6,7 +6,7 @@ library(filelock)
|
|||||||
library(argparser)
|
library(argparser)
|
||||||
|
|
||||||
parser <- arg_parser("Simulate data and fit corrected models.")
|
parser <- arg_parser("Simulate data and fit corrected models.")
|
||||||
parser <- add_argument(parser, "--infile", default="example_4.feather", help="name of the file to read.")
|
parser <- add_argument(parser, "--infile", default="robustness_3_dv.feather", help="name of the file to read.")
|
||||||
parser <- add_argument(parser, "--remember-file", default="remembr.RDS", help="name of the remember file.")
|
parser <- add_argument(parser, "--remember-file", default="remembr.RDS", help="name of the remember file.")
|
||||||
parser <- add_argument(parser, "--name", default="", help="The name to safe the data to in the remember file.")
|
parser <- add_argument(parser, "--name", default="", help="The name to safe the data to in the remember file.")
|
||||||
args <- parse_args(parser)
|
args <- parse_args(parser)
|
||||||
@@ -86,9 +86,6 @@ build_plot_dataset <- function(df){
|
|||||||
|
|
||||||
change.remember.file(args$remember_file, clear=TRUE)
|
change.remember.file(args$remember_file, clear=TRUE)
|
||||||
sims.df <- read_feather(args$infile)
|
sims.df <- read_feather(args$infile)
|
||||||
sims.df[,Bzx:=NA]
|
|
||||||
sims.df[,y_explained_variance:=NA]
|
|
||||||
sims.df[,accuracy_imbalance_difference:=NA]
|
|
||||||
plot.df <- build_plot_dataset(sims.df)
|
plot.df <- build_plot_dataset(sims.df)
|
||||||
|
|
||||||
remember(plot.df,args$name)
|
remember(plot.df,args$name)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ source("summarize_estimator.R")
|
|||||||
|
|
||||||
|
|
||||||
parser <- arg_parser("Simulate data and fit corrected models.")
|
parser <- arg_parser("Simulate data and fit corrected models.")
|
||||||
parser <- add_argument(parser, "--infile", default="example_2.feather", help="name of the file to read.")
|
parser <- add_argument(parser, "--infile", default="robustness_2.feather", help="name of the file to read.")
|
||||||
parser <- add_argument(parser, "--remember-file", default="remembr.RDS", help="name of the remember file.")
|
parser <- add_argument(parser, "--remember-file", default="remembr.RDS", help="name of the remember file.")
|
||||||
parser <- add_argument(parser, "--name", default="", help="The name to safe the data to in the remember file.")
|
parser <- add_argument(parser, "--name", default="", help="The name to safe the data to in the remember file.")
|
||||||
args <- parse_args(parser)
|
args <- parse_args(parser)
|
||||||
@@ -47,7 +47,7 @@ args <- parse_args(parser)
|
|||||||
## var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
|
## var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
|
||||||
## est.upper.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.95,na.rm=T),
|
## est.upper.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.95,na.rm=T),
|
||||||
## est.lower.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.05,na.rm=T),
|
## est.lower.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.05,na.rm=T),
|
||||||
## N.sims = .N,
|
## N.sims = .
|
||||||
## p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
|
## p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
|
||||||
## variable=coefname,
|
## variable=coefname,
|
||||||
## method=suffix
|
## method=suffix
|
||||||
|
|||||||
39
simulations/robustness_check_notes.md
Normal file
39
simulations/robustness_check_notes.md
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# robustness\_1.RDS
|
||||||
|
|
||||||
|
Tests how robust the MLE method for independent variables with differential error is when the model for $X$ is less precise. In the main paper, we include $Z$ on the right-hand-side of the `truth_formula`.
|
||||||
|
In this robustness check, the `truth_formula` is an intercept-only model.
|
||||||
|
The stats are in the list named `robustness_1` in the `.RDS`
|
||||||
|
# robustness\_1\_dv.RDS
|
||||||
|
|
||||||
|
Like `robustness\_1.RDS` but with a less precise model for $w_pred$. In the main paper, we included $Z$ in the `proxy_formula`. In this robustness check, we do not.
|
||||||
|
|
||||||
|
# robustness_2.RDS
|
||||||
|
|
||||||
|
This is just example 1 with varying levels of classifier accuracy indicated by the `prediction_accuracy` variable..
|
||||||
|
|
||||||
|
# robustness_2_dv.RDS
|
||||||
|
|
||||||
|
Example 3 with varying levels of classifier accuracy indicated by the `prediction_accuracy` variable.
|
||||||
|
|
||||||
|
# robustness_3.RDS
|
||||||
|
|
||||||
|
Example 1 with varying levels of skewness in the classified variable. The variable `Px` is the baserate of $X$ and controls the skewness of $X$.
|
||||||
|
It probably makes more sense to report the mean of $X$ instead of `Px` in the supplement.
|
||||||
|
|
||||||
|
# robustness_3_dv.RDS
|
||||||
|
|
||||||
|
Example 3 with varying levels of skewness in the classified variable. The variable `B0` is the intercept of the main model and controls the skewness of $Y$.
|
||||||
|
It probably makes more sense to report the mean of $Y$ instead of B0 in the supplement.
|
||||||
|
|
||||||
|
# robustness_4.RDS
|
||||||
|
|
||||||
|
Example 2 with varying amounts of differential error. The variable `y_bias` controls the amount of differential error.
|
||||||
|
It probably makes more sense to report the corrleation between $Y$ and $X-~$, or the difference in accuracy from when when $Y=1$ to $Y=0$ in the supplement instead of `y_bias`.
|
||||||
|
|
||||||
|
# robustness_4_dv.RDS
|
||||||
|
|
||||||
|
Example 4 with varying amounts of bias. The variable `z_bias` controls the amount of differential error.
|
||||||
|
It probably makes more sense to report the corrleation between $Z$ and $Y-W$, or the difference in accuracy from when when $Z=1$ to $Z=0$ in the supplement instead of `z_bias`.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
18
simulations/run_job.sbatch
Normal file
18
simulations/run_job.sbatch
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#SBATCH --job-name="simulate measurement error models"
|
||||||
|
## Allocation Definition
|
||||||
|
#SBATCH --account=comdata-ckpt
|
||||||
|
#SBATCH --partition=ckpt
|
||||||
|
## Resources
|
||||||
|
#SBATCH --nodes=1
|
||||||
|
## Walltime (4 hours)
|
||||||
|
#SBATCH --time=4:00:00
|
||||||
|
## Memory per node
|
||||||
|
#SBATCH --mem=4G
|
||||||
|
#SBATCH --cpus-per-task=1
|
||||||
|
#SBATCH --ntasks-per-node=1
|
||||||
|
#SBATCH --chdir /gscratch/comdata/users/nathante/ml_measurement_error_public/simulations
|
||||||
|
#SBATCH --output=simulation_jobs/%A_%a.out
|
||||||
|
#SBATCH --error=simulation_jobs/%A_%a.err
|
||||||
|
echo "$@"
|
||||||
|
"$@"
|
||||||
@@ -89,7 +89,7 @@ my.mle <- function(df){
|
|||||||
return(mlefit)
|
return(mlefit)
|
||||||
}
|
}
|
||||||
|
|
||||||
run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_pred~y){
|
run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_pred~y, confint_method='quad'){
|
||||||
|
|
||||||
(accuracy <- df[,mean(w_pred==y)])
|
(accuracy <- df[,mean(w_pred==y)])
|
||||||
result <- append(result, list(accuracy=accuracy))
|
result <- append(result, list(accuracy=accuracy))
|
||||||
@@ -150,11 +150,13 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu
|
|||||||
|
|
||||||
temp.df <- copy(df)
|
temp.df <- copy(df)
|
||||||
temp.df[,y:=y.obs]
|
temp.df[,y:=y.obs]
|
||||||
|
|
||||||
mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula)
|
mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula)
|
||||||
fisher.info <- solve(mod.caroll.lik$hessian)
|
fischer.info <- solve(mod.caroll.lik$hessian)
|
||||||
coef <- mod.caroll.lik$par
|
coef <- mod.caroll.lik$par
|
||||||
ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96
|
ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
|
||||||
ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96
|
ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
|
||||||
|
|
||||||
result <- append(result,
|
result <- append(result,
|
||||||
list(Bxy.est.mle = coef['x'],
|
list(Bxy.est.mle = coef['x'],
|
||||||
Bxy.ci.upper.mle = ci.upper['x'],
|
Bxy.ci.upper.mle = ci.upper['x'],
|
||||||
@@ -163,6 +165,19 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu
|
|||||||
Bzy.ci.upper.mle = ci.upper['z'],
|
Bzy.ci.upper.mle = ci.upper['z'],
|
||||||
Bzy.ci.lower.mle = ci.lower['z']))
|
Bzy.ci.lower.mle = ci.lower['z']))
|
||||||
|
|
||||||
|
mod.caroll.profile.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, method='bbmle')
|
||||||
|
coef <- coef(mod.caroll.profile.lik)
|
||||||
|
ci <- confint(mod.caroll.profile.lik, method='spline')
|
||||||
|
ci.lower <- ci[,'2.5 %']
|
||||||
|
ci.upper <- ci[,'97.5 %']
|
||||||
|
|
||||||
|
result <- append(result,
|
||||||
|
list(Bxy.est.mle.profile = coef['x'],
|
||||||
|
Bxy.ci.upper.mle.profile = ci.upper['x'],
|
||||||
|
Bxy.ci.lower.mle.profile = ci.lower['x'],
|
||||||
|
Bzy.est.mle.profile = coef['z'],
|
||||||
|
Bzy.ci.upper.mle.profile = ci.upper['z'],
|
||||||
|
Bzy.ci.lower.mle.profile = ci.lower['z']))
|
||||||
|
|
||||||
## my implementatoin of liklihood based correction
|
## my implementatoin of liklihood based correction
|
||||||
mod.zhang <- zhang.mle.dv(df)
|
mod.zhang <- zhang.mle.dv(df)
|
||||||
@@ -180,26 +195,35 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu
|
|||||||
|
|
||||||
|
|
||||||
# amelia says use normal distribution for binary variables.
|
# amelia says use normal distribution for binary variables.
|
||||||
|
amelia_result <- list(Bxy.est.amelia.full = NA,
|
||||||
|
Bxy.ci.upper.amelia.full = NA,
|
||||||
|
Bxy.ci.lower.amelia.full = NA,
|
||||||
|
Bzy.est.amelia.full = NA,
|
||||||
|
Bzy.ci.upper.amelia.full = NA,
|
||||||
|
Bzy.ci.lower.amelia.full = NA
|
||||||
|
)
|
||||||
|
|
||||||
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w'))
|
tryCatch({
|
||||||
mod.amelia.k <- zelig(y.obs~x+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
|
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w'),ords="y.obs")
|
||||||
(coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
|
mod.amelia.k <- zelig(y.obs~x+z, model='logit', data=amelia.out.k$imputations, cite=FALSE)
|
||||||
est.x.mi <- coefse['x','Estimate']
|
(coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
|
||||||
est.x.se <- coefse['x','Std.Error']
|
est.x.mi <- coefse['x','Estimate']
|
||||||
result <- append(result,
|
est.x.se <- coefse['x','Std.Error']
|
||||||
list(Bxy.est.amelia.full = est.x.mi,
|
|
||||||
|
est.z.mi <- coefse['z','Estimate']
|
||||||
|
est.z.se <- coefse['z','Std.Error']
|
||||||
|
amelia_result <- list(Bxy.est.amelia.full = est.x.mi,
|
||||||
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
|
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
|
||||||
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
|
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se,
|
||||||
))
|
Bzy.est.amelia.full = est.z.mi,
|
||||||
|
|
||||||
est.z.mi <- coefse['z','Estimate']
|
|
||||||
est.z.se <- coefse['z','Std.Error']
|
|
||||||
|
|
||||||
result <- append(result,
|
|
||||||
list(Bzy.est.amelia.full = est.z.mi,
|
|
||||||
Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
|
Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
|
||||||
Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
|
Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
|
||||||
))
|
)
|
||||||
|
},
|
||||||
|
error = function(e){
|
||||||
|
result[['error']] <- e}
|
||||||
|
)
|
||||||
|
result <- append(result,amelia_result)
|
||||||
|
|
||||||
return(result)
|
return(result)
|
||||||
|
|
||||||
@@ -207,7 +231,7 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu
|
|||||||
|
|
||||||
|
|
||||||
## outcome_formula, proxy_formula, and truth_formula are passed to measerr_mle
|
## outcome_formula, proxy_formula, and truth_formula are passed to measerr_mle
|
||||||
run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NULL, truth_formula=NULL){
|
run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NULL, truth_formula=NULL, confint_method='quad'){
|
||||||
|
|
||||||
accuracy <- df[,mean(w_pred==x)]
|
accuracy <- df[,mean(w_pred==x)]
|
||||||
accuracy.y0 <- df[y<=0,mean(w_pred==x)]
|
accuracy.y0 <- df[y<=0,mean(w_pred==x)]
|
||||||
@@ -272,57 +296,119 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NUL
|
|||||||
Bzy.ci.upper.naive = naive.ci.Bzy[2],
|
Bzy.ci.upper.naive = naive.ci.Bzy[2],
|
||||||
Bzy.ci.lower.naive = naive.ci.Bzy[1]))
|
Bzy.ci.lower.naive = naive.ci.Bzy[1]))
|
||||||
|
|
||||||
|
amelia_result <- list(
|
||||||
|
Bxy.est.amelia.full = NULL,
|
||||||
|
Bxy.ci.upper.amelia.full = NULL,
|
||||||
|
Bxy.ci.lower.amelia.full = NULL,
|
||||||
|
Bzy.est.amelia.full = NULL,
|
||||||
|
Bzy.ci.upper.amelia.full = NULL,
|
||||||
|
Bzy.ci.lower.amelia.full = NULL
|
||||||
|
)
|
||||||
|
|
||||||
|
tryCatch({
|
||||||
|
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
|
||||||
|
mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
|
||||||
|
(coefse <- combine_coef_se(mod.amelia.k))
|
||||||
|
|
||||||
|
est.x.mi <- coefse['x.obs','Estimate']
|
||||||
|
est.x.se <- coefse['x.obs','Std.Error']
|
||||||
|
est.z.mi <- coefse['z','Estimate']
|
||||||
|
est.z.se <- coefse['z','Std.Error']
|
||||||
|
|
||||||
|
amelia_result <- list(Bxy.est.amelia.full = est.x.mi,
|
||||||
|
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
|
||||||
|
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se,
|
||||||
|
Bzy.est.amelia.full = est.z.mi,
|
||||||
|
Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
|
||||||
|
Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
|
||||||
|
)
|
||||||
|
|
||||||
|
},
|
||||||
|
|
||||||
|
error = function(e){
|
||||||
|
result[['error']] <- e}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
|
result <- append(result, amelia_result)
|
||||||
mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
|
|
||||||
(coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
|
|
||||||
|
|
||||||
est.x.mi <- coefse['x.obs','Estimate']
|
|
||||||
est.x.se <- coefse['x.obs','Std.Error']
|
|
||||||
result <- append(result,
|
|
||||||
list(Bxy.est.amelia.full = est.x.mi,
|
|
||||||
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
|
|
||||||
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
|
|
||||||
))
|
|
||||||
|
|
||||||
est.z.mi <- coefse['z','Estimate']
|
|
||||||
est.z.se <- coefse['z','Std.Error']
|
|
||||||
|
|
||||||
result <- append(result,
|
|
||||||
list(Bzy.est.amelia.full = est.z.mi,
|
|
||||||
Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
|
|
||||||
Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
|
|
||||||
))
|
|
||||||
|
|
||||||
|
|
||||||
|
mle_result <- list(Bxy.est.mle = NULL,
|
||||||
|
Bxy.ci.upper.mle = NULL,
|
||||||
|
Bxy.ci.lower.mle = NULL,
|
||||||
|
Bzy.est.mle = NULL,
|
||||||
|
Bzy.ci.upper.mle = NULL,
|
||||||
|
Bzy.ci.lower.mle = NULL)
|
||||||
|
|
||||||
|
tryCatch({
|
||||||
temp.df <- copy(df)
|
temp.df <- copy(df)
|
||||||
temp.df <- temp.df[,x:=x.obs]
|
temp.df <- temp.df[,x:=x.obs]
|
||||||
mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
|
mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula, method='optim')
|
||||||
fisher.info <- solve(mod.caroll.lik$hessian)
|
fischer.info <- solve(mod.caroll.lik$hessian)
|
||||||
coef <- mod.caroll.lik$par
|
coef <- mod.caroll.lik$par
|
||||||
ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96
|
ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96
|
||||||
ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96
|
ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96
|
||||||
|
|
||||||
|
mle_result <- list(Bxy.est.mle = coef['x'],
|
||||||
|
Bxy.ci.upper.mle = ci.upper['x'],
|
||||||
|
Bxy.ci.lower.mle = ci.lower['x'],
|
||||||
|
Bzy.est.mle = coef['z'],
|
||||||
|
Bzy.ci.upper.mle = ci.upper['z'],
|
||||||
|
Bzy.ci.lower.mle = ci.lower['z'])
|
||||||
|
},
|
||||||
|
error=function(e) {result[['error']] <- as.character(e)
|
||||||
|
})
|
||||||
|
|
||||||
result <- append(result,
|
result <- append(result, mle_result)
|
||||||
list(Bxy.est.mle = coef['x'],
|
mle_result_proflik <- list(Bxy.est.mle.profile = NULL,
|
||||||
Bxy.ci.upper.mle = ci.upper['x'],
|
Bxy.ci.upper.mle.profile = NULL,
|
||||||
Bxy.ci.lower.mle = ci.lower['x'],
|
Bxy.ci.lower.mle.profile = NULL,
|
||||||
Bzy.est.mle = coef['z'],
|
Bzy.est.mle.profile = NULL,
|
||||||
Bzy.ci.upper.mle = ci.upper['z'],
|
Bzy.ci.upper.mle.profile = NULL,
|
||||||
Bzy.ci.lower.mle = ci.lower['z']))
|
Bzy.ci.lower.mle.profile = NULL)
|
||||||
|
|
||||||
mod.zhang.lik <- zhang.mle.iv(df)
|
tryCatch({
|
||||||
coef <- coef(mod.zhang.lik)
|
## confint_method == 'bbmle'
|
||||||
ci <- confint(mod.zhang.lik,method='quad')
|
mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula, method='bbmle')
|
||||||
result <- append(result,
|
coef <- coef(mod.caroll.lik)
|
||||||
list(Bxy.est.zhang = coef['Bxy'],
|
ci <- confint(mod.caroll.lik, method='spline')
|
||||||
Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
|
ci.lower <- ci[,'2.5 %']
|
||||||
Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
|
ci.upper <- ci[,'97.5 %']
|
||||||
Bzy.est.zhang = coef['Bzy'],
|
|
||||||
Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
|
mle_result_proflik <- list(Bxy.est.mle.profile = coef['x'],
|
||||||
Bzy.ci.lower.zhang = ci['Bzy','2.5 %']))
|
Bxy.ci.upper.mle.profile = ci.upper['x'],
|
||||||
|
Bxy.ci.lower.mle.profile = ci.lower['x'],
|
||||||
|
Bzy.est.mle.profile = coef['z'],
|
||||||
|
Bzy.ci.upper.mle.profile = ci.upper['z'],
|
||||||
|
Bzy.ci.lower.mle.profile = ci.lower['z'])
|
||||||
|
},
|
||||||
|
|
||||||
|
error=function(e) {result[['error']] <- as.character(e)
|
||||||
|
})
|
||||||
|
|
||||||
|
result <- append(result, mle_result_proflik)
|
||||||
|
|
||||||
|
zhang_result <- list(Bxy.est.mle.zhang = NULL,
|
||||||
|
Bxy.ci.upper.mle.zhang = NULL,
|
||||||
|
Bxy.ci.lower.mle.zhang = NULL,
|
||||||
|
Bzy.est.mle.zhang = NULL,
|
||||||
|
Bzy.ci.upper.mle.zhang = NULL,
|
||||||
|
Bzy.ci.lower.mle.zhang = NULL)
|
||||||
|
|
||||||
|
tryCatch({
|
||||||
|
mod.zhang.lik <- zhang.mle.iv(df)
|
||||||
|
coef <- coef(mod.zhang.lik)
|
||||||
|
ci <- confint(mod.zhang.lik,method='quad')
|
||||||
|
zhang_result <- list(Bxy.est.zhang = coef['Bxy'],
|
||||||
|
Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
|
||||||
|
Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
|
||||||
|
Bzy.est.zhang = coef['Bzy'],
|
||||||
|
Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
|
||||||
|
Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])
|
||||||
|
},
|
||||||
|
error=function(e) {result[['error']] <- as.character(e)
|
||||||
|
})
|
||||||
|
result <- append(result, zhang_result)
|
||||||
|
|
||||||
## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
|
## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
|
||||||
## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms)
|
## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms)
|
||||||
|
|||||||
@@ -1,18 +1,23 @@
|
|||||||
|
library(ggdist)
|
||||||
|
|
||||||
summarize.estimator <- function(df, suffix='naive', coefname='x'){
|
summarize.estimator <- function(sims.df, suffix='naive', coefname='x'){
|
||||||
|
|
||||||
part <- df[,c('N',
|
reported_vars <- c(
|
||||||
'm',
|
'Bxy',
|
||||||
'Bxy',
|
paste0('B',coefname,'y.est.',suffix),
|
||||||
paste0('B',coefname,'y.est.',suffix),
|
paste0('B',coefname,'y.ci.lower.',suffix),
|
||||||
paste0('B',coefname,'y.ci.lower.',suffix),
|
paste0('B',coefname,'y.ci.upper.',suffix)
|
||||||
paste0('B',coefname,'y.ci.upper.',suffix),
|
)
|
||||||
'y_explained_variance',
|
|
||||||
'Bzx',
|
|
||||||
'Bzy',
|
grouping_vars <- c('N','m','B0', 'Bxy', 'Bzy', 'Bzx', 'Px', 'Py','y_explained_variance', 'prediction_accuracy','outcome_formula','proxy_formula','truth_formula','z_bias','y_bias')
|
||||||
'accuracy_imbalance_difference'
|
|
||||||
),
|
grouping_vars <- grouping_vars[grouping_vars %in% names(df)]
|
||||||
with=FALSE]
|
|
||||||
|
part <- sims.df[,
|
||||||
|
unique(c(reported_vars,
|
||||||
|
grouping_vars)),
|
||||||
|
with=FALSE]
|
||||||
|
|
||||||
|
|
||||||
true.in.ci <- as.integer((part$Bxy >= part[[paste0('B',coefname,'y.ci.lower.',suffix)]]) & (part$Bxy <= part[[paste0('B',coefname,'y.ci.upper.',suffix)]]))
|
true.in.ci <- as.integer((part$Bxy >= part[[paste0('B',coefname,'y.ci.lower.',suffix)]]) & (part$Bxy <= part[[paste0('B',coefname,'y.ci.upper.',suffix)]]))
|
||||||
@@ -25,14 +30,17 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){
|
|||||||
bias=bias,
|
bias=bias,
|
||||||
sign.correct =sign.correct)]
|
sign.correct =sign.correct)]
|
||||||
|
|
||||||
|
|
||||||
part.plot <- part[, .(p.true.in.ci = mean(true.in.ci),
|
part.plot <- part[, .(p.true.in.ci = mean(true.in.ci),
|
||||||
mean.bias = mean(bias),
|
mean.bias = mean(bias),
|
||||||
mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
|
mean.est = mean(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T),
|
||||||
var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]]),
|
var.est = var(.SD[[paste0('B',coefname,'y.est.',suffix)]],na.rm=T),
|
||||||
est.upper.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.975,na.rm=T),
|
est.upper.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.975,na.rm=T),
|
||||||
est.lower.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.025,na.rm=T),
|
est.lower.95 = quantile(.SD[[paste0('B',coefname,'y.est.',suffix)]],0.025,na.rm=T),
|
||||||
mean.ci.upper = mean(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],na.rm=T),
|
mean.ci.upper = mean(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],na.rm=T),
|
||||||
mean.ci.lower = mean(.SD[[paste0('B',coefname,'y.ci.lower.',suffix)]],na.rm=T),
|
mean.ci.lower = mean(.SD[[paste0('B',coefname,'y.ci.lower.',suffix)]],na.rm=T),
|
||||||
|
median.ci.upper = median(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],na.rm=T),
|
||||||
|
median.ci.lower = median(.SD[[paste0('B',coefname,'y.ci.lower.',suffix)]],na.rm=T),
|
||||||
ci.upper.975 = quantile(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],0.975,na.rm=T),
|
ci.upper.975 = quantile(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],0.975,na.rm=T),
|
||||||
ci.upper.025 = quantile(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],0.025,na.rm=T),
|
ci.upper.025 = quantile(.SD[[paste0('B',coefname,'y.ci.upper.',suffix)]],0.025,na.rm=T),
|
||||||
ci.lower.975 = quantile(.SD[[paste0('B',coefname,'y.ci.lower.',suffix)]],0.975,na.rm=T),
|
ci.lower.975 = quantile(.SD[[paste0('B',coefname,'y.ci.lower.',suffix)]],0.975,na.rm=T),
|
||||||
@@ -43,7 +51,7 @@ summarize.estimator <- function(df, suffix='naive', coefname='x'){
|
|||||||
variable=coefname,
|
variable=coefname,
|
||||||
method=suffix
|
method=suffix
|
||||||
),
|
),
|
||||||
by=c("N","m",'y_explained_variance','Bzx', 'Bzy', 'accuracy_imbalance_difference')
|
by=grouping_vars,
|
||||||
]
|
]
|
||||||
|
|
||||||
return(part.plot)
|
return(part.plot)
|
||||||
|
|||||||
Reference in New Issue
Block a user