1
0

Copy files into public version.

sio #	new file:   charts/example_2_dag/Makefile
This commit is contained in:
2022-05-26 01:32:07 -07:00
commit c7fa348220
41 changed files with 3460 additions and 0 deletions

BIN
simulations/.RData Normal file

Binary file not shown.

65
simulations/.Rhistory Normal file
View File

@@ -0,0 +1,65 @@
ls()
weight
weight
lablr
labelr
nrow(labelr)
names(labelr)
names(labelr$data)
labelr$data
labelr
names(labelr)
labelr$labelr
labelr$toxic
setwd("..")
q()
n
summary(w2)
summary(w2)
q()
n
summary(fit1)
0.5*(dat$x1 + sapply(dat$sdx, function(sd) rnorm(1,0,sd)))
summary(fit1)
summary(fit2)
summary(fit2)
conditional_effects(fit2,resp='y')
plot(conditional_effects(fit2,resp='y'))
stancode(fit2)
stancode(fit1)
sessionInfo()
q()
y
p.y
range(p.y)
rbinom
df2
df2
df2
brms.corrected.logit
q()
n
summary(brms.corrected.logit)
summary(brms.corrected.logit)
p.y
q()
n
mw
summary(mw)
)
summary(true.model)
true.model
true.model$R
true.model$null.deviance
true.model$deviance
getwd()
setwd("../../)
setwd("../../)
setwd("../../partitioning_reddit")
ls
getwd()
list.files()
install.packages("filelock")
q()
n

View File

@@ -0,0 +1,169 @@
### EXAMPLE 2: demonstrates how measurement error can lead to a type sign error in a covariate
### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
### Even when you include the proxy variable in the regression.
### But with some ground truth and multiple imputation, you can fix it.
library(argparser)
library(mecor)
library(ggplot2)
library(data.table)
library(filelock)
library(arrow)
library(Amelia)
library(Zelig)
options(amelia.parallel="multicore",
amelia.ncpus=40)
## SETUP:
### we want to estimate g -> y and x -> y; g is observed, x is MAR
### we have k -> x; g -> x; g->k; k is used to predict x via the model w.
### we have k -> w; x -> w; w is observed.
### for illustration, g is binary (e.g., gender==male).
### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
### Whether a comment is "racial harassment" depends on context, like the kind of person (i.e.,) the race of the person making the comment
### e.g., a Black person saying "n-word" is less likely to be racial harassement than if a white person does it.
### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar). So our "ground truth" takes this into accont.
### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racial harassing comment and of their race.
### simulation:
#### how much power do we get from the model in the first place? (sweeping N and m)
####
logistic <- function(x) {1/(1+exp(-1*x))}
simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
set.seed(seed)
## the true value of x
g <- rbinom(N, 1, 0.5)
k <- rnorm(N, 0, 1)
xprime <- Bkx*k + Bgx * g
x <- rbinom(N, 1, logistic(xprime - mean(xprime)))
w.model <- glm(x ~ k,family='binomial')
w <- as.integer(predict(w.model,data.frame(k=k),type='response') > 0.5)
## y = B0 + B1x + e
y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
df <- data.table(x=x,k=k,y=y,w=w,g=g)
if( m < N){
df <- df[sample(nrow(df), m), x.obs := x]
} else {
df <- df[, x.obs := x]
}
return(df)
}
run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
result <- list()
df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
result <- append(result, list(N=N,
m=m,
B0=B0,
Bxy=Bxy,
Bgy=Bgy,
Bkx=Bkx,
seed=seed))
accuracy <- df[,.(mean(w==x))]$V1
result <- append(result, list(accuracy=accuracy))
model.true <- lm(y ~ x + g, data=df)
true.ci.Bxy <- confint(model.true)['x',]
true.ci.Bgy <- confint(model.true)['g',]
result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
Bgy.est.true=coef(model.true)['g'],
Bxy.ci.upper.true = true.ci.Bxy[2],
Bxy.ci.lower.true = true.ci.Bxy[1],
Bgy.ci.upper.true = true.ci.Bgy[2],
Bgy.ci.lower.true = true.ci.Bgy[1]))
model.naive <- lm(y~w+g, data=df)
naive.ci.Bxy <- confint(model.naive)['w',]
naive.ci.Bgy <- confint(model.naive)['g',]
result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
Bgy.est.naive=coef(model.naive)['g'],
Bxy.ci.upper.naive = naive.ci.Bxy[2],
Bxy.ci.lower.naive = naive.ci.Bxy[1],
Bgy.ci.upper.naive = naive.ci.Bgy[2],
Bgy.ci.lower.naive = naive.ci.Bgy[1]))
## multiple imputation when k is observed
amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'),noms=c("x.obs","w","g"))
mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
est.x.mi <- coefse['x.obs','Estimate']
est.x.se <- coefse['x.obs','Std.Error']
result <- append(result,
list(Bxy.est.amelia.full = est.x.mi,
Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
))
est.g.mi <- coefse['g','Estimate']
est.g.se <- coefse['g','Std.Error']
result <- append(result,
list(Bgy.est.amelia.full = est.g.mi,
Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
))
## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs","w",'g'))
mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
est.x.mi <- coefse['x.obs','Estimate']
est.x.se <- coefse['x.obs','Std.Error']
result <- append(result,
list(Bxy.est.amelia.nok = est.x.mi,
Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
))
est.g.mi <- coefse['g','Estimate']
est.g.se <- coefse['g','Std.Error']
result <- append(result,
list(Bgy.est.amelia.nok = est.g.mi,
Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
))
return(result)
}
Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
ms <- c(30, 50, 100, 200, 300, 500)
B0 <- 0
Bxy <- 1
Bgy <- 0.3
Bkx <- 3
Bgx <- -4
seeds <- 1:100
rows <- list()
for(N in Ns){
print(N)
for(m in ms){
if(N>m){
for(seed in seeds){
rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
}
}
}
}
result <- rbindlist(rows)
write_feather(result, "example_2_simulation.feather")

31
simulations/notes.org Normal file
View File

@@ -0,0 +1,31 @@
** Next simulations to run
- Focus on the BKH *multiple imputation* approach --- We don't actually need overimputation when we have some gold standard data.
- The idea of this paper is basically to say "Supervised learning fits well into this framework. So use it!"
- Important findings will involve:
- How well does this work when the learner is biased?
Probably works great as long as you can account for the bias in imputation + regression
- How well does this work when features are unavailable?
- if the
- How well does this work when featues are available?
- How well does this work when measurement error is correlated with the independent and dependent variable?
- How well does this work when there's a very large number of missing values?
** How well does supervised ML measurement fit into BHK?
*** Unbiased proxy assumption: If data is missing but there's a proxy variable that's unbiased we can use m=1 techniques instead of m=2 techniques.
M=1 techniques are simpler / more powerful, but maybe m=2 techniques are more robust given we can't assume supervised learners are unbiased.
That said, if we have access to the features, then we can use the predictions and the features as our proxy variable. Since any bias in w will be correlated with the features, including the features in the likelihood will reduce the bias.
*** IMMA assumption: distribution of the mismeasurement indicator, m,is the same no matter the value of the missing data.
This isn't a problem if ground truth is randomly sampled.
It is a problem if ground truth is based on a stratified sample.
*** Measurement error distribution assumption: The distribution of the proxy variable (conditional on missing and observed data and its distributional parameters) known up to its parameters. The parameters are either known or a consistent estimator is available.
This isn't a problem if we have ground truth because we can use the ground truth to estimate the parameters.
If we don't have ground truth, we'll have to guess.