Copy files into public version.

sio # new file: charts/example_2_dag/Makefile
2022-05-26 01:32:07 -07:00
commit c7fa348220
41 changed files with 3460 additions and 0 deletions
--- a/simulations/.RData
+++ b/simulations/.RData
--- a/simulations/.Rhistory
+++ b/simulations/.Rhistory
@@ -0,0 +1,65 @@
+ls()
+weight
+weight
+lablr
+labelr
+nrow(labelr)
+names(labelr)
+names(labelr$data)
+labelr$data
+labelr
+names(labelr)
+labelr$labelr
+labelr$toxic
+setwd("..")
+q()
+n
+summary(w2)
+summary(w2)
+q()
+n
+summary(fit1)
+0.5*(dat$x1 + sapply(dat$sdx, function(sd) rnorm(1,0,sd)))
+summary(fit1)
+summary(fit2)
+summary(fit2)
+conditional_effects(fit2,resp='y')
+plot(conditional_effects(fit2,resp='y'))
+stancode(fit2)
+stancode(fit1)
+sessionInfo()
+q()
+y
+p.y
+range(p.y)
+rbinom
+df2
+df2
+df2
+brms.corrected.logit
+q()
+n
+summary(brms.corrected.logit)
+summary(brms.corrected.logit)
+p.y
+q()
+n
+mw
+summary(mw)
+)
+summary(true.model)
+true.model
+true.model$R
+true.model$null.deviance
+true.model$deviance
+getwd()
+setwd("../../)
+setwd("../../)
+
+setwd("../../partitioning_reddit")
+ls
+getwd()
+list.files()
+install.packages("filelock")
+q()
+n
--- a/simulations/mi_simulations/example_2_binary.R
+++ b/simulations/mi_simulations/example_2_binary.R
@@ -0,0 +1,169 @@
+### EXAMPLE 2: demonstrates how measurement error can lead to a type sign error in a covariate
+### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
+### Even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+
+options(amelia.parallel="multicore",
+        amelia.ncpus=40)
+
+## SETUP:
+### we want to estimate g -> y and x -> y; g is observed, x is MAR
+### we have k -> x; g -> x; g->k; k is used to predict x via the model w.
+### we have k -> w; x -> w; w is observed.
+### for illustration, g is binary (e.g., gender==male).
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
+### Whether a comment is "racial harassment" depends on context, like the kind of person (i.e.,) the race of the person making the comment
+### e.g., a Black person saying "n-word" is less likely to be racial harassement than if a white person does it.
+### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
+### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar). So our "ground truth" takes this into accont.
+### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racial harassing comment and of their race. 
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+#### 
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    set.seed(seed)
+
+    ## the true value of x
+
+    g <- rbinom(N, 1, 0.5)
+    k <- rnorm(N, 0, 1)
+    xprime <- Bkx*k + Bgx * g
+    x <- rbinom(N, 1, logistic(xprime - mean(xprime)))
+    w.model <- glm(x ~ k,family='binomial')
+    w <- as.integer(predict(w.model,data.frame(k=k),type='response') > 0.5)
+    ## y = B0 + B1x + e
+    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0
+    df <- data.table(x=x,k=k,y=y,w=w,g=g)
+    if( m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+    return(df)
+}
+
+
+run_simulation <-  function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    result <- list()
+    df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+
+    result <- append(result, list(N=N,
+                                  m=m,
+                                  B0=B0,
+                                  Bxy=Bxy,
+                                  Bgy=Bgy,
+                                  Bkx=Bkx,
+                                  seed=seed))
+
+    accuracy <- df[,.(mean(w==x))]$V1
+    result <- append(result, list(accuracy=accuracy))
+
+    model.true <- lm(y ~ x + g, data=df)
+    true.ci.Bxy <- confint(model.true)['x',]
+    true.ci.Bgy <- confint(model.true)['g',]
+
+    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+                                  Bgy.est.true=coef(model.true)['g'],
+                                  Bxy.ci.upper.true = true.ci.Bxy[2],
+                                  Bxy.ci.lower.true = true.ci.Bxy[1],
+                                  Bgy.ci.upper.true = true.ci.Bgy[2],
+                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
+                                  
+
+    model.naive <- lm(y~w+g, data=df)
+    
+    naive.ci.Bxy <- confint(model.naive)['w',]
+    naive.ci.Bgy <- confint(model.naive)['g',]
+
+    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+                                  Bgy.est.naive=coef(model.naive)['g'],
+                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
+                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
+                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
+                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+                                  
+
+    ## multiple imputation when k is observed
+    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'),noms=c("x.obs","w","g"))
+    mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+    coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.full = est.x.mi,
+                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.full = est.g.mi,
+                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+                          ))
+
+    ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
+    amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs","w",'g'))
+    mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+    coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.nok = est.x.mi,
+                          Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.nok = est.g.mi,
+                          Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+                          ))
+
+    return(result)
+}
+
+Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
+ms <- c(30, 50, 100, 200, 300, 500)
+B0 <- 0
+Bxy <- 1
+Bgy <- 0.3
+Bkx <- 3
+Bgx <- -4
+seeds <- 1:100
+
+rows <- list()
+
+for(N in Ns){
+    print(N)
+    for(m in ms){
+        if(N>m){
+            for(seed in seeds){
+                rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
+            }
+        }
+    }
+}
+
+result <- rbindlist(rows)
+write_feather(result, "example_2_simulation.feather")
--- a/simulations/notes.org
+++ b/simulations/notes.org
@@ -0,0 +1,31 @@
+** Next simulations to run
+- Focus on the BKH *multiple imputation* approach --- We don't actually need overimputation when we have some gold standard data.
+- The idea of this paper is basically to say "Supervised learning fits well into this framework. So use it!"
+- Important findings will involve: 
+  - How well does this work when the learner is biased? 
+    Probably works great as long as you can account for the bias in imputation + regression
+  - How well does this work when features are unavailable? 
+    - if the 
+  - How well does this work when featues are available? 
+  - How well does this work when measurement error is correlated with the independent and dependent variable?
+  - How well does this work when there's a very large number of missing values?
+
+
+
+** How well does supervised ML measurement fit into BHK? 
+*** Unbiased proxy assumption:  If data is missing but there's a proxy variable that's unbiased we can use m=1 techniques instead of m=2 techniques.   
+
+M=1 techniques are simpler / more powerful, but maybe m=2 techniques are more robust given we can't assume supervised learners are unbiased. 
+
+That said, if we have access to the features, then we can use the predictions and the features as our proxy variable.  Since any bias in w will be correlated with the features, including the features in the likelihood will reduce the bias. 
+
+*** IMMA assumption: distribution of the mismeasurement indicator, m,is the same no matter the value of the missing data.
+
+This isn't a problem if ground truth is randomly sampled. 
+It is a problem if ground truth is based on a stratified sample.   
+
+*** Measurement error distribution assumption: The distribution of the proxy variable (conditional on missing and observed data and its distributional parameters) known up to its parameters. The parameters are either known or a consistent estimator is available.
+ 
+This isn't a problem if we have ground truth because we can use the ground truth to estimate the parameters.  
+
+If we don't have ground truth, we'll have to guess.