# ---- File: R/calculatePower.R ----
##############################################################################
#
# Purpose:
#  Use pilot project data to calculate power of a full study through simulation
#
# Parts:
#  (0) - Setup
#  (1) - Get the pilot data and clean it
#  (2) - Run the model on the pilot data and extract effects
#  (3) - Set up and run the simulation
#       ====> Set variables at the arrows <====
#
##############################################################################

# (0) - Setup
# NOTE(review): rm(list=ls()) only clears the global workspace; it does not
# detach packages or reset options. Kept for compatibility with how this
# script has been run so far -- prefer running in a fresh R session.
rm(list=ls())
set.seed(424242)

# to_logical() is used in step (1), which runs before powerAnalysis.R (the
# file that attaches batman) is sourced, so attach it explicitly here.
library('batman')

# (1) - Get the pilot data and clean it
# The sourced script is expected to leave the pilot data frame `d` in the
# workspace (it is used immediately below).
source('~/Research/tor_wikipedia_edits/handcoded_edits/inter_coder_reliability_ns0.R')
d$nd <- to_logical(d$not.damaging, custom_true=c("Y"))
# Relabels the levels by position -- presumably d$source is a factor whose
# four levels are already in this order; verify against the sourced script.
levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors")

# (2) - Run the model on the pilot data
pilotM <- glm(nd ~ source, family=binomial(link="logit"), data=d)
# Explicit print() so the summary is shown even when this file is run through
# source(), which does not auto-print top-level values.
print(summary(pilotM)) #we expect effect sizes on this order

# Column 1 of the coefficient table holds the estimates. Row 1 is the
# intercept (reference level); rows 2-4 are the other levels' offsets.
pilot.estimates <- coef(summary(pilotM))[, 1]
pilot.b0 <- pilot.estimates[[1]]
pilot.b1 <- pilot.estimates[[2]]
pilot.b2 <- pilot.estimates[[3]]
pilot.b3 <- pilot.estimates[[4]]


# (3) - Set up and run the simulation

source('powerAnalysis.R') #my little "lib"; path is relative to the working directory

#====>
nSims <- 5000 #how many simulations to run
n <- 100 #a guess for necessary sample size (per group)
#makeData(10) #DEBUGGING CODE -- you can uncomment this if you want to see it work
#<====

print("Levels are:")
print(levels(d$source))
# Each result is: share of simulations in which coefficient 1..4 was
# significant, then the share in which all four were significant at once.
print(powerCheck(n, nSims))

#Sample values
print(powerCheck(50, 100))
print(powerCheck(80, 1000))
print(powerCheck(200, 5000))

# ---- File: R/powerAnalysis.R ----
# This is semi-generic code for doing a power analysis of a logistic regression
# with 4 levels in a factor when there's some
# pilot values already available and defined
# modelled heavily on the simulation example explained in:
# http://meeting.spsp.org/2016/sites/default/files/Lane%2C%20Hennes%2C%20West%20SPSP%20Power%20Workshop%202016.pdf

# Both packages stay attached because scripts that source this file may rely
# on them (calculatePower.R calls to_logical(), presumably from batman).
# The functions below only need base R / stats.
library('batman')
library('reshape')

# Convert log-odds to a probability.
#
# b: numeric vector of log-odds (logit scale).
# Returns a numeric vector of probabilities in [0, 1], same length as b.
#
# plogis() is the logistic function exp(b) / (1 + exp(b)), but it stays finite
# for large |b|, where the hand-written ratio evaluates Inf / Inf and yields NaN.
l2p <- function(b) {
  plogis(b)
}


# Simulate one dataset with four groups of n binary outcomes each.
#
# n: number of observations per group.
# b: four logistic-regression coefficients: b[1] is the intercept (log-odds
#    for the reference group, Group0) and b[2:4] are the offsets of Group1-3
#    from it. Defaults to the pilot estimates pilot.b0 .. pilot.b3, which
#    must then exist in the calling workspace.
#
# Returns a long data frame with 4 * n rows and two columns:
#   source - factor with levels Group0, Group1, Group2, Group3
#   nd     - the simulated 0/1 outcome
#
# Only the point estimates are used; the uncertainty (standard errors) of the
# pilot coefficients is not propagated into the simulated probabilities.
makeData <- function(n, b = c(pilot.b0, pilot.b1, pilot.b2, pilot.b3)) {
  stopifnot(is.numeric(b), length(b) == 4L)

  group_names <- paste0("Group", 0:3)
  # Reference group gets the intercept alone; the others get intercept + offset.
  log_odds <- c(b[1], b[1] + b[-1])

  # Draw group by group, Group0 first, so the random-number stream is consumed
  # in a fixed order and results are reproducible under set.seed().
  draws <- lapply(log_odds, function(lo) {
    rbinom(n = n, size = 1, prob = l2p(lo))
  })

  data.frame(
    source = factor(rep(group_names, each = n), levels = group_names),
    nd = unlist(draws, use.names = FALSE)
  )
}

# Estimate power by simulation for the model nd ~ source.
#
# n:     observations per group in each simulated dataset.
# nSims: number of simulated datasets (at least 1).
# alpha: significance threshold applied to each coefficient's p-value.
# b:     coefficients passed on to makeData(); see there.
#
# Returns an unnamed numeric vector of length 5: the proportion of simulations
# in which coefficient 1 (intercept), 2, 3 and 4 had p <= alpha, followed by
# the proportion in which all four did simultaneously.
powerCheck <- function(n, nSims, alpha = 0.05,
                       b = c(pilot.b0, pilot.b1, pilot.b2, pilot.b3)) {
  stopifnot(is.numeric(nSims), length(nSims) == 1L, nSims >= 1)
  force(b) # resolve the default once instead of once per simulation

  # One column per simulation; rows are the four coefficients plus "all four".
  outcomes <- vapply(seq_len(nSims), function(s) {
    simData <- makeData(n, b)
    fit <- glm(nd ~ source, family = binomial(link = "logit"), data = simData)
    # Column 4 of the coefficient table holds the p-values; summarise once.
    p_values <- coef(summary(fit))[1:4, 4]
    hits <- unname(p_values <= alpha)
    c(hits, all(hits))
  }, logical(5))

  unname(rowMeans(outcomes))
}