##############################################################################
#
# Purpose:
#   Use pilot project data to calculate power of a full study through
#   simulation
#
# Parts:
#   (0) - Setup
#   (1) - Get the pilot data and clean it
#   (2) - Run the model on the pilot data and extract effects
#   (3) - Set up and run the simulation
#         ====> Set variables at the arrows <====
#
##############################################################################

# (0) - Setup ----
# NOTE: the workspace is deliberately NOT wiped with rm(list = ls()); run this
# script in a fresh R session instead, so it can never destroy unrelated
# objects in whatever session happens to source it.
set.seed(424242)

library(readr)
library(ggplot2)

# (1) - Get the pilot data and clean it ----
#source('~/Research/tor_wikipedia_edits/handcoded_edits/inter_coder_reliability_ns0.R')
#source ('/data/users/mgaughan/kkex_data_110823_3')
data1 <- read_csv("../power_data_111023_mmt.csv", show_col_types = FALSE)
# The results table gets its own name; `data2` is reserved for the age-filtered
# subset of the pilot data that is built further down.
pkg_results <- read_csv(
  "../inst_all_packages_full_results.csv",
  show_col_types = FALSE
)
#d$nd <- to_logical(d$not.damaging, custom_true=c("Y"))
#levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors")

# Look every pilot package up in the results table twice: once under a
# "python-" prefixed lower-cased name and once under its plain lower-cased
# name. Packages with no match yield NA.
python_labeled <- as.numeric(
  pkg_results$up.fac.mean[
    match(paste("python", tolower(data1$pkg), sep = "-"), pkg_results$pkg)
  ]
)
same_labeled <- as.numeric(
  pkg_results$up.fac.mean[match(tolower(data1$pkg), pkg_results$pkg)]
)
# Keep the smaller of the two lookups; with na.rm = TRUE a single available
# value is used as-is and the result is NA only when neither name matched.
data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm = TRUE)
# Collapse the milestone column to a 0/1 indicator (1 when it is > 0).
data1$milestones <- as.numeric(data1$milestones > 0)

# (2) - Run the model on the pilot data and extract effects ----
data1$formal.score <- data1$mmt / (data1$milestones / data1$age)
# milestones is 0/1 at this point, so rows with milestones == 0 divide by zero
# and produce Inf/NaN. Mark those scores as missing: lm() aborts on Inf in a
# predictor and cor.test() would return NaN, whereas NA rows are simply dropped.
data1$formal.score[!is.finite(data1$formal.score)] <- NA_real_

table(data1$milestones)
hist(data1$old_mmt) #inequality of participation
hist(data1$formal.score)
hist(data1$age / 365)  # presumably age in years (assumes age is in days) -- TODO confirm

# Single-predictor models; kmodel1 is overwritten on purpose, only the printed
# summaries are of interest.
kmodel1 <- lm(up.fac.mean ~ mmt, data = data1)
summary(kmodel1)
kmodel1 <- lm(up.fac.mean ~ old_mmt, data = data1)
summary(kmodel1)
kmodel1 <- lm(up.fac.mean ~ formal.score, data = data1)
summary(kmodel1)
hist(data1$formal.score)

cor.test(data1$formal.score, data1$up.fac.mean)
cor.test(data1$mmt, data1$up.fac.mean)
cor.test(data1$milestones, data1$up.fac.mean)
cor.test(data1$age, data1$up.fac.mean)

g <- ggplot(data1, aes(x = formal.score, y = up.fac.mean)) +
  geom_point() +
  geom_smooth()
g

# Restrict to rows whose age / 365 is below 14 (rows with missing age are
# dropped by subset() as well).
data2 <- subset(data1, age / 365 < 14)
hist(data2$age)
g <- ggplot(data2, aes(x = formal.score, y = up.fac.mean)) +
  geom_point() +
  geom_smooth()
g

data2$yearsOld <- data2$age / 365
kmodel2 <- lm(up.fac.mean ~ mmt + milestones + yearsOld, data = data2)
summary(kmodel2)

# Previously attempted specification, kept for reference:
#pilotM <- glm(up.fac.mean ~ ((mmt) / (milestones/age)), # give the anticipated regression a try
#                family=gaussian(link='identity'), data=data1)
#
# pilotM has to exist before the effect sizes below can be extracted; with the
# assignment above commented out the script stopped with
# "object 'pilotM' not found".
# NOTE(review): the kmodel2 specification is used here because it is the only
# one in this script that yields the four coefficients (b0..b3) read below --
# confirm this is the intended pilot model.
pilotM <- glm(
  up.fac.mean ~ mmt + milestones + yearsOld,
  family = gaussian(link = "identity"),
  data = data2
)
summary(pilotM) #we expect effect sizes on this order

# Extract the estimates by name rather than by row position: summary() omits
# aliased (NA) coefficients, so positional indexing could silently pick up the
# wrong term, while a missing name fails loudly.
pilot_coefs <- coef(summary(pilotM))
pilot.b0 <- pilot_coefs["(Intercept)", "Estimate"]
pilot.b1 <- pilot_coefs["mmt", "Estimate"]
pilot.b2 <- pilot_coefs["milestones", "Estimate"]
pilot.b3 <- pilot_coefs["yearsOld", "Estimate"]

# (3) - Set up and run the simulation ----
source("powerAnalysis.R") #my little "lib"

#====>
nSims <- 5000 #how many simulations to run
n <- 100 #a guess for necessary sample size (per group)
#makeData(10) #DEBUGGING CODE -- you can uncomment this if you want to see it work
#<====

#print("Levels are:")
#print(levels(d$source))
powerCheck(n, nSims)

#Sample values
powerCheck(50, 100)
powerCheck(80, 1000)
powerCheck(200, 5000)