24_deb_pkg_gov/R/calculatePower.R

##############################################################################
#
# Purpose:
# Use pilot project data to calculate power of a full study through simulation
#
# Parts:
# (0) - Setup
# (1) - Get the pilot data and clean it
# (2) - Run the model on the pilot data and extract effects
# (3) - Set up and run the simulation
# ====> Set variables at the arrows <====
#
##############################################################################
# (0) - Setup
# NOTE(review): rm(list = ls()) wipes every object in the calling workspace.
# It is kept so the script behaves as before, but starting from a fresh R
# session is the safer way to get a clean state.
rm(list = ls())
set.seed(424242) # fix the RNG state for everything that follows

library(readr)
library(ggplot2)

# (1) - Get the pilot data and clean it
# Pilot measurements; the columns pkg, mmt, milestones and age are used below.
data1 <- read_csv("../power_data_111023_mmt.csv", show_col_types = FALSE)
# Lookup table supplying the outcome (up.fac.mean) keyed by pkg.
# show_col_types = FALSE matches the call above and silences the column-spec
# printout.
data2 <- read_csv(
  "../inst_all_packages_full_results.csv",
  show_col_types = FALSE
)

# Look up each pilot package in data2 under two candidate names,
# "python-<pkg>" and "<pkg>". Only the data1 side is lower-cased.
# NOTE(review): presumably data2$pkg is already lower case -- confirm.
python_labeled <- as.numeric(
  data2$up.fac.mean[
    match(paste("python", tolower(data1$pkg), sep = "-"), data2$pkg)
  ]
)
same_labeled <- as.numeric(
  data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)]
)

# With na.rm = TRUE, pmin() keeps whichever lookup succeeded, takes the
# smaller value when both did, and yields NA when neither did.
data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm = TRUE)

# Surface rows left without an outcome instead of letting them pass silently.
if (anyNA(data1$up.fac.mean)) {
  message(
    sum(is.na(data1$up.fac.mean)), " of ", nrow(data1),
    " pilot rows have no up.fac.mean after matching"
  )
}

# Recode milestones to a two-level numeric: 1 = none, 2 = at least one.
data1$milestones <- as.numeric(data1$milestones > 0) + 1
# (2) - Run the model on the pilot data
# First pass at the combined score using raw age (in days, judging by the
# /365 conversions below). It is recomputed with binned age further down.
data1$formal.score <- data1$mmt / (data1$milestones / data1$age)
table(data1$milestones)
# old_mmt is never created in this script, so only plot it when the input
# data supplies it; hist() stops with an error on a missing (NULL) column.
if ("old_mmt" %in% names(data1)) {
  hist(data1$old_mmt, prob = TRUE) # inequality of participation
}
hist(data1$formal.score)
hist(data1$age / 365)
data1$new_mmt <- data1$mmt - 1
hist(data1$new_mmt, prob = TRUE)

# Bin age in years into four groups: (0,9], (9,12], (12,15], (15,17].
# Ages outside (0, 17] become NA. as.numeric() on the factor returns the
# level codes 1..4, which coincide with the labels.
data1$new.age <- as.numeric(
  cut(data1$age / 365, breaks = c(0, 9, 12, 15, 17), labels = c(1, 2, 3, 4))
)
# Recompute the score with binned age in place of raw age.
data1$formal.score <- data1$mmt / (data1$milestones / data1$new.age)
hist(as.numeric(data1$new.age))
table(data1$formal.score)
hist(data1$formal.score)

# Single-predictor fits, each overwriting kmodel1 (exploratory only; the last
# assignment, the milestones model, is what kmodel1 holds afterwards).
kmodel1 <- lm(up.fac.mean ~ mmt, data = data1)
summary(kmodel1)
kmodel1 <- lm(up.fac.mean ~ new_mmt, data = data1)
summary(kmodel1)
kmodel1 <- lm(up.fac.mean ~ new.age, data = data1)
summary(kmodel1)
kmodel1 <- lm(up.fac.mean ~ milestones, data = data1)
summary(kmodel1)
hist(data1$formal.score)
cor.test(data1$formal.score, data1$up.fac.mean)
cor.test(data1$mmt, data1$up.fac.mean)
cor.test(data1$milestones, data1$up.fac.mean)
cor.test(data1$age, data1$up.fac.mean)

g <- ggplot(data1, aes(x = new_mmt, y = up.fac.mean)) +
  geom_point() +
  geom_smooth()
g

# From here on data2 is the subset of pilot rows younger than 14 years; it no
# longer refers to the table read from inst_all_packages_full_results.csv.
data2 <- subset(data1, (data1$age / 365) < 14)
hist(floor(data2$age))
g <- ggplot(data2, aes(x = mmt, y = up.fac.mean)) +
  geom_point() +
  geom_smooth()
g

data2$yearsOld <- floor(data2$age / 365)

# Candidate models; kmodel2 (all three predictors) feeds the simulation.
kmodel2 <- lm(up.fac.mean ~ mmt + milestones + age, data = data1)
kmodel5 <- lm(up.fac.mean ~ mmt + milestones, data = data1)
kmodel4 <- lm(up.fac.mean ~ mmt + age, data = data1)
kmodel3 <- lm(up.fac.mean ~ formal.score, data = data1)
summary(kmodel2)
summary(kmodel3)
summary(kmodel4)
summary(kmodel5)

# The anticipated regression below is disabled, so pilotM does not exist.
# Its summary() call is disabled with it: left active, it stopped the script
# with "object 'pilotM' not found". kmodel2 supplies the effect sizes instead.
#pilotM <- glm(up.fac.mean ~ ((mmt) / (milestones/age)),   # give the anticipated regression a try
#                  family=gaussian(link='identity'), data=data1)
#summary(pilotM) #we expect effect sizes on this order

# Pull the pilot effect sizes by coefficient name rather than by row position,
# so a dropped (aliased) term cannot silently shift the wrong estimate into
# pilot.b2 / pilot.b3.
pilot.b0 <- coef(kmodel2)[["(Intercept)"]]
pilot.b1 <- coef(kmodel2)[["mmt"]]
pilot.b2 <- coef(kmodel2)[["milestones"]]
pilot.b3 <- coef(kmodel2)[["age"]]

# Show the extracted effect sizes together.
c(b0 = pilot.b0, b1 = pilot.b1, b2 = pilot.b2, b3 = pilot.b3)

# qqline() only adds a reference line to an existing plot, so draw the normal
# Q-Q plot first.
qqnorm(data1$up.fac.mean)
qqline(data1$up.fac.mean)

# na.rm = TRUE: rows without a matched outcome would otherwise make this NA.
sd(data1$up.fac.mean, na.rm = TRUE)
# (3) - Set up and run the simulation
# qqline() only adds a reference line to an existing plot, so draw the normal
# Q-Q plot of mmt first.
qqnorm(data1$mmt)
qqline(data1$mmt)

# my little "lib": expected to define powerCheck() and powerCheck2(), neither
# of which is defined in this file.
source('powerAnalysis.R')

#====>
nSims <- 5000 # how many simulations to run
n <- 100 # a guess for necessary sample size (per group)
#makeData(10) #DEBUGGING CODE -- you can uncomment this if you want to see it work
#<====

powerCheck(n, nSims)
# powerCheck2(n, nSims)  # disabled: does not really work

# Sample values: power at a few candidate sample sizes, 1000 simulations each.
powerCheck(100, 1000)
powerCheck(200, 1000)
powerCheck(300, 1000)

# NOTE(review): powerCheck2 is flagged above as not really working, yet it is
# still run here -- confirm these results are meant to be used.
powerCheck2(50, 1000)
powerCheck2(200, 1000)
powerCheck2(500, 1000)
adds my power analysis example 2023-11-08 17:10:54 +00:00			`##############################################################################`
			`#`
			`# Purpose:`
			`# Use pilot project data to calculate power of a full study through simulation`
			`#`
			`# Parts:`
			`# (0) - Setup`
			`# (1) - Get the pilot data and clean it`
			`# (2) - Run the model on the pilot data and extract effects`
			`# (3) - Set up and run the simulation`
			`# ====> Set variables at the arrows <====`
			`#`
			`##############################################################################`
			`rm(list=ls())`
			`set.seed(424242)`

loading in csv data 2023-11-09 16:45:16 +00:00			`library(readr)`
demos some changes 2023-11-10 19:22:58 +00:00			`library(ggplot2)`
loading in csv data 2023-11-09 16:45:16 +00:00
adds my power analysis example 2023-11-08 17:10:54 +00:00			`# (1) - Get the pilot data and clean it`
getting started on R things 2023-11-09 02:46:12 +00:00			`#source('~/Research/tor_wikipedia_edits/handcoded_edits/inter_coder_reliability_ns0.R')`
loading in csv data 2023-11-09 16:45:16 +00:00			`#source ('/data/users/mgaughan/kkex_data_110823_3')`
initial exploration of new data 2023-11-10 21:46:26 +00:00			`data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE)`
demos some changes 2023-11-10 19:22:58 +00:00			`data2 <- read_csv('../inst_all_packages_full_results.csv')`
loading in csv data 2023-11-09 16:45:16 +00:00			`#d$nd <- to_logical(d$not.damaging, custom_true=c("Y"))`
			`#levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors")`
expanding matching for data 2023-11-13 16:52:40 +00:00			`python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)])`
			`same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)])`
			`data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE)`
updates to analysis 2023-11-14 04:30:40 +00:00			`data1$milestones <- as.numeric(data1$milestones > 0) + 1`
adds my power analysis example 2023-11-08 17:10:54 +00:00			`# (2) - Run the model on the pilot data`
demos some changes 2023-11-10 19:22:58 +00:00			`data1$formal.score <- data1$mmt / (data1$milestones/data1$age)`
			`table(data1$milestones)`
working power analysis 2023-11-16 03:31:08 +00:00			`hist(data1$old_mmt, prob=TRUE) #inequality of participation`
demos some changes 2023-11-10 19:22:58 +00:00			`hist(data1$formal.score)`
			`hist(data1$age/365)`
working power analysis 2023-11-16 03:31:08 +00:00			`data1$new_mmt <- data1$mmt - 1`
			`hist(data1$new_mmt, prob=TRUE)`
binning age and trying to figure out power 2023-11-15 04:54:32 +00:00
			`data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))`
			`data1$formal.score <- data1$mmt / (data1$milestones/data1$new.age)`
			`hist(as.numeric(data1$new.age))`
			`table(data1$formal.score)`
more fiddling w power analysis 2023-11-15 17:24:51 +00:00			`hist(data1$formal.score)`
demos some changes 2023-11-10 19:22:58 +00:00			`kmodel1 <- lm(up.fac.mean ~ mmt, data=data1)`
			`summary(kmodel1)`
working power analysis 2023-11-16 03:31:08 +00:00			`kmodel1 <- lm(up.fac.mean ~ new_mmt, data=data1)`
expanding matching for data 2023-11-13 16:52:40 +00:00			`summary(kmodel1)`
binning age and trying to figure out power 2023-11-15 04:54:32 +00:00			`kmodel1 <- lm(up.fac.mean ~ new.age, data=data1)`
			`summary(kmodel1)`
more fiddling w power analysis 2023-11-15 17:24:51 +00:00			`kmodel1 <- lm(up.fac.mean ~ milestones, data=data1)`
			`summary(kmodel1)`
demos some changes 2023-11-10 19:22:58 +00:00			`hist(data1$formal.score)`
			`cor.test(data1$formal.score, data1$up.fac.mean)`
			`cor.test(data1$mmt, data1$up.fac.mean)`
			`cor.test(data1$milestones, data1$up.fac.mean)`
			`cor.test(data1$age, data1$up.fac.mean)`

working power analysis 2023-11-16 03:31:08 +00:00			`g <- ggplot(data1, aes(x=new_mmt, y=up.fac.mean)) +`
demos some changes 2023-11-10 19:22:58 +00:00			`geom_point() +`
more fiddling w power analysis 2023-11-15 17:24:51 +00:00			`geom_smooth()`
demos some changes 2023-11-10 19:22:58 +00:00			`g`

expanding matching for data 2023-11-13 16:52:40 +00:00			`data2 <- subset(data1, (data1$age / 365) < 14 )`
binning age and trying to figure out power 2023-11-15 04:54:32 +00:00			`hist(floor(data2$age))`
working power analysis 2023-11-16 03:31:08 +00:00			`g <- ggplot(data2, aes(x=mmt, y=up.fac.mean)) +`
demos some changes 2023-11-10 19:22:58 +00:00			`geom_point() +`
more fiddling w power analysis 2023-11-15 17:24:51 +00:00			`geom_smooth()`
demos some changes 2023-11-10 19:22:58 +00:00			`g`

binning age and trying to figure out power 2023-11-15 04:54:32 +00:00			`data2$yearsOld <- floor(data2$age / 365)`
demos some changes 2023-11-10 19:22:58 +00:00
drafted power analysis 2023-11-14 21:44:28 +00:00			`kmodel2 <- lm(up.fac.mean ~ mmt + milestones + age, data=data1)`
model without age, and a few more sim samples 2023-11-15 00:14:31 +00:00			`kmodel5 <- lm(up.fac.mean ~ mmt + milestones, data=data1)`
drafted power analysis 2023-11-14 21:44:28 +00:00			`kmodel4 <- lm(up.fac.mean ~ mmt + age, data=data1)`
			`kmodel3 <- lm(up.fac.mean ~ formal.score, data=data1)`
demos some changes 2023-11-10 19:22:58 +00:00			`summary(kmodel2)`
drafted power analysis 2023-11-14 21:44:28 +00:00			`summary(kmodel3)`
			`summary(kmodel4)`
model without age, and a few more sim samples 2023-11-15 00:14:31 +00:00			`summary(kmodel5)`
demos some changes 2023-11-10 19:22:58 +00:00
			`#pilotM <- glm(up.fac.mean ~ ((mmt) / (milestones/age)), # give the anticipated regression a try`
			`# family=gaussian(link='identity'), data=data1)`
adds my power analysis example 2023-11-08 17:10:54 +00:00			`summary(pilotM) #we expect effect sizes on this order`

updates to analysis 2023-11-14 04:30:40 +00:00			`pilot.b0 <- coef(summary(kmodel2))[1,1]`
			`pilot.b1 <- coef(summary(kmodel2))[2,1]`
			`pilot.b2 <- coef(summary(kmodel2))[3,1]`
			`pilot.b3 <- coef(summary(kmodel2))[4,1]`
adds my power analysis example 2023-11-08 17:10:54 +00:00

drafted power analysis 2023-11-14 21:44:28 +00:00			`summary(pilot.b3)`

			`qqline(data1$up.fac.mean)`

			`sd(data1$up.fac.mean)`
adds my power analysis example 2023-11-08 17:10:54 +00:00			`# (3) - Set up and run the simulation`
drafted power analysis 2023-11-14 21:44:28 +00:00			`qqline(data1$mmt)`
adds my power analysis example 2023-11-08 17:10:54 +00:00
			`source('powerAnalysis.R') #my little "lib"`

			`#====>`
			`nSims <- 5000 #how many simulations to run`
			`n <- 100 #a guess for necessary sample size (per group)`
			`#makeData(10) #DEBUGGING CODE -- you can uncomment this if you want to see it work`
			`#<====`

working on figuring out how to do this 2023-11-10 17:31:43 +00:00			`#print("Levels are:")`
			`#print(levels(d$source))`
adds my power analysis example 2023-11-08 17:10:54 +00:00			`powerCheck(n, nSims)`
drafted power analysis 2023-11-14 21:44:28 +00:00			`#powerCheck2(n, nSims) like doesn't really work`
adds my power analysis example 2023-11-08 17:10:54 +00:00
			`#Sample values`
working power analysis 2023-11-16 03:31:08 +00:00			`powerCheck(100, 1000)`
model without age, and a few more sim samples 2023-11-15 00:14:31 +00:00			`powerCheck(200, 1000)`
working power analysis 2023-11-16 03:31:08 +00:00			`powerCheck(300, 1000)`
model without age, and a few more sim samples 2023-11-15 00:14:31 +00:00
			`powerCheck2(50, 1000)`
			`powerCheck2(200, 1000)`
			`powerCheck2(500, 1000)`