2023-11-08 17:10:54 +00:00
|
|
|
##############################################################################
|
|
|
|
#
|
|
|
|
# Purpose:
|
|
|
|
# Use pilot project data to calculate power of a full study through simulation
|
|
|
|
#
|
|
|
|
# Parts:
|
|
|
|
# (0) - Setup
|
|
|
|
# (1) - Get the pilot data and clean it
|
|
|
|
# (2) - Run the model on the pilot data and extract effects
|
|
|
|
# (3) - Set up and run the simulation
|
|
|
|
# ====> Set variables at the arrows <====
|
|
|
|
#
|
|
|
|
##############################################################################
|
|
|
|
# (0) - Setup
# Remove every object from the global workspace before anything else runs.
rm(list = ls())
# Fix the random-number seed used by the rest of the script.
set.seed(424242)
|
|
|
|
|
2023-11-09 16:45:16 +00:00
|
|
|
library(readr)
|
2023-11-10 19:22:58 +00:00
|
|
|
library(ggplot2)
|
2023-11-09 16:45:16 +00:00
|
|
|
|
2023-11-08 17:10:54 +00:00
|
|
|
# (1) - Get the pilot data and clean it

# Earlier data sources, kept commented out for reference.
#source('~/Research/tor_wikipedia_edits/handcoded_edits/inter_coder_reliability_ns0.R')
#source ('/data/users/mgaughan/kkex_data_110823_3')

# data1: pilot sample; data2: lookup table that supplies up.fac.mean per pkg.
data1 <- read_csv("../power_data_111023_mmt.csv", show_col_types = FALSE)
data2 <- read_csv("../inst_all_packages_full_results.csv")

#d$nd <- to_logical(d$not.damaging, custom_true=c("Y"))
#levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors")

# Look up each pilot package in data2 under two spellings of its name:
# "python-<lowercased pkg>" and the bare lowercased pkg.
python_labeled <- as.numeric(
  data2$up.fac.mean[
    match(paste("python", tolower(data1$pkg), sep = "-"), data2$pkg)
  ]
)
same_labeled <- as.numeric(
  data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)]
)

# Element-wise minimum of the two lookups; with na.rm = TRUE the result is NA
# only for packages that matched under neither spelling.
data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm = TRUE)

# Keep the raw milestone count and add a two-level recode:
# 1 = no milestones, 2 = at least one milestone.
data1$old_milestones <- data1$milestones
data1$new_milestones <- ifelse(data1$milestones > 0, 2, 1)
|
2023-11-08 17:10:54 +00:00
|
|
|
# (2) - Run the model on the pilot data

# Formality score: mmt divided by the milestone rate (milestones / age).
# Rows with zero milestones divide by zero, so the score is Inf (or NaN).
data1$formal.score <- data1$mmt / (data1$old_milestones / data1$age)
table(data1$formal.score)

# NOTE(review): old_mmt is never created in this script -- presumably it is a
# column of the pilot CSV; confirm, otherwise this should probably be mmt.
hist(data1$old_mmt, prob = TRUE) #inequality of participation

# Descriptive medians for the pilot sample (age / 365 presumably converts
# days to years -- TODO confirm the unit of age).
median(data1$contributors)
median(data1$collaborators)
median(data1$age / 365)

# mmt shifted down by one.
data1$new_mmt <- data1$mmt - 1
hist(data1$new_mmt, prob = TRUE)

# data3: only the projects that have at least one milestone.
data3 <- subset(data1, old_milestones > 0)
data3$formal.score <- data3$mmt / (data3$old_milestones / data3$age)

# Bin age / 365 into four groups (0-9], (9-12], (12-15], (15-17], coded 1..4;
# values outside (0, 17] become NA.
data1$new.age <- as.numeric(
  cut(data1$age / 365, breaks = c(0, 9, 12, 15, 17), labels = FALSE)
)

# "Augmented" formality score built from the recoded milestones and age group,
# which never divides by zero because new_milestones is 1 or 2.
data1$new.formal.score <- data1$mmt / (data1$new_milestones / data1$new.age)

hist(as.numeric(data1$new.age))
table(data1$new.age)
hist(data1$new.formal.score)

# data5: rows whose original formality score is finite (drops Inf/NaN/NA).
data5 <- subset(data1, is.finite(formal.score))
|
|
|
|
# Pilot regressions of the underproduction factor (up.fac.mean) on each
# candidate predictor; every fit is followed by its summary table.

# MMT, controlling for age group as a categorical variable.
mmtmodel1 <- lm(
  up.fac.mean ~ mmt + as.factor(new.age),
  data = data1
)
summary(mmtmodel1)

# Age group alone, entered as a numeric code.
agemodel1 <- lm(
  up.fac.mean ~ new.age,
  data = data1
)
summary(agemodel1)

# Raw milestone count, controlling for age group.
msmodel1 <- lm(
  up.fac.mean ~ old_milestones + as.factor(new.age),
  data = data1
)
summary(msmodel1)

# Two-level milestone recode alone.
msmodel2 <- lm(
  up.fac.mean ~ new_milestones,
  data = data1
)
summary(msmodel2)

# Original formality score, fitted on the finite-score subset (data5).
fsmodel1 <- lm(
  up.fac.mean ~ formal.score,
  data = data5
)
summary(fsmodel1)

# One-sample t-test of the formality score among projects with milestones.
t.test(data3$formal.score)

# Augmented formality score on the full pilot sample.
fsmodel2 <- lm(
  up.fac.mean ~ new.formal.score,
  data = data1
)
summary(fsmodel2)
|
2023-11-10 19:22:58 +00:00
|
|
|
# Distribution of the original formality score (hist() drops non-finite
# values on its own).
hist(data1$formal.score)

# Pairwise Pearson correlations between each predictor and the outcome.
# formal.score is Inf wherever old_milestones is 0, and a single Inf turns the
# correlation into NaN, so this test uses data5 (the finite-score rows of
# data1) -- the same subset fsmodel1 is fitted on.
cor.test(data5$formal.score, data5$up.fac.mean)
cor.test(data1$mmt, data1$up.fac.mean)
cor.test(data1$milestones, data1$up.fac.mean)
cor.test(data1$age, data1$up.fac.mean)
|
|
|
|
|
2024-01-28 22:26:41 +00:00
|
|
|
# Age group as a factor column.
data1$new.age.factor <- as.factor(data1$new.age)

#geom_abline(intercept=coef(mmtmodel1)[1], slope=coef(mmtmodel1)[2], colour = "orange")+

# Scatter of underproduction factor against MMT, overlaid with a line built
# from the first two coefficients of mmtmodel1 (intercept and mmt slope).
g <- ggplot(data1, aes(x = mmt, y = up.fac.mean)) +
  geom_point() +
  #geom_smooth( method="lm", formula=(y~x), colour = "orange")+
  geom_abline(
    intercept = coef(mmtmodel1)[1],
    slope = coef(mmtmodel1)[2],
    colour = "orange",
    size = 1
  ) +
  # Disabled: this layer refers to `y` and `yerr`, neither of which is created
  # anywhere in this script, so rendering the plot fails with
  # "object 'y' not found". Re-enable once an error column exists.
  #geom_errorbar(aes(ymin=y-yerr, ymax=y+yerr), width=0.09)+
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()

# Render the plot (the original printed it twice in a row; once is enough).
g
|
|
|
|
|
2024-01-28 22:26:41 +00:00
|
|
|
# One colour per age group; the keys "a".."d" are mapped to readable legend
# labels in scale_colour_manual() below.
colors_legend <- c(
  "a" = "#E69F00",
  "b" = "#56B4E9",
  "c" = "#D55E00",
  "d" = "#CC79A7"
)
#colors_legend <- c("0-9y"="red","9-12y"="green", "12-15y"="blue","15-16y"="orange")

# Scatter of underproduction factor against MMT with four parallel lines that
# share one hard-coded slope (-1.38) and differ only in their hard-coded
# intercept, one per age group.
# NOTE(review): the intercepts/slope look like rounded mmtmodel1 estimates --
# confirm against summary(mmtmodel1) if the model is refitted.
g <- ggplot(data1, aes(x = mmt, y = up.fac.mean)) +
  geom_point() +
  geom_abline(aes(intercept = 1.65, slope = -1.38, color = "a"), size = 1.5) +
  geom_abline(aes(intercept = 1.72, slope = -1.38, color = "b"), size = 1.5) +
  geom_abline(aes(intercept = 2.25, slope = -1.38, color = "c"), size = 1.5) +
  geom_abline(aes(intercept = 2.8, slope = -1.38, color = "d"), size = 1.5) +
  labs(
    x = "MMT",
    y = "Mean Underproduction Factor",
    color = "Project Age Group"
  ) +
  scale_colour_manual(
    values = colors_legend,
    labels = c("0-9y", "9-12y", "12-15y", "15-16y")
  ) +
  theme_bw() +
  # Legend anchored by its bottom-left corner near the bottom-left of the panel.
  theme(
    legend.position = c(0.05, 0.05),
    legend.justification = c("left", "bottom")
  )
g
|
|
|
|
|
|
|
|
|
2023-11-13 16:52:40 +00:00
|
|
|
# Restrict to projects with age / 365 below 14.
# NOTE: this overwrites data2, which until here held the lookup table read
# from inst_all_packages_full_results.csv.
data2 <- subset(data1, (age / 365) < 14)
hist(floor(data2$age))

# Same scatter as for the full sample, with ggplot2's default smoother.
g <- ggplot(data2, aes(x = mmt, y = up.fac.mean)) +
  geom_point() +
  geom_smooth() +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw()
g

# Whole-number value of age / 365 for each remaining project.
data2$yearsOld <- floor(data2$age / 365)
|
2023-11-10 19:22:58 +00:00
|
|
|
|
2023-11-14 21:44:28 +00:00
|
|
|
# Candidate pilot models on the raw predictors. kmodel2 (mmt + milestones +
# age) is the one whose coefficients are extracted below.
kmodel2 <- lm(up.fac.mean ~ mmt + milestones + age, data = data1)
kmodel5 <- lm(up.fac.mean ~ mmt + milestones, data = data1)
kmodel4 <- lm(up.fac.mean ~ mmt + age, data = data1)
# formal.score is Inf wherever old_milestones is 0 and lm() stops with
# "NA/NaN/Inf in 'x'" on such rows, so this model is fitted on data5 (the
# finite-score rows of data1), exactly like fsmodel1.
kmodel3 <- lm(up.fac.mean ~ formal.score, data = data5)

summary(kmodel2)
summary(kmodel3)
summary(kmodel4)
summary(kmodel5)

#pilotM <- glm(up.fac.mean ~ ((mmt) / (milestones/age)), # give the anticipated regression a try
#               family=gaussian(link='identity'), data=data1)
# Disabled together with the glm() above: pilotM is never assigned, so the
# call stopped the script with "object 'pilotM' not found". The effect sizes
# we expect are the kmodel2 estimates already printed by summary(kmodel2).
#summary(pilotM) #we expect effect sizes on this order

# Point estimates from kmodel2, by row of its coefficient table:
# intercept, then the mmt, milestones and age terms in formula order.
pilot.b0 <- coef(summary(kmodel2))[1, 1]
pilot.b1 <- coef(summary(kmodel2))[2, 1]
pilot.b2 <- coef(summary(kmodel2))[3, 1]
pilot.b3 <- coef(summary(kmodel2))[4, 1]

summary(pilot.b3)

# Normal Q-Q plot of the outcome. qqline() only adds a reference line to an
# existing Q-Q plot, so qqnorm() has to draw the plot first.
qqnorm(data1$up.fac.mean)
qqline(data1$up.fac.mean)

# Spread of the outcome in the pilot sample.
sd(data1$up.fac.mean)
|
2023-11-08 17:10:54 +00:00
|
|
|
# (3) - Set up and run the simulation

# Normal Q-Q plot of mmt. qqline() only adds a reference line to an existing
# Q-Q plot, so qqnorm() has to draw the plot first.
qqnorm(data1$mmt)
qqline(data1$mmt)

# Helper functions for the simulation; powerCheck(), powerCheck2() and
# makeData() are not defined in this file and presumably come from here.
source('powerAnalysis.R') #my little "lib"

#====>
nSims <- 5000 #how many simulations to run
n <- 100 #a guess for necessary sample size (per group)
#makeData(10) #DEBUGGING CODE -- you can uncomment this if you want to see it work
#<====

# Regression table for the three reported models.
# NOTE(review): texreg is not attached by a library() call in this file --
# presumably powerAnalysis.R loads it; confirm.
# The seven custom names cover the union of the three models' coefficients:
# intercept, augmented formality, MMT, three age-group contrasts, milestones.
texreg(
  list(fsmodel2, mmtmodel1, msmodel1),
  stars = NULL,
  digits = 2,
  custom.model.names = c('M1: augm. formality', 'M2: MMT', 'M3: milestones'),
  custom.coef.names = c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'),
  use.packages = FALSE,
  table = FALSE,
  ci.force = TRUE
)

coef(fsmodel1)

#print("Levels are:")
#print(levels(d$source))

# Power at the guessed sample size with the full number of simulations.
powerCheck(n, nSims)
#powerCheck2(n, nSims) like doesn't really work

#Sample values
# Each call passes a sample size followed by a number of simulations,
# mirroring powerCheck(n, nSims) above.
powerCheck(300, 1000)
powerCheck(275, 1000)
powerCheck(7000, 1000)

powerCheck2(50, 1000)
powerCheck2(75, 1000)
powerCheck2(900, 1000)
|