stats analysis from the short paper

This commit is contained in:
mjgaughan 2023-12-05 09:46:31 -06:00
parent 5216ca767c
commit 5dc0406c82
11 changed files with 572 additions and 27 deletions

512
R/.Rhistory Normal file
View File

@ -0,0 +1,512 @@
# Pilot-data exploration: derive milestone, age and formality variables on
# data1, look at their distributions, then fit one-predictor linear models of
# up.fac.mean.
# NOTE(review): data1, data1$old_milestones, data1$old_mmt, kmodel1 and kmodel2
# are not created in this excerpt -- presumably they come from earlier in the
# session; confirm before re-running.

# Milestone indicator coded 1 (milestones == 0) or 2 (milestones > 0).
data1$new_milestones <- as.numeric(data1$milestones > 0) + 1

# (2) - Run the model on the pilot data
# Original formality score. Rows with old_milestones == 0 divide by zero and
# come out non-finite (Inf or NaN).
data1$formal.score <- data1$mmt / (data1$old_milestones / data1$age)
table(data1$milestones)
table(data1$old_milestones)
table(data1$formal.score)
hist(data1$old_mmt, prob = TRUE) # inequality of participation
hist(data1$formal.score)
data1$new_mmt <- data1$mmt - 1
hist(data1$new_mmt, prob = TRUE)

# Age in years binned into (0,9], (9,12], (12,15], (15,17]; as.numeric() on the
# resulting factor gives the bin index 1-4. Ages outside (0, 17] become NA.
data1$new.age <- as.numeric(
  cut(data1$age / 365, breaks = c(0, 9, 12, 15, 17), labels = c(1, 2, 3, 4))
)
# Augmented formality score from the recoded variables; new_milestones is
# 1 or 2, so this ratio never divides by zero.
data1$new.formal.score <- data1$mmt / (data1$new_milestones / data1$new.age)
hist(as.numeric(data1$new.age))
hist(data1$new.formal.score)

# Fit the original score only on rows where it is finite. Using
# is.finite(formal.score) as the predictor would estimate the effect of a
# TRUE/FALSE indicator, not of the score itself.
data5 <- subset(data1, is.finite(data1$formal.score))
fsmodel1 <- lm(up.fac.mean ~ formal.score, data = data5)
summary(kmodel1)
summary(fsmodel1)
kmodel3 <- lm(up.fac.mean ~ formal.score, data = data5)

fsmodel2 <- lm(up.fac.mean ~ new.formal.score, data = data1)
summary(kmodel2)
summary(fsmodel2)
mmtmodel1 <- lm(up.fac.mean ~ mmt, data = data1)
summary(mmtmodel1)
agemodel1 <- lm(up.fac.mean ~ new.age, data = data1)
summary(agemodel1)
msmodel1 <- lm(up.fac.mean ~ old_milestones, data = data1)
summary(msmodel1)
msmodel2 <- lm(up.fac.mean ~ new_milestones, data = data1)
summary(msmodel2)
# Install and load texreg (regression-table formatting), then print a first
# comparison table of the two formality-score models.
#
# Changes from the recorded attempts:
#   * install.packages() takes quoted package names or file paths; bare
#     symbols such as textref / textreg / texreg are looked up as objects.
#   * import.packages(), import.package(), install.package() and coeff() are
#     not R functions; the bare `yes` was an answer to a prompt, not R code.
#   * The table function is texreg() from the texreg package; textreg is a
#     different package and its textreg() does not take a list of lm fits.
#   * library(texreg) now runs before texreg() is called.
#   * The m1..m4 table call is dropped: m1..m4 are not defined anywhere in
#     this excerpt, and its coefficient labels describe a different analysis.
# NOTE(review): assumes texreg_1.39.3.tar.gz is in the working directory.
install.packages("texreg_1.39.3.tar.gz", repos = NULL, type = "source")
library(texreg)

coef(fsmodel1)
coef(fsmodel2)

# One label per distinct coefficient across both models: the intercept plus
# one formality score per model.
texreg(
  list(fsmodel1, fsmodel2),
  stars = NULL, digits = 2,
  custom.model.names = c(
    "M1: original formality score",
    "M2: augmented formality score"
  ),
  custom.coef.names = c(
    "(Intercept)", "Original formality", "Augmented formality"
  ),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)
# Regression tables comparing the single-predictor models of up.fac.mean.
#
# The recorded attempts iterated on the labels; the failing ones are removed:
# a trailing comma inside c() (an empty argument), six models paired with five
# model names, label vectors whose length did not match the number of
# coefficients, and placeholder "test" labels.

# Refit the original formality model on rows with a finite score before it is
# tabulated, so the table reports the effect of the score itself.
data5 <- subset(data1, is.finite(data1$formal.score))
fsmodel1 <- lm(up.fac.mean ~ formal.score, data = data5)
summary(fsmodel1)

# Five-model table. custom.coef.names has one entry per distinct coefficient,
# listed in the order the coefficients first appear across the models.
texreg(
  list(fsmodel1, fsmodel2, mmtmodel1, msmodel1, agemodel1),
  omit.coef = "factor", stars = NULL, digits = 2,
  custom.model.names = c(
    "M1: orig. formality", "M2: augm. formality", "M3: MMT",
    "M4: milestones", "M5: age (grouped)"
  ),
  custom.coef.names = c(
    "(Intercept)", "Original formality", "Augmented formality",
    "MMT", "Milestones", "Age"
  ),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)

# Four-model table without the original formality score.
texreg(
  list(fsmodel2, mmtmodel1, msmodel1, agemodel1),
  omit.coef = "factor", stars = NULL, digits = 2,
  custom.model.names = c(
    "M1: augm. formality", "M2: MMT", "M3: milestones", "M4: age (grouped)"
  ),
  custom.coef.names = c(
    "(Intercept)", "Augmented formality", "MMT", "Milestones", "Age"
  ),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)
# Power checks at a range of sample sizes. Each call is powerCheck(n, nSims),
# here always with nSims = 1000 except one run at 5000.
# NOTE(review): the first call below runs before powerAnalysis.R is sourced in
# this excerpt; it presumably relied on powerCheck already being defined in
# the session -- confirm.
#Sample values
powerCheck(300, 1000)
# (Re)load the helper script; presumably this is what defines powerCheck.
source('powerAnalysis.R') #my little "lib"
#Sample values
powerCheck(300, 1000)
powerCheck(200, 1000)
powerCheck(250, 1000)
powerCheck(275, 5000)
powerCheck(275, 1000)
# Sourced again, presumably after editing powerAnalysis.R, then re-run.
source('powerAnalysis.R') #my little "lib"
#Sample values
powerCheck(300, 1000)
powerCheck(275, 1000)
powerCheck(500, 1000)
powerCheck(700, 1000)
powerCheck(7000, 1000)
# Refit the models with the binned age group (new.age) as a categorical
# control, tabulate them, then re-run the power checks.
#
# Changes from the recorded attempts:
#   * The as.factor(age) variant of mmtmodel1 is dropped. It made a factor of
#     the raw age values (one level per distinct value) and was immediately
#     replaced by the as.factor(new.age) fit.
#   * The table lists exactly the three models it names; earlier attempts
#     passed four models with three model names.
fsmodel2 <- lm(
  up.fac.mean ~ new.formal.score + as.factor(new.age),
  data = data1
)
summary(fsmodel2)
mmtmodel1 <- lm(up.fac.mean ~ mmt + as.factor(new.age), data = data1)
summary(mmtmodel1)
msmodel1 <- lm(up.fac.mean ~ old_milestones + as.factor(new.age), data = data1)
summary(msmodel1)

# omit.coef = "factor" hides the as.factor(new.age) rows, leaving the
# intercept and one substantive coefficient per model to label.
texreg(
  list(fsmodel2, mmtmodel1, msmodel1),
  omit.coef = "factor", stars = NULL, digits = 2,
  custom.model.names = c("M1: augm. formality", "M2: MMT", "M3: milestones"),
  custom.coef.names = c(
    "(Intercept)", "Augmented formality", "MMT", "Milestones"
  ),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)

source("powerAnalysis.R") # my little "lib"
# Sample values
powerCheck(250, 1000)
powerCheck(275, 1000)
powerCheck(300, 1000)
# Final regression table: augmented formality alone, plus MMT and milestones
# each controlling for age group.
#
# The models are refit first so the table always reflects these formulas.
mmtmodel1 <- lm(up.fac.mean ~ mmt + as.factor(new.age), data = data1)
msmodel1 <- lm(up.fac.mean ~ old_milestones + as.factor(new.age), data = data1)
fsmodel2 <- lm(up.fac.mean ~ new.formal.score, data = data1)
summary(msmodel1)

# custom.coef.names is applied by position, so it must follow the order in
# which coefficients first appear across the models: intercept,
# new.formal.score (M1), mmt and the three age-group dummies (M2), then
# old_milestones (M3). Earlier attempts listed "Milestones" before the age
# labels, which attaches the wrong label to each of those rows, or supplied
# only four labels for seven coefficients.
texreg(
  list(fsmodel2, mmtmodel1, msmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("M1: augm. formality", "M2: MMT", "M3: milestones"),
  custom.coef.names = c(
    "(Intercept)", "Augmented formality", "MMT",
    "Age-2", "Age-3", "Age-4", "Milestones"
  ),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)
# Scatter plot of the underproduction factor against MMT, coloured by age
# group, with a smoothed trend.
#
# Changes from the recorded attempts:
#   * library(ggplot2) is loaded before ggplot() is first called.
#   * scale_color_brewer() and scale_color_viridis_d() are discrete scales and
#     are dropped: new.age is numeric here, so only a continuous or binned
#     colour scale (scale_color_viridis_b) applies.
library(ggplot2)

msmodel1 <- lm(up.fac.mean ~ old_milestones + as.factor(new.age), data = data1)
summary(msmodel1)

g <- ggplot(data1, aes(x = mmt, y = up.fac.mean, color = new.age)) +
  geom_point() +
  geom_smooth() +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()
g
g + scale_color_viridis_b()
# Same MMT vs. underproduction plot, now giving the colour legend a title.
#
# Changes from the recorded attempts:
#   * Two scale calls written side by side without `+` are a syntax error.
#   * scale_fill_*() calls are dropped: the plot maps `color`, not `fill`, so
#     a fill scale cannot rename or relabel this legend.
#   * Adding scale_color_continuous() after scale_color_viridis_b() replaces
#     the viridis scale; the title is set on the viridis scale instead.
# NOTE(review): the explicit labels = c('1', '2', '3', '4') variant is left
# out because the number of breaks on the binned scale is not visible here.
g <- ggplot(data1, aes(x = mmt, y = up.fac.mean, color = new.age)) +
  geom_point() +
  geom_smooth() +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()
g
g + scale_color_viridis_b(name = "Age Group")
# MMT vs. underproduction plot with a titled, labelled viridis colour legend
# placed inside the panel.
#
# Changes from the recorded attempts:
#   * scale_fill_*() calls are dropped; the plot maps `color`, not `fill`.
#   * The "Woman"/"Man" labels were leftovers from another example and do not
#     describe the age groups.
#   * labels must be one vector: labels = c(...). Written as
#     labels = "0-9y", "9-12y", ... only the first string is the labels
#     argument and the rest become stray positional arguments.
#   * theme(legend.position = ...) is joined to the plot with `+` at the end
#     of the previous line; a line that starts with `+` is a separate
#     statement and is not added to g.
# NOTE(review): four labels assume the continuous scale draws four breaks
# (one per age-group code 1-4); confirm on the rendered legend.
g <- ggplot(data1, aes(x = mmt, y = up.fac.mean, color = new.age)) +
  geom_point() +
  geom_smooth() +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  scale_color_continuous(
    type = "viridis", name = "Age Group",
    labels = c("0-9y", "9-12y", "12-15y", "15-17y")
  ) +
  theme(legend.position = c(0.8, 0.2))
g
# Final plot iterations: (1) continuous age-group colouring with the legend in
# the lower-left of the panel, (2) a discrete per-age-group version, and
# (3) the plain MMT vs. underproduction plot that g is left holding.
#
# Changes from the recorded attempts:
#   * A continuous colour scale cannot be applied to as.factor(new.age); the
#     discrete version uses scale_color_viridis_d() instead.
#   * type = "viridis" is not a palette selector for scale_color_discrete(),
#     and scale_color_identity() treats the mapped values as literal colours,
#     which the numeric age-group codes are not.

# (1) Continuous colouring, legend at the lower left.
g <- ggplot(data1, aes(x = mmt, y = up.fac.mean, color = new.age)) +
  geom_point() +
  geom_smooth() +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  scale_color_continuous(
    type = "viridis", name = "Age Group",
    labels = c("0-9y", "9-12y", "12-15y", "15-17y")
  ) +
  theme(legend.position = c(0.1, 0.2))
g
summary(mmtmodel1)

# (2) Discrete colouring. The labels are attached when the factor is built,
# so the legend is correct even if an age group has no rows. Because the
# factor is in the plot-level mapping, geom_smooth() fits one curve per group.
g <- ggplot(
  data1,
  aes(
    x = mmt, y = up.fac.mean,
    color = factor(
      new.age,
      levels = 1:4,
      labels = c("0-9y", "9-12y", "12-15y", "15-17y")
    )
  )
) +
  geom_point() +
  geom_smooth() +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  scale_color_viridis_d(name = "Age Group") +
  theme(legend.position = c(0.1, 0.2))
g

# (3) Plain plot without age colouring.
g <- ggplot(data1, aes(x = mmt, y = up.fac.mean)) +
  geom_point() +
  geom_smooth() +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()
g

View File

@ -1,7 +1,7 @@
library(dplyr) library(dplyr)
df<-read.csv('~/Research/kkex_repo/power_data_111023_mmt.csv') df<-read.csv('../power_data_111023_mmt.csv')
df1 <- read.csv('~/Research/kkex_repo/inst_all_packages_full_results.csv') df1 <- read.csv('../inst_all_packages_full_results.csv')
hist(df$age/365) #there's a big bump at 9 years, why? hist(df$age/365) #there's a big bump at 9 years, why?
hist(df$contributors) #skewed hist(df$contributors) #skewed

View File

@ -20,6 +20,7 @@ library(ggplot2)
# (1) - Get the pilot data and clean it # (1) - Get the pilot data and clean it
#source('~/Research/tor_wikipedia_edits/handcoded_edits/inter_coder_reliability_ns0.R') #source('~/Research/tor_wikipedia_edits/handcoded_edits/inter_coder_reliability_ns0.R')
#source ('/data/users/mgaughan/kkex_data_110823_3') #source ('/data/users/mgaughan/kkex_data_110823_3')
data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE) data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE)
data2 <- read_csv('../inst_all_packages_full_results.csv') data2 <- read_csv('../inst_all_packages_full_results.csv')
#d$nd <- to_logical(d$not.damaging, custom_true=c("Y")) #d$nd <- to_logical(d$not.damaging, custom_true=c("Y"))
@ -27,45 +28,62 @@ data2 <- read_csv('../inst_all_packages_full_results.csv')
python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)]) python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)])
same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)]) same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)])
data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE) data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE)
data1$milestones <- as.numeric(data1$milestones > 0) + 1 data1$old_milestones <- data1$milestones
data1$new_milestones <- as.numeric(data1$milestones > 0) + 1
# (2) - Run the model on the pilot data # (2) - Run the model on the pilot data
data1$formal.score <- data1$mmt / (data1$milestones/data1$age) data1$formal.score <- data1$mmt / (data1$old_milestones/data1$age)
table(data1$milestones) table(data1$formal.score)
hist(data1$old_mmt, prob=TRUE) #inequality of participation hist(data1$old_mmt, prob=TRUE) #inequality of participation
hist(data1$formal.score) hist(data1$formal.score)
hist(data1$age/365) hist(data1$age/365)
data1$new_mmt <- data1$mmt - 1 data1$new_mmt <- data1$mmt - 1
hist(data1$new_mmt, prob=TRUE) hist(data1$new_mmt, prob=TRUE)
data3 <- subset(data1, data1$old_milestones > 0 )
data3$formal.score <- data3$mmt / (data3$old_milestones/data3$age)
data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
data1$formal.score <- data1$mmt / (data1$milestones/data1$new.age) data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age)
hist(as.numeric(data1$new.age)) hist(as.numeric(data1$new.age))
table(data1$formal.score) table(data1$new.age)
hist(data1$formal.score) hist(data1$new.formal.score)
kmodel1 <- lm(up.fac.mean ~ mmt, data=data1) data5 <- subset(data1, is.finite(data1$formal.score))
summary(kmodel1) mmtmodel1 <- lm(up.fac.mean ~ mmt + as.factor(new.age), data=data1)
kmodel1 <- lm(up.fac.mean ~ new_mmt, data=data1) summary(mmtmodel1)
summary(kmodel1) agemodel1 <- lm(up.fac.mean ~ new.age, data=data1)
kmodel1 <- lm(up.fac.mean ~ new.age, data=data1) summary(agemodel1)
summary(kmodel1) msmodel1 <- lm(up.fac.mean ~ old_milestones + as.factor(new.age), data=data1)
kmodel1 <- lm(up.fac.mean ~ milestones, data=data1) summary(msmodel1)
summary(kmodel1) msmodel2 <- lm(up.fac.mean ~ new_milestones, data=data1)
summary(msmodel2)
fsmodel1 <- lm(up.fac.mean ~ formal.score, data=data5)
summary(fsmodel1)
t.test(data3$formal.score)
fsmodel2 <- lm(up.fac.mean ~ new.formal.score, data=data1)
summary(fsmodel2)
hist(data1$formal.score) hist(data1$formal.score)
cor.test(data1$formal.score, data1$up.fac.mean) cor.test(data1$formal.score, data1$up.fac.mean)
cor.test(data1$mmt, data1$up.fac.mean) cor.test(data1$mmt, data1$up.fac.mean)
cor.test(data1$milestones, data1$up.fac.mean) cor.test(data1$milestones, data1$up.fac.mean)
cor.test(data1$age, data1$up.fac.mean) cor.test(data1$age, data1$up.fac.mean)
g <- ggplot(data1, aes(x=new_mmt, y=up.fac.mean)) + g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
geom_point() + geom_point() +
geom_smooth() geom_smooth() +
xlab("MMT") +
ylab("Underproduction Factor") +
theme_bw()
g
g g
data2 <- subset(data1, (data1$age / 365) < 14 ) data2 <- subset(data1, (data1$age / 365) < 14 )
hist(floor(data2$age)) hist(floor(data2$age))
g <- ggplot(data2, aes(x=mmt, y=up.fac.mean)) + g <- ggplot(data2, aes(x=mmt, y=up.fac.mean)) +
geom_point() + geom_point() +
geom_smooth() geom_smooth() +
xlab("MMT") +
ylab("Underproduction Factor") +
theme_bw()
g g
data2$yearsOld <- floor(data2$age / 365) data2$yearsOld <- floor(data2$age / 365)
@ -105,16 +123,22 @@ n <- 100 #a guess for necessary sample size (per group)
#makeData(10) #DEBUGGING CODE -- you can uncomment this if you want to see it work #makeData(10) #DEBUGGING CODE -- you can uncomment this if you want to see it work
#<==== #<====
texreg(list(fsmodel2, mmtmodel1, msmodel1), stars=NULL, digits=2,
custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ),
custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'),
use.packages=FALSE, table=FALSE, ci.force = TRUE)
coef(fsmodel1)
#print("Levels are:") #print("Levels are:")
#print(levels(d$source)) #print(levels(d$source))
powerCheck(n, nSims) powerCheck(n, nSims)
#powerCheck2(n, nSims) like doesn't really work #powerCheck2(n, nSims) like doesn't really work
#Sample values #Sample values
powerCheck(100, 1000)
powerCheck(200, 1000)
powerCheck(300, 1000) powerCheck(300, 1000)
powerCheck(275, 1000)
powerCheck(7000, 1000)
powerCheck2(50, 1000) powerCheck2(50, 1000)
powerCheck2(200, 1000) powerCheck2(75, 1000)
powerCheck2(500, 1000) powerCheck2(900, 1000)

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

BIN
R/mess-mmt-ggplot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 107 KiB

BIN
R/mmt-underprod-ggplot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

BIN
R/newmmt-underprod-plot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

View File

@ -37,7 +37,8 @@ makeDataNew2 <- function(n) {
tDF <- data.frame( tDF <- data.frame(
## don't sim the outcome ## don't sim the outcome
#up.fac.mean=rnorm(n=n, mean=-0.1296376, sd=1.479847), # up.fac.mean #up.fac.mean=rnorm(n=n, mean=-0.1296376, sd=1.479847), # up.fac.mean
formal.score=rlnorm(n=n, mean=6.220282, sd = 2.544058) # formal.score #formal.score=rlnorm(n=n, mean=6.220282, sd = 2.544058) # formal.score
formal.score=rbeta(n=n, 1, 3) * 10000
) )
tDF[is.na(tDF) | tDF=="Inf"] = NA tDF[is.na(tDF) | tDF=="Inf"] = NA
#sDF <- melt(tDF, id.vars = 0) #AKA the index is the unique id, as far as that goes #sDF <- melt(tDF, id.vars = 0) #AKA the index is the unique id, as far as that goes
@ -58,7 +59,7 @@ powerCheck <- function(n, nSims) { #run a power calculation on the dataset given
## outcome goes here --v ## outcome goes here --v
# e.g. simData$up.fac.mean <- (usefuleffsizeA * mmt) + (usefuleffsizeB * milestones) + rnorm(n=1, mean=0, sd=1) ##plus some noise # e.g. simData$up.fac.mean <- (usefuleffsizeA * mmt) + (usefuleffsizeB * milestones) + rnorm(n=1, mean=0, sd=1) ##plus some noise
#simData$up.fac.mean <- (-2.075 * simData$mmt) + (0.4284 * simData$milestones) + rnorm(n=1, mean=0, sd=1) #simData$up.fac.mean <- (-2.075 * simData$mmt) + (0.4284 * simData$milestones) + rnorm(n=1, mean=0, sd=1)
simData$up.fac.mean <- (-2.0745 * simData$new_mmt) + (0.4284 * simData$milestones) + rnorm(n=n, mean=0, sd=1) simData$up.fac.mean <- (-1.38 * simData$new_mmt) + (0.40 * simData$milestones) + rnorm(n=n, mean=0, sd=1)
#have updated for kkex through here, now need to look at the underproduction work #have updated for kkex through here, now need to look at the underproduction work
#m1.sim <- lm(up.fac.mean ~ ((mmt)/ (milestones/age)), data=simData) #m1.sim <- lm(up.fac.mean ~ ((mmt)/ (milestones/age)), data=simData)
## could leave age out for now? ## could leave age out for now?
@ -88,7 +89,7 @@ powerCheck2 <- function(n, nSims) { #run a power calculation on the dataset give
#have updated for kkex through here, now need to look at the underproduction work #have updated for kkex through here, now need to look at the underproduction work
#m1.sim <- lm(up.fac.mean ~ ((mmt)/ (milestones/age)), data=simData) #m1.sim <- lm(up.fac.mean ~ ((mmt)/ (milestones/age)), data=simData)
## outcome goes here --v ## outcome goes here --v
simData$up.fac.mean <- (0.5 * simData$formal.score) + rnorm(1, mean=0, sd=1) ##plus some noise simData$up.fac.mean <- (0.00017 * simData$formal.score) + rnorm(n, mean=0, sd=1) ##plus some noise
m1.sim <- lm(up.fac.mean ~ formal.score, data=simData) m1.sim <- lm(up.fac.mean ~ formal.score, data=simData)
p0 <- coef(summary(m1.sim))[1,4] p0 <- coef(summary(m1.sim))[1,4]
p1 <- coef(summary(m1.sim))[2,4] p1 <- coef(summary(m1.sim))[2,4]

BIN
R/texreg_1.39.3.tar.gz Normal file

Binary file not shown.

BIN
R/textreg_0.1.5.tar.gz Normal file

Binary file not shown.

8
kkex-github-api-key.rtf Normal file
View File

@ -0,0 +1,8 @@
{\rtf1\ansi\ansicpg1252\cocoartf2708
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
{\*\expandedcolortbl;;}
\margl1440\margr1440\vieww11520\viewh8400\viewkind0
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
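\f0\fs24 \cf0 [REDACTED: GitHub personal access token removed from version control. Revoke this token, purge it from the repository history, and supply the replacement through an environment variable or an untracked secrets file.]}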