diff --git a/R/.Rhistory b/R/.Rhistory new file mode 100644 index 0000000..2fdc808 --- /dev/null +++ b/R/.Rhistory @@ -0,0 +1,512 @@ +data1$new_milestones <- as.numeric(data1$milestones > 0) + 1 +# (2) - Run the model on the pilot data +data1$formal.score <- data1$mmt / (data1$old_milestones/data1$age) +table(data1$milestones) +table(data1$old_milestones) +hist(data1$old_mmt, prob=TRUE) #inequality of participation +hist(data1$formal.score) +data1$new_mmt <- data1$mmt - 1 +hist(data1$new_mmt, prob=TRUE) +data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) +data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age) +hist(as.numeric(data1$new.age)) +hist(data1$formal.score) +hist(data1$new.formal.score) +fsmodel1 <- lm(up.fac.mean ~ formal.score, data=data1) +summary(kmodel1) +summary(fsmodel1) +kmodel3 <- lm(up.fac.mean ~ formal.score, data=data1) +# (2) - Run the model on the pilot data +data1$formal.score <- data1$mmt / (data1$old_milestones/data1$age) +table(data1$formal.score) +fsmodel1 <- lm(up.fac.mean ~ is.finite(formal.score), data=data1) +summary(fsmodel1) +fsmodel2 <- lm(up.fac.mean ~ new.formal.score, data=data1) +summary(kmodel2) +summary(fsmodel2) +mmtmodel1 <- lm(up.fac.mean ~ mmt, data=data1) +summary(mmtmodel1) +agemodel1 <- lm(up.fac.mean ~ new.age, data=data1) +summary(agemodel1) +msmodel1 <- lm(up.fac.mean ~ old_milestones, data=data1) +summary(msmodel1) +msmodel2 <- lm(up.fac.mean ~ new_milestones, data=data1) +summary(msmodel2) +texreg(list(m1,m2,m3,m4), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: no lang/network measures', 'M2: No language measures', 'M3: No network measures', 'M4: Full model'), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'Uploader Count', 'Did maintainer change?', 'Team proportion', 'Eigenvector Centrality', 'Betweenness Centrality', 'Mean Language Age', 'Package Age : Mean Language Age'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +install.packages(textref) +install.packages(textreg) +library(textreg) +install.packages("textreg_0.1.tar.gz", repos = NULL, type="source") +install.packages("textreg_0.1.tar.gz", repos = NULL, type="source") +install.packages("textreg_0.1.5.tar.gz", repos = NULL, type="source") +import.packages(tm) +import.package(tm) +import.package("tm") +install.package("tm") +install.packages("tm") +install.packages("nlp") +yes +install.packages("textreg_0.1.5.tar.gz", repos = NULL, type="source") +library(textreg) +texreg(list(m1,m2,m3,m4), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: no lang/network measures', 'M2: No language measures', 'M3: No network measures', 'M4: Full model'), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'Uploader Count', 'Did maintainer change?', 'Team proportion', 'Eigenvector Centrality', 'Betweenness Centrality', 'Mean Language Age', 'Package Age : Mean Language Age'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +textreg(list(m1,m2,m3,m4), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: no lang/network measures', 'M2: No language measures', 'M3: No network measures', 'M4: Full model'), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'Uploader Count', 'Did maintainer change?', 'Team proportion', 'Eigenvector Centrality', 'Betweenness Centrality', 'Mean Language Age', 'Package Age : Mean Language Age'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +textreg(list(fsmodel1,fsmodel2), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score'), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'Uploader Count', 'Did maintainer change?', 'Team proportion', 'Eigenvector Centrality', 'Betweenness Centrality', 'Mean Language Age', 'Package Age : Mean Language Age'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +coeff(fsmodel1) +coef(fsmodel1) +textreg(list(fsmodel1,fsmodel2), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score'), +custom.coef.names=c('(Intercept)', 'Package Age (years)'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +coef(fsmodel2) +textreg(list(fsmodel1,fsmodel2), stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score'), +custom.coef.names=c('(Intercept)', 'Package Age (years)'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +textreg(list(fsmodel1,fsmodel2), stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score'), +custom.coef.names=c('(Intercept)', 'Package Age (years)'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2), stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score'), +custom.coef.names=c('(Intercept)', 'Package Age (years)'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +install.packages(texreg) +install.packages("texreg_1.39.3.tar.gz", repos = NULL, type="source") +library(texreg) +texreg(list(fsmodel1,fsmodel2), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score'), +custom.coef.names=c('(Intercept)', 'Package Age (years)'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score'), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'test'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, msmodel2, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score', 'M3: original milestones', 'M4: binomial milestones', 'M5: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'test'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, msmodel2, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score', 'M3: original milestones', 'M4: binomial milestones', 'M5: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'test', 'test', 'test', 'test', 'test', ), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, msmodel2, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score', 'M3: original milestones', 'M4: binomial milestones', 'M5: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'test', 'test', 'test', 'test', 'test' ), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, msmodel2, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score','M3: MMT', 'M4: original milestones', 'M5: binomial milestones', 'M6: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'test', 'test', 'test', 'test', 'test' ), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, msmodel2, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score','M3: MMT', 'M4: original milestones', 'M5: binomial milestones', 'M6: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Package Age (years)', 'Relationship to Underproduction', 'test', 'test', 'test', 'test' ), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score','M3: MMT', 'M4: original milestones', 'M6: age (grouped)' ), +custom.coef.names=c('Original formality relationship to underproduction', 'Augmented formality relationship to Underproduction', 'MMT relationship to underproduction', 'milestone usage relationship to underproduction', 'age group relationship to underproduction' ), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score','M3: MMT', 'M4: original milestones', 'M6: age (grouped)' ), +custom.coef.names=c('Original formality relationship to underproduction', 'Augmented formality relationship to Underproduction', 'MMT relationship to underproduction', 'milestone usage relationship to underproduction', 'age group relationship to underproduction', 'test' ), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score','M3: MMT', 'M4: original milestones', 'M6: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Original formality relationship to underproduction', 'Augmented formality relationship to Underproduction', 'MMT relationship to underproduction', 'Milestone usage relationship to underproduction', 'Age group relationship to underproduction'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +data5 <- subset(data1, is.finite(data1$formal.score)) +fsmodel1 <- lm(up.fac.mean ~ formal.score, data=data5) +summary(fsmodel1) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: original formality score', 'M2: augmented formality score','M3: MMT', 'M4: original milestones', 'M6: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Original formality relationship to underproduction', 'Augmented formality relationship to Underproduction', 'MMT relationship to underproduction', 'Milestone usage relationship to underproduction', 'Age group relationship to underproduction'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: orig. formality', 'M2: augm. formality','M3: MMT', 'M4: milestones', 'M6: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Original formality/underproduction', 'Augmented formality/Underproduction', 'MMT/underproduction', 'Milestones / underproduction', 'Age/underproduction'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel1,fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c('M1: orig. formality', 'M2: augm. formality','M3: MMT', 'M4: milestones', 'M5: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Original formality', 'Augmented formality', 'MMT', 'Milestones', 'Age'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones', 'M4: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Original formality', 'Augmented formality', 'MMT', 'Milestones', 'Age'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones', 'M4: age (grouped)' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Milestones', 'Age'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +#Sample values +powerCheck(300, 1000) +source('powerAnalysis.R') #my little "lib" +#Sample values +powerCheck(300, 1000) +powerCheck(200, 1000) +powerCheck(250, 1000) +powerCheck(275, 5000) +powerCheck(275, 1000) +source('powerAnalysis.R') #my little "lib" +#Sample values +powerCheck(300, 1000) +powerCheck(275, 1000) +powerCheck(500, 1000) +powerCheck(700, 1000) +powerCheck(7000, 1000) +fsmodel2 <- lm(up.fac.mean ~ new.formal.score + as.factor(new.age), data=data1) +summary(fsmodel2) +mmtmodel1 <- lm(up.fac.mean ~ mmt + as.factor(new.age), data=data1) +summary(mmtmodel1) +mmtmodel1 <- lm(up.fac.mean ~ mmt + as.factor(age), data=data1) +summary(mmtmodel1) +mmtmodel1 <- lm(up.fac.mean ~ mmt + as.factor(new.age), data=data1) +summary(mmtmodel1) +msmodel1 <- lm(up.fac.mean ~ old_milestones + as.factor(new.age), data=data1) +summary(msmodel1) +fsmodel2 <- lm(up.fac.mean ~ new.formal.score + as.factor(new.age), data=data1) +summary(fsmodel2) +texreg(list(fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Milestones'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel2, mmtmodel1, msmodel1, agemodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Milestones', 'test'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel2, mmtmodel1, msmodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Milestones', 'test'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel2, mmtmodel1, msmodel1), omit.coef = 'factor', stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Milestones'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +source('powerAnalysis.R') #my little "lib" +powerCheck(250, 1000) +powerCheck(275, 1000) +#Sample values +powerCheck(300, 1000) +summary(mmtmodel1) +texreg(list(fsmodel2, mmtmodel1, msmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Milestones'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel2, mmtmodel1, msmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Milestones', 'age 2', 'age 3', 'age 4'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +mmtmodel1 <- lm(up.fac.mean ~ mmt + as.factor(new.age), data=data1) +msmodel1 <- lm(up.fac.mean ~ old_milestones + as.factor(new.age), data=data1) +fsmodel2 <- lm(up.fac.mean ~ new.formal.score, data=data1) +texreg(list(fsmodel2, mmtmodel1, msmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Milestones', 'age 2', 'age 3', 'age 4'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(fsmodel2, mmtmodel1, msmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'age 2', 'age 3', 'age 4', 'Milestones'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +summary(msmodel1) +texreg(list(fsmodel2, mmtmodel1, msmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g +library(ggplot2) +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g +msmodel1 <- lm(up.fac.mean ~ old_milestones + as.factor(new.age), data=data1) +summary(msmodel1) +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw()+ +scale_color_brewer(palette="Dark2") +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_brewer(palette="Dark2") +g + scale_color_viridis_b() +g + scale_color_viridis_d() +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_viridis_b() +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_viridis_b() +g + scale_color_viridis_b() scale_fill_continuous(name = "New Legend Title") +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_viridis_b() scale_fill_continuous(name = "New Legend Title") +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_viridis_b() + scale_fill_continuous(name = "New Legend Title") +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_viridis_b() + scale_fill_continuous(name = "New Legend Title") +g + scale_color_viridis_b() + scale_color_continuous(name = "New Legend Title") +g + scale_color_viridis_b() + scale_color_continuous(name = "Age Group") +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_viridis_b() + scale_color_continuous(name = "Age Group") +g +g + scale_color_viridis_b(name = "Age Group") +g + scale_color_viridis_b(name = "Age Group", labels = c('1', '2', '3', '4')) +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_viridis_b(name = "Age Group", labels = c('1', '2', '3', '4')) +g +g + scale_fill_viridis_b(name = "Age Group", labels = c('1', '2', '3', '4')) +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_fill_viridis_b(name = "Age Group", labels = c('1', '2', '3', '4')) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_fill_viridis_b(name = "Age Group") +g +g + scale_fill_viridis_b(name = "Age Group") +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_colour_viridis_b(name = "Age Group") +g + scale_fill_continuous(name = "Age Group") +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_fill_continuous(name = "Age Group") +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") +g + scale_fill_continuous(name = "Age Group") +g + scale_fill_continuous(name = "Age Group", labels=c("Woman", "Man")) +g + scale_color_continuous(name = "Age Group", labels=c("Woman", "Man")) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_continuous(name = "Age Group", labels=c("Woman", "Man")) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g + scale_color_continuous(type = "viridis", name = "Age Group", labels=c("Woman", "Man")) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_continuous(type = "viridis", name = "Age Group") +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_continuous(type = "viridis", name = "Age Group", labels="0-9y", "9-12y", "12-15y", "15-17y") +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_continuous(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) +g ++ theme(legend.position = c(0.8, 0.2)) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_continuous(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) + +theme(legend.position = c(0.8, 0.2)) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_continuous(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) + +theme(legend.position = c(0.1, 0.1)) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_continuous(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) + +theme(legend.position = c(0.1, 0.2)) +g +summary(mmtmodel1) +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=as.factor(new.age))+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_continuous(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) + +theme(legend.position = c(0.1, 0.2)) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=as.factor(new.age))+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_discrete(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) + +theme(legend.position = c(0.1, 0.2)) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_identity(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) + +theme(legend.position = c(0.1, 0.2)) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=new.age)+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_identity(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) + +theme(legend.position = c(0.1, 0.2)) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +aes(x=mmt, y=up.fac.mean, color=round(new.age))+ +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() + +scale_color_identity(type = "viridis", name = "Age Group", labels=c("0-9y", "9-12y", "12-15y", "15-17y")) + +theme(legend.position = c(0.1, 0.2)) +g +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + +geom_point() + +geom_smooth() + +xlab("MMT") + +ylab("Underproduction Factor") + +theme_bw() +g diff --git a/R/EDA.R b/R/EDA.R index 289ab84..5acc661 100644 --- a/R/EDA.R +++ b/R/EDA.R @@ -1,7 +1,7 @@ library(dplyr) -df<-read.csv('~/Research/kkex_repo/power_data_111023_mmt.csv') -df1 <- read.csv('~/Research/kkex_repo/inst_all_packages_full_results.csv') +df<-read.csv('../power_data_111023_mmt.csv') +df1 <- read.csv('../inst_all_packages_full_results.csv') hist(df$age/365) #there's a big bump at 9 years, why? hist(df$contributors) #skewed diff --git a/R/calculatePower.R b/R/calculatePower.R index c69389a..e7dca57 100644 --- a/R/calculatePower.R +++ b/R/calculatePower.R @@ -20,6 +20,7 @@ library(ggplot2) # (1) - Get the pilot data and clean it #source('~/Research/tor_wikipedia_edits/handcoded_edits/inter_coder_reliability_ns0.R') #source ('/data/users/mgaughan/kkex_data_110823_3') + data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE) data2 <- read_csv('../inst_all_packages_full_results.csv') #d$nd <- to_logical(d$not.damaging, custom_true=c("Y")) @@ -27,45 +28,62 @@ data2 <- read_csv('../inst_all_packages_full_results.csv') python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)]) same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)]) data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE) -data1$milestones <- as.numeric(data1$milestones > 0) + 1 +data1$old_milestones <- data1$milestones +data1$new_milestones <- as.numeric(data1$milestones > 0) + 1 # (2) - Run the model on the pilot data -data1$formal.score <- data1$mmt / (data1$milestones/data1$age) -table(data1$milestones) +data1$formal.score <- data1$mmt / (data1$old_milestones/data1$age) +table(data1$formal.score) hist(data1$old_mmt, prob=TRUE) #inequality of participation hist(data1$formal.score) hist(data1$age/365) data1$new_mmt <- data1$mmt - 1 hist(data1$new_mmt, prob=TRUE) +data3 <- subset(data1, data1$old_milestones > 0 ) +data3$formal.score <- data3$mmt / (data3$old_milestones/data3$age) + data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) -data1$formal.score <- data1$mmt / (data1$milestones/data1$new.age) +data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age) hist(as.numeric(data1$new.age)) -table(data1$formal.score) -hist(data1$formal.score) -kmodel1 <- lm(up.fac.mean ~ mmt, data=data1) -summary(kmodel1) -kmodel1 <- lm(up.fac.mean ~ new_mmt, data=data1) -summary(kmodel1) -kmodel1 <- lm(up.fac.mean ~ new.age, data=data1) -summary(kmodel1) -kmodel1 <- lm(up.fac.mean ~ milestones, data=data1) -summary(kmodel1) +table(data1$new.age) +hist(data1$new.formal.score) +data5 <- subset(data1, is.finite(data1$formal.score)) +mmtmodel1 <- lm(up.fac.mean ~ mmt + as.factor(new.age), data=data1) +summary(mmtmodel1) +agemodel1 <- lm(up.fac.mean ~ new.age, data=data1) +summary(agemodel1) +msmodel1 <- lm(up.fac.mean ~ old_milestones + as.factor(new.age), data=data1) +summary(msmodel1) +msmodel2 <- lm(up.fac.mean ~ new_milestones, data=data1) +summary(msmodel2) +fsmodel1 <- lm(up.fac.mean ~ formal.score, data=data5) +summary(fsmodel1) +t.test(data3$formal.score) +fsmodel2 <- lm(up.fac.mean ~ new.formal.score, data=data1) +summary(fsmodel2) hist(data1$formal.score) cor.test(data1$formal.score, data1$up.fac.mean) cor.test(data1$mmt, data1$up.fac.mean) cor.test(data1$milestones, data1$up.fac.mean) cor.test(data1$age, data1$up.fac.mean) -g <- ggplot(data1, aes(x=new_mmt, y=up.fac.mean)) + +g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + geom_point() + - geom_smooth() + geom_smooth() + + xlab("MMT") + + ylab("Underproduction Factor") + + theme_bw() +g g data2 <- subset(data1, (data1$age / 365) < 14 ) hist(floor(data2$age)) g <- ggplot(data2, aes(x=mmt, y=up.fac.mean)) + geom_point() + - geom_smooth() + geom_smooth() + + xlab("MMT") + + ylab("Underproduction Factor") + + theme_bw() g data2$yearsOld <- floor(data2$age / 365) @@ -105,16 +123,22 @@ n <- 100 #a guess for necessary sample size (per group) #makeData(10) #DEBUGGING CODE -- you can uncomment this if you want to see it work #<==== +texreg(list(fsmodel2, mmtmodel1, msmodel1), stars=NULL, digits=2, + custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), + custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'), + use.packages=FALSE, table=FALSE, ci.force = TRUE) + +coef(fsmodel1) #print("Levels are:") #print(levels(d$source)) powerCheck(n, nSims) #powerCheck2(n, nSims) like doesn't really work #Sample values -powerCheck(100, 1000) -powerCheck(200, 1000) -powerCheck(300, 1000) +powerCheck(300, 1000) +powerCheck(275, 1000) +powerCheck(7000, 1000) powerCheck2(50, 1000) -powerCheck2(200, 1000) -powerCheck2(500, 1000) +powerCheck2(75, 1000) +powerCheck2(900, 1000) diff --git a/R/formal-underprod-ggplot.png b/R/formal-underprod-ggplot.png new file mode 100644 index 0000000..e348326 Binary files /dev/null and b/R/formal-underprod-ggplot.png differ diff --git a/R/mess-mmt-ggplot.png b/R/mess-mmt-ggplot.png new file mode 100644 index 0000000..0f8170f Binary files /dev/null and b/R/mess-mmt-ggplot.png differ diff --git a/R/mmt-underprod-ggplot.png b/R/mmt-underprod-ggplot.png new file mode 100644 index 0000000..ad7cde1 Binary files /dev/null and b/R/mmt-underprod-ggplot.png differ diff --git a/R/newmmt-underprod-plot.png b/R/newmmt-underprod-plot.png new file mode 100644 index 0000000..84e9010 Binary files /dev/null and b/R/newmmt-underprod-plot.png differ diff --git a/R/powerAnalysis.R b/R/powerAnalysis.R index 2c6f0ce..9c6138f 100644 --- a/R/powerAnalysis.R +++ b/R/powerAnalysis.R @@ -37,7 +37,8 @@ makeDataNew2 <- function(n) { tDF <- data.frame( ## don't sim the outcome #up.fac.mean=rnorm(n=n, mean=-0.1296376, sd=1.479847), # up.fac.mean - formal.score=rlnorm(n=n, mean=6.220282, sd = 2.544058) # formal.score + #formal.score=rlnorm(n=n, mean=6.220282, sd = 2.544058) # formal.score + formal.score=rbeta(n=n, 1, 3) * 10000 ) tDF[is.na(tDF) | tDF=="Inf"] = NA #sDF <- melt(tDF, id.vars = 0) #AKA the index is the unique id, as far as that goes @@ -58,7 +59,7 @@ powerCheck <- function(n, nSims) { #run a power calculation on the dataset given ## outcome goes here --v # e.g. simData$up.fac.mean <- (usefuleffsizeA * mmt) + (usefuleffsizeB * milestones) + rnorm(n=1, mean=0, sd=1) ##plus some noise #simData$up.fac.mean <- (-2.075 * simData$mmt) + (0.4284 * simData$milestones) + rnorm(n=1, mean=0, sd=1) - simData$up.fac.mean <- (-2.0745 * simData$new_mmt) + (0.4284 * simData$milestones) + rnorm(n=n, mean=0, sd=1) + simData$up.fac.mean <- (-1.38 * simData$new_mmt) + (0.40 * simData$milestones) + rnorm(n=n, mean=0, sd=1) #have updated for kkex through here, now need to look at the underproduction work #m1.sim <- lm(up.fac.mean ~ ((mmt)/ (milestones/age)), data=simData) ## could leave age out for now? @@ -88,7 +89,7 @@ powerCheck2 <- function(n, nSims) { #run a power calculation on the dataset give #have updated for kkex through here, now need to look at the underproduction work #m1.sim <- lm(up.fac.mean ~ ((mmt)/ (milestones/age)), data=simData) ## outcome goes here --v - simData$up.fac.mean <- (0.5 * simData$formal.score) + rnorm(1, mean=0, sd=1) ##plus some noise + simData$up.fac.mean <- (0.00017 * simData$formal.score) + rnorm(n, mean=0, sd=1) ##plus some noise m1.sim <- lm(up.fac.mean ~ formal.score, data=simData) p0 <- coef(summary(m1.sim))[1,4] p1 <- coef(summary(m1.sim))[2,4] diff --git a/R/texreg_1.39.3.tar.gz b/R/texreg_1.39.3.tar.gz new file mode 100644 index 0000000..b8a5548 Binary files /dev/null and b/R/texreg_1.39.3.tar.gz differ diff --git a/R/textreg_0.1.5.tar.gz b/R/textreg_0.1.5.tar.gz new file mode 100644 index 0000000..25aa698 Binary files /dev/null and b/R/textreg_0.1.5.tar.gz differ diff --git a/kkex-github-api-key.rtf b/kkex-github-api-key.rtf new file mode 100644 index 0000000..699d182 --- /dev/null +++ b/kkex-github-api-key.rtf @@ -0,0 +1,8 @@ +{\rtf1\ansi\ansicpg1252\cocoartf2708 +\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\margl1440\margr1440\vieww11520\viewh8400\viewkind0 +\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 + +\f0\fs24 \cf0 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} \ No newline at end of file