method='lm', formula= y~x, se=FALSE)+
  labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
  theme_bw() +
  theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
g2

# ---------------------------------------------------------------------------
# Console history restored to one statement per line, so `#` comments no
# longer swallow the commands that follow them.  Commands that could not run
# (misspelled object or function names, a path missing its "/", hist() on a
# factor, a mistyped denominator) are dropped; the corrected retry that
# followed each of them in the session is kept, in the original order.
# ---------------------------------------------------------------------------

# Label the four age bins, then redraw the per-bin linear fits.
data1$new.age.factor <- factor(
  data1$new.age,
  levels = c(1, 2, 3, 4),
  labels = c("0-9y", "9-12y", "12-15y", "15-16y")
)
g2 <- ggplot(data1, aes(x = mmt, y = up.fac.mean)) +
  geom_smooth(
    mapping = aes(x = mmt, y = up.fac.mean, color = new.age.factor),
    method = "lm", formula = y ~ x, se = FALSE
  ) +
  labs(x = "MMT", y = "Mean Underproduction Factor", color = "Project Age Group") +
  theme_bw() +
  theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
g2

library(readr)
library(ggplot2)
library(tidyverse)

# Count rows with a negative `underproduction_low`; the division that follows
# is a hand-typed ratio (presumably that count over the number of rows).
data7 <- read_csv("../final_data/kk_final_octo.csv", show_col_types = FALSE)
median(data7$underproduction_mean)
length(which(data7$underproduction_low < 0))
364 / 3843

data5 <- read_csv("../final_data/kk_final_readme_roster.csv", show_col_types = FALSE)
length(which(data5$underproduction_low < 0))
227 / 2695

# primary analysis for cross-sectional community metrics
overall_data <- read_csv("../final_data/deb_full_data.csv", show_col_types = FALSE)

# The session restarts here: workspace cleared, seed fixed, packages reloaded.
rm(list = ls())
set.seed(424242)
library(readr)
library(ggplot2)
library(tidyverse)

# primary analysis for cross-sectional community metrics
overall_data <- read_csv("../final_data/deb_full_data.csv", show_col_types = FALSE)
octo_data <- read_csv("../final_data/deb_octo_data.csv", show_col_types = FALSE)

# mmt = (2 * collaborators + contributors) / (contributors + collaborators)
overall_data$mmt <- (overall_data$collaborators * 2 + overall_data$contributors) /
  (overall_data$contributors + overall_data$collaborators)
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)

# Project age (days / 365) cut into four bins.  Several hand-picked break
# sets are tried in turn; each attempt overwrites the previous one.
overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 9, 12, 15, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 7, 11, 13, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 7, 11, 13, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 8, 11, 13, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 8, 11, 15, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 8, 11, 14, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 7, 11, 14, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 7, 10, 14, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 7, 10, 13, 17), labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

# Inspect the age distribution.  The interior breaks used next are presumably
# the quartiles printed by quantile() -- confirm against its output.
age_vector <- overall_data$age_of_project / 365
order(age_vector)
order(age_vector)
quantile(age_vector)

overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
hist(overall_data$new.age)

# Hand-typed ratios (presumably the bin counts over the number of rows).
1159 / 5105
1391 / 5105
1277 / 5105
1276 / 5105

# shows the cross-age downward slopes for all underproduction averages in the
# face of MMT
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(
    mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor),
    method = "lm", formula = y ~ x
  ) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()
g3

# same plot with the raw points drawn underneath the fits
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_point(mapping = aes(color = new.age.factor)) +
  geom_smooth(
    mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor),
    method = "lm", formula = y ~ x
  ) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()
g3

# back to the fits-only version
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(
    mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor),
    method = "lm", formula = y ~ x
  ) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()
g3
# ---------------------------------------------------------------------------
# Console history restored to one statement per line, in the original order.
# Changes relative to the raw history:
#   * commands that cannot run are dropped (maximum(), a stray
#     ", na.rm = TRUE", row-indexing without the comma, lm() on a column that
#     overall_data is never given, and an assignment that replaced octo_data
#     with a bare vector);
#   * length(x > 2) -- which is just length(x) -- becomes
#     sum(x > 2, na.rm = TRUE), the count the session was after;
#   * the first wiki_mmt formula no longer adds the wiki count twice while
#     leaving the issue count out of the numerator;
#   * texreg is loaded before its first use and the table calls get labels
#     that match the models passed in.
# ---------------------------------------------------------------------------

# Overall dataset: underproduction regressed on MMT plus the age-bin factor.
mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data = overall_data)
summary(mmtmodel1)

# shows the cross-age downward slopes for all underproduction averages in the
# face of MMT
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(
    mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor),
    method = "lm", formula = y ~ x
  ) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
g3

# same plot without an explicit `method` (geom_smooth picks its default)
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(
    mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor),
    formula = y ~ x
  ) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
g3

# g4: default smoother; the legend anchor is moved around until it fits.
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor)) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
g4

g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor)) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(0.0, 0.0), legend.justification = c("left", "bottom"))
g4

g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor)) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(0.0, 0.0), legend.justification = c("right", "top"))
g4

g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor)) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(1.0, 1.0), legend.justification = c("right", "top"))
g4

g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor)) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(0.9, 1.0), legend.justification = c("right", "top"))
g4

g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor)) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top"))
g4

g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(
    mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor),
    se = FALSE
  ) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw() +
  theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top"))
g4

min(overall_data$underproduction_mean)
max(overall_data$underproduction_mean)

# --- octo data: same age bins and MMT as the overall dataset ----------------
octo_data <- read_csv("../final_data/deb_octo_data.csv", show_col_types = FALSE)
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
# Hand-typed ratios (presumably the bin counts over the number of rows).
999 / 3842
1139 / 3842
955 / 3842
747 / 3842
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(overall_data$new.age)
hist(octo_data$new.age)

octo_data$mmt <- (octo_data$collaborators * 2 + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(overall_data$mmt)
mean(octo_data$mmt)
hist(octo_data$mmt, probability = TRUE)
head(octo_data)

# First attempt at an issue-based MMT: api contributors only in the denominator.
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 + octo_data$api_contrib_count) /
  octo_data$api_contrib_count
hist(octo_data$issue_mmt, probability = TRUE)

# Second attempt: issue contributors weighted double over the sum of all four
# contributor counts.
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 + octo_data$api_contrib_count +
  octo_data$wiki_contrib_count + octo_data$file_contrib_count) /
  (octo_data$api_contrib_count + octo_data$wiki_contrib_count +
    octo_data$issue_contrib_count + octo_data$file_contrib_count)
hist(octo_data$issue_mmt, probability = TRUE)

octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data = octo_data)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data = octo_data)
summary(issue_mmtmodel1)

# Wiki analogue of the issue formula above: wiki contributors weighted double,
# the other three counts once each (the raw history repeated the wiki count
# where the issue count belongs).
octo_data$wiki_mmt <- (octo_data$wiki_contrib_count * 2 + octo_data$api_contrib_count +
  octo_data$issue_contrib_count + octo_data$file_contrib_count) /
  (octo_data$api_contrib_count + octo_data$wiki_contrib_count +
    octo_data$issue_contrib_count + octo_data$file_contrib_count)
hist(octo_data$wiki_mmt, probability = TRUE)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data = octo_data)
summary(wiki_mmtmodel1)

# --- regression tables -------------------------------------------------------
library(texreg)

# The single model name and the five coefficient labels describe mmtmodel1
# only, so only that model is passed (the raw call listed three models).
texreg(
  list(mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("MMT (Overall Dataset)"),
  custom.coef.names = c("(Intercept)", "MMT", "Age-2", "Age-3", "Age-4"),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)
# NOTE(review): powerAnalysis.R is not part of this file -- confirm it exists
# relative to the working directory.
source("powerAnalysis.R") # my little "lib"
# Labels name the predictors actually in these three models.
texreg(
  list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("M1: MMT", "M2: issue contrib.", "M3: wiki_contrib."),
  custom.coef.names = c("(Intercept)", "MMT", "Issues", "Age-2", "Age-3", "Age-4", "Wiki"),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)
texreg(
  list(mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("MMT (Overall Dataset)"),
  custom.coef.names = c("(Intercept)", "MMT", "Age-2", "Age-3", "Age-4"),
  use.packages = FALSE, table = TRUE, ci.force = TRUE
)

# --- readme roster -----------------------------------------------------------
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
# below here is the analysis for the readme data
readme_data$new.age <- as.numeric(cut(
  readme_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(readme_data$new.age)
readme_data$new.age.factor <- as.factor(readme_data$new.age)
hist(readme_data$new.age)
637 / 2694
676 / 2694
725 / 2694
656 / 2694

# --- contributing.md roster --------------------------------------------------
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
# re-run of the readme age bins
readme_data$new.age <- as.numeric(cut(
  readme_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(readme_data$new.age)
readme_data$new.age.factor <- as.factor(readme_data$new.age)
# below here is the analysis for the contributing.md files
contributing_data$new.age <- as.numeric(cut(
  contributing_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(contributing_data$new.age)
contributing_data$new.age.factor <- as.factor(contributing_data$new.age)
hist(contributing_data$new.age)
76 / 528
119 / 528
171 / 528
162 / 528

octo_data <- read_csv("../new_denom_032624.csv", show_col_types = FALSE)

# The session restarts here: workspace cleared, seed fixed, packages reloaded.
rm(list = ls())
set.seed(424242)
library(readr)
library(ggplot2)
library(tidyverse)

readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
octo_data <- read_csv("../new_denom_032624.csv", show_col_types = FALSE)
# below this is the analysis for the octo data
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))

octo_data <- read_csv("../new_denom_032624.csv", show_col_types = FALSE)
# below this is the analysis for the octo data
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)
octo_data$mmt <- (octo_data$collaborators * 2 + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt, probability = TRUE)
head(octo_data)

# TODO: the counts here aren't unique, need to go back and calculate so that
# there is no overlap between the counts, i.e. needs to be a total contrib
# number that is not attached to the high level counts
# issue_mmt = (2 * issue + (total - issue)) / total
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt, probability = TRUE)

# TODO: (as above) the counts aren't unique
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt, probability = TRUE)
max(octo_data$issue_mmt)
max(octo_data$issue_mmt)
median(octo_data$issue_mmt)
median(octo_data$issue_mmt)
min(octo_data$issue_mmt)
hist(octo_data$total_contrib)
mean(octo_data$total_contrib)
median(octo_data$total_contrib)
median(octo_data$contributors)
median(octo_data$collaborators)
median(octo_data$total_contrib)

# TODO: (as above) the counts aren't unique
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt, probability = TRUE)
hist(octo_data$issue_mmt)

# wiki_mmt = (2 * wiki + (total - wiki)) / total
octo_data$wiki_mmt <- (octo_data$wiki_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$wiki_mmt)
min(octo_data$wiki_mmt)
median(octo_data$wiki_mmt)

# TODO: (as above) the counts aren't unique
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)

octo_data$mmt <- (octo_data$collaborators * 2 + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt)
median(octo_data$total_contrib)

# TODO: (as above) the counts aren't unique
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
max(octo_data$issue_mmt)

octo_data$wiki_mmt <- (octo_data$wiki_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$wiki_mmt)
median(octo_data$wiki_mmt)

# below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data = octo_data)
summary(octo_mmtmodel1)

# TODO: (as above) the counts aren't unique
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
typeof(octo_data$issue_mmt)
length(octo_data$issue_mmt)

# TODO: (as above) the counts aren't unique
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
length(octo_data$issue_mmt)
# Count of issue_mmt values above 2 (only possible when issue_contrib_count
# exceeds total_contrib).  na.rm = TRUE because the column can hold NA.
sum(octo_data$issue_mmt > 2)
sum(octo_data$issue_mmt > 2, na.rm = TRUE)
sum(octo_data$issue_mmt > 2.0, na.rm = TRUE)
median(octo_data$wiki_mmt)
typeof(octo_data$issue_mmt)
median(octo_data$issue_mmt, na.rm = TRUE)
median(octo_data$issue_contrib_count)
median(octo_data$issue_contrib_count)

# Reload and this time drop incomplete rows from the whole data frame.
octo_data <- read_csv("../new_denom_032624.csv", show_col_types = FALSE)
# below this is the analysis for the octo data
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)
octo_data$mmt <- (octo_data$collaborators * 2 + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)
median(octo_data$issue_contrib_count)
octo_data <- na.omit(octo_data)
median(octo_data$issue_contrib_count)

# TODO: (as above) the counts aren't unique
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
median(octo_data$issue_mmt, na.rm = TRUE)
sum(octo_data$issue_mmt > 2.0, na.rm = TRUE)
sum(octo_data$issue_mmt > 2.0, na.rm = TRUE)
sum(octo_data$issue_mmt > 2, na.rm = TRUE)
median(octo_data$issue_mmt)
median(octo_data$issue_mmt, na.rm = TRUE)
sum(octo_data$issue_mmt > 2, na.rm = TRUE)
sum(octo_data$issue_mmt > 2, na.rm = TRUE)
sum(octo_data$issue_mmt > 2.0, na.rm = TRUE)
max(octo_data$issue_mmt, na.rm = TRUE)

# MMT recomputed against total_contrib instead of contributors + collaborators.
octo_data$new_mmt <- (octo_data$collaborators * 2 +
  (octo_data$total_contrib - octo_data$collaborators)) /
  octo_data$total_contrib
hist(octo_data$new_mmt)
octo_data$mmt <- (octo_data$collaborators * 2 + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt)

# TODO: there's an issue with calculating this but somehow not an issue with
# the wiki one
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
octo_data$wiki_mmt <- (octo_data$wiki_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$wiki_mmt)
hist(octo_data$issue_mmt)
sum(octo_data$issue_mmt > 2.0, na.rm = TRUE)

# Rows where the issue count exceeds the total -- the source of issue_mmt > 2.
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib), ]
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib), ]$issue_contrib_count

octo_data <- read_csv("../new_denom_032624_stripped.csv", show_col_types = FALSE)
# below this is the analysis for the octo data
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)
octo_data$mmt <- (octo_data$collaborators * 2 + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib), ]$issue_contrib_count
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib), ]
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib), ]

octo_data <- read_csv("../new_denom_032624_stripped.csv", show_col_types = FALSE)
# below this is the analysis for the octo data
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)
octo_data$mmt <- (octo_data$collaborators * 2 + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)

# Keep only rows whose issue count does not exceed the total count.
octo_data <- octo_data[which(octo_data$issue_contrib_count <= octo_data$total_contrib), ]
# TODO: there's an issue with calculating this but somehow not an issue with
# the wiki one
octo_data$issue_mmt <- (octo_data$issue_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
max(octo_data$issue_mmt, na.rm = TRUE)
sum(octo_data$issue_mmt > 2.0, na.rm = TRUE)

issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data = octo_data)
summary(issue_mmtmodel1)
# NOTE(review): wiki_mmt is not recomputed after the reload above, so this fit
# relies on the CSV already carrying a wiki_mmt column -- confirm.
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data = octo_data)
summary(wiki_mmtmodel1)

write.csv(octo_data, "new_octo.csv", row.names = FALSE)
octo_data <- read_csv("../final_data/deb_octo_data.csv", show_col_types = FALSE)
qqnorm(octo_data$issue_mmt)
octo_data$wiki_mmt <- (octo_data$wiki_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
# ---------------------------------------------------------------------------
# Console history restored to one statement per line, in the original order.
# Changes relative to the raw history:
#   * qqnorm(residuals(<numeric column>)) is dropped -- residuals() was given
#     a data column, not a fitted model;
#   * install.packages(rcompanion) (unquoted, and issued after library())
#     becomes a guarded install placed before the load;
#   * texreg is loaded before it is used, and the three table calls that
#     carried labels for other predictors ("augm. formality", "Milestones")
#     are dropped in favour of the correctly labelled final call.
# ---------------------------------------------------------------------------

# --- distribution checks on the wiki / issue MMT columns ---------------------
hist(octo_data$wiki_mmt)
median(octo_data$wiki_mmt)
qqnorm(octo_data$wiki_mmt)
qqnorm(octo_data$issue_mmt)
qqnorm(octo_data$wiki_mmt)
qqnorm(log(octo_data$issue_mmt))
qqnorm(octo_data$issue_mmt)
qqnorm(log(octo_data$issue_mmt))
qqnorm(octo_data$issue_mmt)
qqnorm(log(octo_data$issue_mmt))
qqnorm(octo_data$issue_mmt)
qqnorm(log(octo_data$issue_mmt))
qqnorm(octo_data$issue_mmt)
hist(log(octo_data$issue_mmt))
hist(sqrt(octo_data$issue_mmt))

# below are the models for the octo data, there should be analysis for each one
# NOTE(review): these two fits run before new.age.factor is rebuilt on the
# freshly loaded octo_data; they depend on whatever the CSV provides for `mmt`
# and `new.age.factor` -- confirm.  The fit after the re-binning supersedes
# them.
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data = octo_data)
summary(octo_mmtmodel1)
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data = octo_data)
summary(octo_mmtmodel1)

# below this is the analysis for the octo data
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)

# below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data = octo_data)
summary(octo_mmtmodel1)

hist(sqrt(octo_data$issue_mmt))
hist(sqrt(octo_data$issue_mmt))
hist(octo_data$issue_mmt)

# right skewed data, need to transform
if (!requireNamespace("rcompanion", quietly = TRUE)) {
  install.packages("rcompanion")
}
library(rcompanion)

hist(sqrt(octo_data$issue_mmt))
qqnorm(1 / octo_data$issue_mmt)
hist(1 / octo_data$issue_mmt)
hist(log(octo_data$issue_mmt))
hist(sqrt(octo_data$issue_mmt))
hist(log(octo_data$issue_mmt))

# Square-root transform of issue_mmt, compared against the untransformed fit.
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
sqrt_issue_mmtmodel1 <- lm(
  underproduction_mean ~ sqrt_issue_mmt + new.age.factor,
  data = octo_data
)
summary(sqrt_issue_mmtmodel1)
summary(issue_mmtmodel1)

# wiki_mmt = (2 * wiki + (total - wiki)) / total
octo_data$wiki_mmt <- (octo_data$wiki_contrib_count * 2 +
  (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$wiki_mmt)
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data = octo_data)
summary(wiki_mmtmodel1)

# --- ggplot histograms; bin width narrowed from 5 down to 0.01 ---------------
g3 <- ggplot(octo_data, aes(wiki_mmt)) +
  geom_histogram(binwidth = 5)
g3
g3 <- ggplot(octo_data, aes(wiki_mmt)) +
  geom_histogram(binwidth = 0.05)
g3
g3 <- ggplot(octo_data, aes(wiki_mmt)) +
  geom_histogram(binwidth = 0.05) +
  theme_bw()
g3
g3 <- ggplot(octo_data, aes(wiki_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g3
g2 <- ggplot(octo_data, aes(issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g2
g1 <- ggplot(octo_data, aes(sqrt_issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g1
g3 <- ggplot(octo_data, aes(wiki_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g3
g2 <- ggplot(octo_data, aes(issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g2

# --- regression table --------------------------------------------------------
# NOTE(review): powerAnalysis.R is not part of this file -- confirm it exists
# relative to the working directory.
source("powerAnalysis.R") # my little "lib"
library(texreg)
texreg(
  list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("M1: MMT", "M2: issue contrib.", "M3: wiki_contrib."),
  custom.coef.names = c("(Intercept)", "MMT", "Issues", "Age-2", "Age-3", "Age-4", "Wiki"),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)