method='lm', formula= y~x, se=FALSE)+
labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
theme_bw() +
theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
g2
# Turn the numeric age bin (1-4) into a labelled factor so the legend shows
# year ranges instead of bin numbers.
# NOTE(review): the last label reads "15-16y"; confirm it matches the upper
# break used when new.age was cut.
data1$new.age.factor <- factor(
  data1$new.age,
  levels = 1:4,
  labels = c("0-9y", "9-12y", "12-15y", "15-16y")
)

# One straight-line fit (no confidence band) per age group. The layer
# inherits x and y from the plot-level mapping and only adds the colour.
g2 <- ggplot(data1, aes(x = mmt, y = up.fac.mean)) +
  geom_smooth(
    aes(color = new.age.factor),
    method = "lm",
    formula = y ~ x,
    se = FALSE
  ) +
  labs(
    x = "MMT",
    y = "Mean Underproduction Factor",
    color = "Project Age Group"
  ) +
  theme_bw() +
  theme(
    legend.position = c(0.05, 0.05),
    legend.justification = c("left", "bottom")
  )
g2
library(readr)
library(ggplot2)
library(tidyverse)
# Count and share of rows whose lower underproduction bound is negative, for
# the octo sample (data7) and the README-roster sample (data5).
# The shares are computed from the data instead of being typed in by hand.
data7 <- read_csv("../final_data/kk_final_octo.csv", show_col_types = FALSE)
median(data7$underproduction_mean)
# sum() over the logical mask counts the TRUEs; na.rm = TRUE matches
# length(which(...)), which also ignores NA.
sum(data7$underproduction_low < 0, na.rm = TRUE)
sum(data7$underproduction_low < 0, na.rm = TRUE) / nrow(data7)

# Only the path under ../final_data/ is read here; the two mistyped paths
# tried before it are dropped so a sourced run does not stop on them.
data5 <- read_csv(
  "../final_data/kk_final_readme_roster.csv",
  show_col_types = FALSE
)
sum(data5$underproduction_low < 0, na.rm = TRUE)
sum(data5$underproduction_low < 0, na.rm = TRUE) / nrow(data5)
# Primary analysis for cross-sectional community metrics.
# NOTE(review): rm(list = ls()) wipes the whole workspace; it is kept because
# the statements below were written against a clean session.
rm(list = ls())
set.seed(424242)
library(readr)
library(ggplot2)
library(tidyverse)

# Load the data after the reset (a read placed before rm() is thrown away).
overall_data <- read_csv(
  "../final_data/deb_full_data.csv",
  show_col_types = FALSE
)
octo_data <- read_csv(
  "../final_data/deb_octo_data.csv",
  show_col_types = FALSE
)

# MMT = (2 * collaborators + contributors) / (contributors + collaborators).
overall_data$mmt <-
  ((overall_data$collaborators * 2) + overall_data$contributors) /
  (overall_data$contributors + overall_data$collaborators)
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)

# First pass at four age bins. age_of_project is divided by 365 before
# cutting (presumably days -> years; confirm the unit of the column).
overall_data$new.age <- as.numeric(cut(
  overall_data$age_of_project / 365,
  breaks = c(0, 9, 12, 15, 17),
  labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
# hist() requires a numeric vector, so the distribution is drawn from the
# numeric bin column rather than from the factor.
hist(overall_data$new.age)
# Final age binning for the overall dataset.
# Several hand-picked break sets (e.g. 0/7/11/13/17, 0/8/11/14/17,
# 0/7/10/13/17) were tried interactively; each assignment was overwritten by
# the next, so only the last one, based on the quartiles of project age,
# is kept here.
age_vector <- overall_data$age_of_project / 365
# Prints the quartiles that the inner break points below were taken from.
quantile(age_vector)

overall_data$new.age <- as.numeric(cut(
  age_vector,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(overall_data$new.age)
overall_data$new.age.factor <- as.factor(overall_data$new.age)
# hist() needs numeric input, so plot the numeric bin column.
hist(overall_data$new.age)

# Share of all rows in each bin, computed instead of typed in by hand.
# Dividing by nrow() keeps the full row count as denominator; rows whose age
# falls outside the breaks get NA from cut() and are not counted by table().
table(overall_data$new.age) / nrow(overall_data)
# Per-age-group linear fits of mean underproduction against MMT (meant to
# show the slope of underproduction over MMT within every age group).
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor), method = "lm", formula = y ~ x) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw()
g3

# Same fits drawn on top of the raw points, coloured by age group.
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_point(aes(color = new.age.factor)) +
  geom_smooth(aes(color = new.age.factor), method = "lm", formula = y ~ x) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw()
g3

# Back to the fit-only version, so g3 ends up without the point layer.
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor), method = "lm", formula = y ~ x) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw()
g3
# Linear model of mean underproduction on MMT, with age group as a
# categorical control. The outcome is `underproduction_mean`, the name used
# for overall_data everywhere else; the earlier attempt with `up.fac.mean`
# is dropped so the model is fitted exactly once.
mmtmodel1 <- lm(
  underproduction_mean ~ mmt + new.age.factor,
  data = overall_data
)
summary(mmtmodel1)
# Per-age-group linear fits of mean underproduction against MMT, with the
# legend placed inside the panel at the bottom-left corner.
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor), method = "lm", formula = y ~ x) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.05, 0.05),
    legend.justification = c("left", "bottom")
  )
g3

# Same layout, but with geom_smooth()'s default smoothing method instead of
# a forced straight line.
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor), formula = y ~ x) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.05, 0.05),
    legend.justification = c("left", "bottom")
  )
g3
# Default-smoother version of the MMT plot, redrawn while trying different
# legend placements. Each trial overwrites g4 and prints it.

# Trial 1: legend anchored by its bottom-left corner near the origin.
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor)) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.05, 0.05),
    legend.justification = c("left", "bottom")
  )
g4

# Trial 2: same anchor, moved exactly into the corner.
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor)) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.0, 0.0),
    legend.justification = c("left", "bottom")
  )
g4

# Trial 3: position (0, 0) but anchored by the legend's top-right corner.
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor)) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.0, 0.0),
    legend.justification = c("right", "top")
  )
g4

# Trial 4: top-right anchor at position (1, 1).
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor)) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(1.0, 1.0),
    legend.justification = c("right", "top")
  )
g4

# Trial 5: nudged left to x = 0.9.
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor)) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.9, 1.0),
    legend.justification = c("right", "top")
  )
g4

# Trial 6: nudged down as well, to (0.9, 0.9).
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor)) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.9, 0.9),
    legend.justification = c("right", "top")
  )
g4

# Trial 7: same placement, with the confidence bands switched off.
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor), se = FALSE) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.9, 0.9),
    legend.justification = c("right", "top")
  )
g4
# Range of the outcome in the overall dataset.
min(overall_data$underproduction_mean)
max(overall_data$underproduction_mean)

# Octo subset: same age breaks as the overall dataset so the bins line up.
octo_data <- read_csv(
  "../final_data/deb_octo_data.csv",
  show_col_types = FALSE
)
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
# Share of all rows per bin, computed instead of hard-coded.
table(octo_data$new.age) / nrow(octo_data)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)

# MMT = (2 * collaborators + contributors) / (contributors + collaborators).
octo_data$mmt <-
  ((octo_data$collaborators * 2) + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(overall_data$mmt)
mean(octo_data$mmt)
hist(octo_data$mmt, probability = TRUE)
head(octo_data)

# Issue-based MMT: issue contributors are weighted twice, every other
# contributor type once, over the sum of all four counts.
octo_data$issue_mmt <- with(
  octo_data,
  (2 * issue_contrib_count + api_contrib_count + wiki_contrib_count +
    file_contrib_count) /
    (api_contrib_count + wiki_contrib_count + issue_contrib_count +
      file_contrib_count)
)
hist(octo_data$issue_mmt, probability = TRUE)

# Wiki-based MMT, built the same way with the roles of wiki and issue
# swapped. (The copy-pasted version added wiki a third time and left the
# issue count out of the numerator.)
octo_data$wiki_mmt <- with(
  octo_data,
  (2 * wiki_contrib_count + api_contrib_count + issue_contrib_count +
    file_contrib_count) /
    (api_contrib_count + wiki_contrib_count + issue_contrib_count +
      file_contrib_count)
)
hist(octo_data$wiki_mmt, probability = TRUE)

# One model per MMT variant, each controlling for age group.
octo_mmtmodel1 <- lm(
  underproduction_mean ~ mmt + new.age.factor,
  data = octo_data
)
summary(octo_mmtmodel1)
issue_mmtmodel1 <- lm(
  underproduction_mean ~ issue_mmt + new.age.factor,
  data = octo_data
)
summary(issue_mmtmodel1)
# Fitted on wiki_mmt only; the first attempt reused issue_mmt by mistake.
wiki_mmtmodel1 <- lm(
  underproduction_mean ~ wiki_mmt + new.age.factor,
  data = octo_data
)
summary(wiki_mmtmodel1)
# texreg() comes from the texreg package; load it here so the calls below do
# not depend on an external helper script.
# NOTE(review): this replaces source('powerAnalysis.R'); restore that line if
# the script provides anything else this session needs.
library(texreg)

# Table for the overall-dataset model. One model name and five coefficient
# labels were supplied, which fits exactly one model with the terms
# (Intercept), mmt and the three non-reference age levels, so only mmtmodel1
# is passed. Printed once without and once with the table environment.
texreg(
  list(mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("MMT (Overall Dataset)"),
  custom.coef.names = c("(Intercept)", "MMT", "Age-2", "Age-3", "Age-4"),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)
texreg(
  list(mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("MMT (Overall Dataset)"),
  custom.coef.names = c("(Intercept)", "MMT", "Age-2", "Age-3", "Age-4"),
  use.packages = FALSE, table = TRUE, ci.force = TRUE
)

# Three-model table for the octo data. Labels are attached by coefficient
# name via custom.coef.map, so they cannot drift if texreg orders the merged
# rows differently from a hand-written positional vector.
texreg(
  list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("M1: MMT", "M2: issue contrib.", "M3: wiki_contrib."),
  custom.coef.map = list(
    "(Intercept)" = "(Intercept)",
    "mmt" = "MMT",
    "issue_mmt" = "Issues",
    "new.age.factor2" = "Age-2",
    "new.age.factor3" = "Age-3",
    "new.age.factor4" = "Age-4",
    "wiki_mmt" = "Wiki"
  ),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)
# Below here is the analysis for the README roster.
readme_data <- read_csv(
  "../final_data/deb_readme_roster.csv",
  show_col_types = FALSE
)
# Same age breaks as the overall dataset so the bins are comparable.
readme_data$new.age <- as.numeric(cut(
  readme_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(readme_data$new.age)
readme_data$new.age.factor <- as.factor(readme_data$new.age)
hist(readme_data$new.age)
# Share of all rows per bin, computed instead of typed in by hand.
table(readme_data$new.age) / nrow(readme_data)

# Below here is the analysis for the CONTRIBUTING.md roster.
contributing_data <- read_csv(
  "../final_data/deb_contribfile_roster.csv",
  show_col_types = FALSE
)
contributing_data$new.age <- as.numeric(cut(
  contributing_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(contributing_data$new.age)
contributing_data$new.age.factor <- as.factor(contributing_data$new.age)
hist(contributing_data$new.age)
table(contributing_data$new.age) / nrow(contributing_data)
# Second session: start clean, then load the README roster and the octo data
# with the new denominator. Each file is read once, after the reset (a read
# placed before rm() is discarded by it).
# NOTE(review): rm(list = ls()) wipes the whole workspace.
rm(list = ls())
set.seed(424242)
library(readr)
library(ggplot2)
library(tidyverse)

readme_data <- read_csv(
  "../final_data/deb_readme_roster.csv",
  show_col_types = FALSE
)
octo_data <- read_csv("../new_denom_032624.csv", show_col_types = FALSE)

# Below this is the analysis for the octo data.
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)

# MMT = (2 * collaborators + contributors) / (contributors + collaborators).
octo_data$mmt <-
  ((octo_data$collaborators * 2) + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt, probability = TRUE)
head(octo_data)
# TODO: the counts here aren't unique; they need to be recalculated so that
# there is no overlap between them, i.e. a total contributor number that is
# not derived from the high-level counts.
# Issue MMT: issue contributors weighted twice, the remaining contributors
# once, over total_contrib. Algebraically this is 1 + issue / total.
octo_data$issue_mmt <-
  ((octo_data$issue_contrib_count * 2) +
    (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt, probability = TRUE)
hist(octo_data$issue_mmt)
# na.rm = TRUE: without it a single missing count turns the summary into NA.
# max() is the base R function; `maximum()` does not exist.
max(octo_data$issue_mmt, na.rm = TRUE)
median(octo_data$issue_mmt, na.rm = TRUE)
min(octo_data$issue_mmt, na.rm = TRUE)

# Size of the denominator and of the two roster counts.
hist(octo_data$total_contrib)
mean(octo_data$total_contrib)
median(octo_data$total_contrib)
median(octo_data$contributors)
median(octo_data$collaborators)

# Wiki MMT, same construction with the wiki contributor count.
octo_data$wiki_mmt <-
  ((octo_data$wiki_contrib_count * 2) +
    (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$wiki_mmt)
min(octo_data$wiki_mmt)
median(octo_data$wiki_mmt)

# Roster-based MMT, for comparison with the two variants above.
octo_data$mmt <-
  ((octo_data$collaborators * 2) + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt)
# Below are the models for the octo data; there should be analysis for each.
octo_mmtmodel1 <- lm(
  underproduction_mean ~ mmt + new.age.factor,
  data = octo_data
)
summary(octo_mmtmodel1)

# TODO: the counts here aren't unique; they need to be recalculated so that
# there is no overlap between them, i.e. a total contributor number that is
# not derived from the high-level counts.
octo_data$issue_mmt <-
  ((octo_data$issue_contrib_count * 2) +
    (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
# max() is the base R function; `maximum()` does not exist.
max(octo_data$issue_mmt, na.rm = TRUE)
typeof(octo_data$issue_mmt)
length(octo_data$issue_mmt)
# Number of rows with issue_mmt above 2. sum() counts the TRUEs; length() of
# a comparison only returns the length of the vector.
sum(octo_data$issue_mmt > 2, na.rm = TRUE)
median(octo_data$wiki_mmt)
median(octo_data$issue_mmt, na.rm = TRUE)

# Median before and after dropping rows with a missing issue count. The rows
# are filtered inside the data frame; assigning na.omit() of the column to
# octo_data would replace the whole data frame with a bare vector.
median(octo_data$issue_contrib_count)
octo_data <- octo_data[!is.na(octo_data$issue_contrib_count), ]
median(octo_data$issue_contrib_count)
octo_data <- read_csv("../new_denom_032624.csv", show_col_types = FALSE)
# Below this is the analysis for the octo data.
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)
octo_data$mmt <-
  ((octo_data$collaborators * 2) + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)

# Median issue count before and after dropping every row that has a missing
# value in any column.
median(octo_data$issue_contrib_count)
octo_data <- na.omit(octo_data)
median(octo_data$issue_contrib_count)

# TODO: the counts here aren't unique; they need to be recalculated so that
# there is no overlap between them, i.e. a total contributor number that is
# not derived from the high-level counts.
octo_data$issue_mmt <-
  ((octo_data$issue_contrib_count * 2) +
    (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
median(octo_data$issue_mmt, na.rm = TRUE)
max(octo_data$issue_mmt, na.rm = TRUE)
# Count of rows above 2 (sum of the logical mask, not its length).
sum(octo_data$issue_mmt > 2, na.rm = TRUE)

# Collaborator-based MMT over total_contrib instead of the roster sum.
octo_data$new_mmt <-
  ((octo_data$collaborators * 2) +
    (octo_data$total_contrib - octo_data$collaborators)) /
  octo_data$total_contrib
hist(octo_data$new_mmt)

octo_data$wiki_mmt <-
  ((octo_data$wiki_contrib_count * 2) +
    (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$wiki_mmt)

# issue_mmt equals 1 + issue / total, so it exceeds 2 exactly when the issue
# count is larger than total_contrib. List those rows; the trailing comma
# selects rows (without it the index would pick columns).
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib), ]
octo_data[
  which(octo_data$issue_contrib_count > octo_data$total_contrib),
]$issue_contrib_count
octo_data <- read_csv(
  "../new_denom_032624_stripped.csv",
  show_col_types = FALSE
)
# Below this is the analysis for the octo data.
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)
octo_data$mmt <-
  ((octo_data$collaborators * 2) + octo_data$contributors) /
  (octo_data$contributors + octo_data$collaborators)
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)

# Rows whose issue count exceeds total_contrib would give issue_mmt > 2.
# Show them, then keep only the consistent rows (which() also drops rows
# where the comparison is NA).
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib), ]
octo_data <- octo_data[
  which(octo_data$issue_contrib_count <= octo_data$total_contrib),
]

# Issue MMT = 1 + issue / total; after the filter above it cannot exceed 2.
octo_data$issue_mmt <-
  ((octo_data$issue_contrib_count * 2) +
    (octo_data$total_contrib - octo_data$issue_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$issue_mmt)
max(octo_data$issue_mmt, na.rm = TRUE)
# Count of rows above 2 (sum of the logical mask, not its length).
sum(octo_data$issue_mmt > 2, na.rm = TRUE)

issue_mmtmodel1 <- lm(
  underproduction_mean ~ issue_mmt + new.age.factor,
  data = octo_data
)
summary(issue_mmtmodel1)

# Written before wiki_mmt is added so the exported columns stay as before.
write.csv(octo_data, "new_octo.csv", row.names = FALSE)

# wiki_mmt has to be derived on this freshly loaded data before the model
# can refer to it.
octo_data$wiki_mmt <-
  ((octo_data$wiki_contrib_count * 2) +
    (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
wiki_mmtmodel1 <- lm(
  underproduction_mean ~ wiki_mmt + new.age.factor,
  data = octo_data
)
summary(wiki_mmtmodel1)
octo_data <- read_csv(
  "../final_data/deb_octo_data.csv",
  show_col_types = FALSE
)
# Below this is the analysis for the octo data.
# The age factor is rebuilt before any model is fitted: a factor column that
# went through a CSV comes back as plain numbers, which lm() would treat as
# a continuous predictor.
octo_data$new.age <- as.numeric(cut(
  octo_data$age_of_project / 365,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
hist(octo_data$new.age)

# Wiki MMT = 1 + wiki / total.
octo_data$wiki_mmt <-
  ((octo_data$wiki_contrib_count * 2) +
    (octo_data$total_contrib - octo_data$wiki_contrib_count)) /
  octo_data$total_contrib
hist(octo_data$wiki_mmt)
median(octo_data$wiki_mmt)

# Normality checks on the predictors and some candidate transformations.
# NOTE(review): issue_mmt and mmt are not computed in this section, so they
# are presumably columns of the CSV; confirm.
qqnorm(octo_data$issue_mmt)
qqnorm(octo_data$wiki_mmt)
qqnorm(log(octo_data$issue_mmt))
qqnorm(1 / octo_data$issue_mmt)
# residuals() needs a fitted model, not a raw column.
# NOTE(review): assumes the residuals of issue_mmtmodel1 were intended.
qqnorm(residuals(issue_mmtmodel1))
hist(octo_data$issue_mmt)
hist(log(octo_data$issue_mmt))
hist(sqrt(octo_data$issue_mmt))
hist(1 / octo_data$issue_mmt)

# Below are the models for the octo data; there should be analysis for each.
octo_mmtmodel1 <- lm(
  underproduction_mean ~ mmt + new.age.factor,
  data = octo_data
)
summary(octo_mmtmodel1)

# Right-skewed data, need to transform.
# Install rcompanion only when it is missing, with the package name quoted,
# and before attaching it.
if (!requireNamespace("rcompanion", quietly = TRUE)) {
  install.packages("rcompanion")
}
library(rcompanion)

octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
sqrt_issue_mmtmodel1 <- lm(
  underproduction_mean ~ sqrt_issue_mmt + new.age.factor,
  data = octo_data
)
summary(sqrt_issue_mmtmodel1)
# NOTE(review): issue_mmtmodel1 was fitted earlier, before this file was
# loaded; refit it here if it should be based on the same rows as the models
# in this section.
summary(issue_mmtmodel1)

wiki_mmtmodel1 <- lm(
  underproduction_mean ~ wiki_mmt + new.age.factor,
  data = octo_data
)
summary(wiki_mmtmodel1)
# Histogram trials for the MMT variants. Each plot is stored and printed.

# wiki_mmt: bin-width trials (5, then 0.05).
g3 <- ggplot(data = octo_data, mapping = aes(x = wiki_mmt)) +
  geom_histogram(binwidth = 5)
g3
g3 <- ggplot(data = octo_data, mapping = aes(x = wiki_mmt)) +
  geom_histogram(binwidth = 0.05)
g3

# wiki_mmt with the black-and-white theme, at 0.05 and then 0.01.
g3 <- ggplot(data = octo_data, mapping = aes(x = wiki_mmt)) +
  geom_histogram(binwidth = 0.05) +
  theme_bw()
g3
g3 <- ggplot(data = octo_data, mapping = aes(x = wiki_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g3

# issue_mmt and its square-root transform at the same bin width.
g2 <- ggplot(data = octo_data, mapping = aes(x = issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g2
g1 <- ggplot(data = octo_data, mapping = aes(x = sqrt_issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g1

# Redraw the wiki and issue histograms.
g3 <- ggplot(data = octo_data, mapping = aes(x = wiki_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g3
g2 <- ggplot(data = octo_data, mapping = aes(x = issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g2
# texreg() comes from the texreg package, so it is attached before the first
# call.
# NOTE(review): this replaces source('powerAnalysis.R'); restore that line if
# the script provides anything else this session needs.
library(texreg)

# Regression table for the three octo models. The model and coefficient
# labels describe these models (MMT, issue contributors, wiki contributors)
# rather than the ones the template was copied from. Labels are attached by
# coefficient name via custom.coef.map, so each one lands on its own row
# regardless of the order in which texreg merges the models' terms; the list
# order below is also the row order of the table.
texreg(
  list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("M1: MMT", "M2: issue contrib.", "M3: wiki_contrib."),
  custom.coef.map = list(
    "(Intercept)" = "(Intercept)",
    "mmt" = "MMT",
    "issue_mmt" = "Issues",
    "new.age.factor2" = "Age-2",
    "new.age.factor3" = "Age-3",
    "new.age.factor4" = "Age-4",
    "wiki_mmt" = "Wiki"
  ),
  use.packages = FALSE, table = FALSE, ci.force = TRUE
)