# Clear every object from the current workspace before the analysis starts.
rm(list = ls())
# Fix the random number generator seed.
set.seed(424242)

library(readr)
library(ggplot2)
library(tidyverse)

# Primary analysis for cross-sectional community metrics.
# Load the four input tables; show_col_types = FALSE silences readr's
# column-specification message.
overall_data <- read_csv(
  file = "../final_data/deb_full_data.csv",
  show_col_types = FALSE
)
octo_data <- read_csv(
  file = "../final_data/deb_octo_data.csv",
  show_col_types = FALSE
)
readme_data <- read_csv(
  file = "../final_data/deb_readme_roster.csv",
  show_col_types = FALSE
)
contributing_data <- read_csv(
  file = "../final_data/deb_contribfile_roster.csv",
  show_col_types = FALSE
)

# MMT for the full dataset: collaborators are weighted 2 and contributors 1,
# and the weighted sum is divided by the combined count of both.
overall_data$mmt <- with(
  overall_data,
  (2 * collaborators + contributors) / (contributors + collaborators)
)
mean(overall_data$mmt)
# Density-scaled histogram of the MMT values.
hist(overall_data$mmt, freq = FALSE)

# Bin age_of_project / 365 into four ordered groups coded 1-4; values outside
# (0, 17] become NA.
# NOTE(review): age_of_project is presumably in days, and the interior break
# points presumably come from the quantile call kept below -- confirm.
#age_vector <- overall_data$age_of_project/365
#quantile(age_vector)
overall_data$new.age <- as.numeric(
  cut(
    overall_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = FALSE
  )
)
table(overall_data$new.age)
# Factor copy of the group code, used as a categorical term in the models.
overall_data$new.age.factor <- factor(overall_data$new.age)
hist(overall_data$new.age)

# Linear model on the full dataset: underproduction_mean regressed on MMT,
# with the age group entered as a categorical term.
mmtmodel1 <- lm(
  formula = underproduction_mean ~ mmt + new.age.factor,
  data = overall_data
)
summary(mmtmodel1)

# Plot intended to show the cross-age downward slopes of the underproduction
# averages against MMT: one smoothed curve per age group, drawn without
# confidence bands, with the legend anchored inside the top-right corner.
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor), se = FALSE) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.9, 0.9),
    legend.justification = c("right", "top")
  )
g4

# Regression table for the overall-dataset model.
# Only mmtmodel1 is tabulated here: issue_mmtmodel1 and wiki_mmtmodel1 are not
# fitted until later in the script (on octo_data), and the single model name
# and the five coefficient labels below describe mmtmodel1 alone.
# texreg is called through its namespace because the package has not been
# attached yet at this point in the script.
texreg::texreg(list(mmtmodel1), stars = NULL, digits = 2,
  custom.model.names = c("MMT (Overall Dataset)"),
  custom.coef.names = c("(Intercept)", "MMT", "Age-2", "Age-3", "Age-4"),
  use.packages = FALSE, table = TRUE, ci.force = TRUE)

# Analysis of the octo data starts here.
# Bin age_of_project / 365 into four ordered groups coded 1-4, using the same
# break points as the overall dataset; values outside (0, 17] become NA.
octo_data$new.age <- as.numeric(
  cut(
    octo_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = FALSE
  )
)
table(octo_data$new.age)
# Factor copy of the group code, used as a categorical term in the models.
octo_data$new.age.factor <- factor(octo_data$new.age)
hist(octo_data$new.age)

# MMT for the octo data: collaborators are weighted 2 and contributors 1,
# and the weighted sum is divided by the combined count of both.
octo_data$mmt <- with(
  octo_data,
  (2 * collaborators + contributors) / (contributors + collaborators)
)
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)

# Keep only rows whose issue contributor count does not exceed the total
# contributor count (which() also drops rows where the comparison is NA),
# then save the filtered table to disk.
octo_data <- octo_data[
  with(octo_data, which(issue_contrib_count <= total_contrib)),
]
write.csv(octo_data, file = "new_octo.csv", row.names = FALSE)

#TODO: there's an issue with calculating this but somehow not an issue with the wiki one
# NOTE(review): rows with total_contrib == 0 would give NaN here; possibly
# related to the TODO above -- confirm against the data.
# Issue-based MMT: issue contributors are weighted 2, all other contributors
# 1, divided by the total contributor count.
octo_data$issue_mmt <- with(
  octo_data,
  (2 * issue_contrib_count + (total_contrib - issue_contrib_count)) /
    total_contrib
)
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
# Histograms of the raw and the square-root-transformed measure.
g2 <- ggplot(octo_data, aes(x = issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g2
g1 <- ggplot(octo_data, aes(x = sqrt_issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g1
# Right-skewed data, need to transform.

# Wiki-based MMT: wiki contributors are weighted 2, all other contributors 1,
# divided by the total contributor count.
octo_data$wiki_mmt <- with(
  octo_data,
  (2 * wiki_contrib_count + (total_contrib - wiki_contrib_count)) /
    total_contrib
)
# Distribution checks: base histogram, ggplot histogram, median, normal Q-Q.
hist(octo_data$wiki_mmt)
g3 <- ggplot(octo_data, aes(x = wiki_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g3
median(octo_data$wiki_mmt)
qqnorm(octo_data$wiki_mmt)
# Left-skewed data, need to transform.


# Models for the octo data; there should be analysis for each one.
# Every model regresses underproduction_mean on one MMT variant plus the
# categorical age group.

# Collaborator/contributor MMT.
octo_mmtmodel1 <- lm(
  formula = underproduction_mean ~ mmt + new.age.factor,
  data = octo_data
)
summary(octo_mmtmodel1)

# Issue-based MMT.
issue_mmtmodel1 <- lm(
  formula = underproduction_mean ~ issue_mmt + new.age.factor,
  data = octo_data
)
summary(issue_mmtmodel1)

# Square root of the issue-based MMT.
sqrt_issue_mmtmodel1 <- lm(
  formula = underproduction_mean ~ sqrt_issue_mmt + new.age.factor,
  data = octo_data
)
summary(sqrt_issue_mmtmodel1)

# Wiki-based MMT.
wiki_mmtmodel1 <- lm(
  formula = underproduction_mean ~ wiki_mmt + new.age.factor,
  data = octo_data
)
summary(wiki_mmtmodel1)

library(texreg) #my little "lib"

# Regression table comparing the three octo-data models.
# Labels are attached by coefficient name through custom.coef.map rather than
# by position, so each label is guaranteed to sit on the coefficient it
# describes regardless of the order in which texreg merges terms across
# models. The map also fixes the row order shown in the table.
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1),
  stars = NULL, digits = 2,
  custom.model.names = c("M1: MMT", "M2: issue contrib.", "M3: wiki_contrib."),
  custom.coef.map = list(
    "(Intercept)" = "(Intercept)",
    "mmt" = "MMT",
    "issue_mmt" = "Issues",
    "new.age.factor2" = "Age-2",
    "new.age.factor3" = "Age-3",
    "new.age.factor4" = "Age-4",
    "wiki_mmt" = "Wiki"
  ),
  use.packages = FALSE, table = FALSE, ci.force = TRUE)

# Analysis of the readme.md data starts here.
# Bin age_of_project / 365 into four ordered groups coded 1-4, using the same
# break points as the other datasets; values outside (0, 17] become NA.
readme_data$new.age <- as.numeric(
  cut(
    readme_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = FALSE
  )
)
table(readme_data$new.age)
# Factor copy of the group code.
readme_data$new.age.factor <- factor(readme_data$new.age)
hist(readme_data$new.age)


# Analysis of the contributing.md files starts here.
# Bin age_of_project / 365 into four ordered groups coded 1-4, using the same
# break points as the other datasets; values outside (0, 17] become NA.
contributing_data$new.age <- as.numeric(
  cut(
    contributing_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = FALSE
  )
)
table(contributing_data$new.age)
# Factor copy of the group code.
contributing_data$new.age.factor <- factor(contributing_data$new.age)
hist(contributing_data$new.age)