diff --git a/R/.DS_Store b/R/.DS_Store index 5008ddf..b821459 100644 Binary files a/R/.DS_Store and b/R/.DS_Store differ diff --git a/R/.Rhistory b/R/.Rhistory index aa3d73e..4db6dfa 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,256 +1,3 @@ -geom_point() + -geom_smooth(mapping = aes(x=age_of_project, y=total_community))+ -xlab("Age of the Project") + -ylab("Underproduction Factor") -g4 -g4 <- ggplot(data7, aes(x= (age_of_the_project /12), y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project, y=total_community))+ -xlab("Age of the Project") + -ylab("Underproduction Factor") -g4 -g4 <- ggplot(data7, aes(x=age_of_the_project, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project, y=total_community))+ -xlab("Age of the Project") + -ylab("Underproduction Factor") -g4 -g4 <- ggplot(data7, aes(x=age_of_the_project, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project, y=total_community))+ -xlab("Age of the Project") + -ylab("Underproduction Factor") -g4 -g4 <- ggplot(data7, aes(x=age_of_project/12, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project, y=total_community))+ -xlab("Age of the Proje") + -ylab("Underproduction Factor") -g4 -g4 <- ggplot(data7, aes(x=age_of_project/12, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/12, y=total_community))+ -xlab("Age of the Proje") + -ylab("Underproduction Factor") -g4 -g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/365, y=total_community))+ -xlab("Age of the Proje") + -ylab("Underproduction Factor") -g4 -g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/365, y=total_community))+ -xlab("Age of the Project (years)") + -ylab("Contributor Community Population") -g4 -g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/365, y=total_community, color=yellow))+ -xlab("Age of the Project (years)") + -ylab("Contributor Community Population") -g4 -g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/365, y=total_community))+ -xlab("Age of the Project (years)") + -ylab("Contributor Community Population") -g4 -g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/365, y=total_community))+ -xlab("Age of the Project (years)") + -ylab("Contributor Community Population") + -theme_bw() -g4 -g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/365, y=total_community), color="black")+ -xlab("Age of the Project (years)") + -ylab("Contributor Community Population") + -theme_bw() -g4 -g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/365, y=total_community), color="yellow")+ -xlab("Age of the Project (years)") + -ylab("Contributor Community Population") + -theme_bw() -g4 -g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) + -geom_point() + -geom_smooth(mapping = aes(x=age_of_project/365, y=total_community), color="red")+ -xlab("Age of the Project (years)") + -ylab("Contributor Community Population") + -theme_bw() -g4 -library(readr) -data6 <-read_csv('../kk_final_commentlist.csv', show_col_types=FALSE) -data6$total_community = data6$contributors + data6$collaborators -median(data6$total_community) -cor.test(data6$total_community, data6$age_of_project) -library(readr) -library(ggplot2) -data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE) -data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE) -data1$old_milestones <- data1$milestones -data1$new_milestones <- as.numeric(data1$milestones > 0) + 1 -data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) -data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age) -data1$new.age.factor <- as.factor(data1$new.age) -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), -method='lm', formula= y~x) -g2 -data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE) -data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE) -data2 <- read_csv('../inst_all_packages_full_results.csv') -#d$nd <- to_logical(d$not.damaging, custom_true=c("Y")) -#levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors") -python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)]) -same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)]) -data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE) -data1$old_milestones <- data1$milestones -data1$new_milestones <- as.numeric(data1$milestones > 0) + 1 -data1$new.age.factor <- as.factor(data1$new.age) -data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) -data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age) -data1$new.age.factor <- as.factor(data1$new.age) -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), -method='lm', formula= y~x) -g2 -data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE) -data2 <- read_csv('../inst_all_packages_full_results.csv') -#d$nd <- to_logical(d$not.damaging, custom_true=c("Y")) -#levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors") -python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)]) -same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)]) -data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE) -data1$old_milestones <- data1$milestones -data1$new_milestones <- as.numeric(data1$milestones > 0) + 1 -# (2) - Run the model on the pilot data -data1$formal.score <- data1$mmt / (data1$old_milestones/data1$age) -data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) -data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age) -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x) -g2 -data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) -data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age) -data1$new.age.factor <- as.factor(data1$new.age) -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() -g2 -g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -#geom_smooth( method="lm", formula=(y~x), colour = "orange")+ -geom_abline(intercept=coef(mmtmodel1)[1], slope=coef(mmtmodel1)[2], colour = "orange", size=1)+ -geom_errorbar(aes(ymin=y-yerr, ymax=y+yerr), width=0.09)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -theme_bw() -g -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -theme_bw() -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) + -theme_bw() -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_point() + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -scale_colour_manual(values=color_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -scale_colour_manual( labels=c("0-9y", "9-12y", "12-15y","15-16y")) + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -scale_colour_manual(values=legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -scale_colour_manual(values=legend.values, labels=c("0-9y", "9-12y", "12-15y","15-16y")) + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), method='lm', formula= y~x, se=FALSE)+ labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + theme_bw() + @@ -510,3 +257,256 @@ hist(contributing_data$new.age) 119 / 528 171/ 528 162 / 528 +octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE) +rm(list=ls()) +set.seed(424242) +library(readr) +library(ggplot2) +library(tidyverse) +readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE) +octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE) +# below this is the analysis for the octo data +octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) +octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE) +# below this is the analysis for the octo data +octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) +table(octo_data$new.age) +octo_data$new.age.factor <- as.factor(octo_data$new.age) +hist(octo_data$new.age) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +mean(octo_data$mmt) +hist(octo_data$mmt, probability = TRUE) +head(octo_data) +#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts +#i.e. needs to be a total contrib number that is not attached to the high level counts +octo_data$issue_mmt <- (((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)) +hist(octo_data$issue_mmt, probability = TRUE) +#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts +#i.e. needs to be a total contrib number that is not attached to the high level counts +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt, probability = TRUE) +max(octo_data$issue_mmt) +max(octo_data$issue_mmt) +median(octo_data$issue_mmt) +median(octo_data$issue_mmt) +min(octo_data$issue_mmt) +hist(octo_data$total_contrib) +mean(octo_data$total_contrib) +median(octo_data$total_contrib) +median(octo_data$contributors) +median(octo_data$collaborators) +median(octo_data$total_contrib) +#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts +#i.e. needs to be a total contrib number that is not attached to the high level counts +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt, probability = TRUE) +hist(octo_data$issue_mmt) +octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$wiki_mmt) +min(octo_data$wiki_mmt) +median(octo_data$wiki_mmt) +#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts +#i.e. needs to be a total contrib number that is not attached to the high level counts +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +mean(octo_data$mmt) +hist(octo_data$mmt) +median(octo_data$total_contrib) +#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts +#i.e. needs to be a total contrib number that is not attached to the high level counts +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt) +max(octo_data$issue_mmt) +maximum(octo_data$issue_mmt) +octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$wiki_mmt) +median(octo_data$wiki_mmt) +#below are the models for the octo data, there should be analysis for each one +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data) +summary(octo_mmtmodel1) +#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts +#i.e. needs to be a total contrib number that is not attached to the high level counts +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt) +maximum(octo_data$issue_mmt) +typeof(octo_data$issue_mmt) +length(octo_data$issue_mmt) +#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts +#i.e. needs to be a total contrib number that is not attached to the high level counts +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt) +length(octo_data$issue_mmt) +sum(octo_data$issue_mmt > 2) +length(octo_data$issue_mmt > 2) +length(octo_data$issue_mmt > 2.0) +median(octo_data$wiki_mmt) +typeof(octo_data$issue_mmt) +median(octo_data$issue_mmt, na.rm = TRUE) +median(octo_data$issue_contrib_count) +octo_data <- na.omit(octo_data$issue_contrib_count) +median(octo_data$issue_contrib_count) +octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE) +# below this is the analysis for the octo data +octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) +table(octo_data$new.age) +octo_data$new.age.factor <- as.factor(octo_data$new.age) +hist(octo_data$new.age) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +mean(octo_data$mmt) +hist(octo_data$mmt) +head(octo_data) +median(octo_data$issue_contrib_count) +octo_data <- na.omit(octo_data) +median(octo_data$issue_contrib_count) +#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts +#i.e. needs to be a total contrib number that is not attached to the high level counts +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt) +median(octo_data$issue_mmt, na.rm = TRUE) +length(octo_data$issue_mmt > 2.0) +length(octo_data$issue_mmt > 2.0) +length(octo_data$issue_mmt > 2) +median(octo_data$issue_mmt) +, na.rm = TRUE +median(octo_data$issue_mmt, na.rm = TRUE) +length(octo_data$issue_mmt > 2) +length(octo_data$issue_mmt > 2) +length(octo_data$issue_mmt > 2.0) +max(octo_data$issue_mmt, na.rm = TRUE) +octo_data$new_mmt <- (((octo_data$collaborators * 2)+ (octo_data$total_contrib - octo_data$collaborators)) / (octo_data$total_contrib)) +hist(octo_data$new_mmt) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +mean(octo_data$mmt) +hist(octo_data$mmt) +#TODO: there's an issue with calculating this but somehow not an issue with the wiki one +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt) +octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$wiki_mmt) +hist(octo_data$issue_mmt) +length(octo_data$issue_mmt > 2.0) +octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib)] +octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),] +octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]$issue_contrib_count +octo_data <- read_csv('../new_denom_032624_stripped.csv', show_col_types = FALSE) +# below this is the analysis for the octo data +octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) +table(octo_data$new.age) +octo_data$new.age.factor <- as.factor(octo_data$new.age) +hist(octo_data$new.age) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +mean(octo_data$mmt) +hist(octo_data$mmt) +head(octo_data) +octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]$issue_contrib_count +octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),] +octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),] +octo_data <- read_csv('../new_denom_032624_stripped.csv', show_col_types = FALSE) +# below this is the analysis for the octo data +octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) +table(octo_data$new.age) +octo_data$new.age.factor <- as.factor(octo_data$new.age) +hist(octo_data$new.age) +octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) +mean(octo_data$mmt) +hist(octo_data$mmt) +head(octo_data) +octo_data <- octo_data[which(octo_data$issue_contrib_count <= octo_data$total_contrib),] +#TODO: there's an issue with calculating this but somehow not an issue with the wiki one +octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$issue_mmt) +max(octo_data$issue_mmt, na.rm = TRUE) +length(octo_data$issue_mmt > 2.0) +issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data=octo_data) +summary(issue_mmtmodel1) +wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data) +summary(wiki_mmtmodel1) +write.csv(octo_data, "new_octo.csv", row.names = FALSE) +octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) +qqnorm(octo_data$issue_mmt) +octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$wiki_mmt) +median(octo_data$wiki_mmt) +qqnorm(octo_data$wiki_mmt) +qqnorm(octo_data$issue_mmt) +qqnorm(octo_data$wiki_mmt) +qqnorm(log(octo_data$issue_mmt)) +qqnorm(octo_data$issue_mmt) +qqnorm(log(octo_data$issue_mmt)) +qqnorm(octo_data$issue_mmt) +qqnorm(log(octo_data$issue_mmt)) +qqnorm(residuals(octo_data$issue_mmt)) +qqnorm(octo_data$issue_mmt) +qqnorm(log(octo_data$issue_mmt)) +qqnorm(octo_data$issue_mmt) +hist(log(octo_data$issue_mmt)) +hist(sqrt(octo_data$issue_mmt)) +#below are the models for the octo data, there should be analysis for each one +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data) +summary(octo_mmtmodel1) +#below are the models for the octo data, there should be analysis for each one +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data) +summary(octo_mmtmodel1) +# below this is the analysis for the octo data +octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) +table(octo_data$new.age) +octo_data$new.age.factor <- as.factor(octo_data$new.age) +hist(octo_data$new.age) +#below are the models for the octo data, there should be analysis for each one +octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data) +summary(octo_mmtmodel1) +hist(sqrt(octo_data$issue_mmt)) +hist(sqrt(octo_data$issue_mmt)) +hist(octo_data$issue_mmt) +#right skewed data, need to transform +library(rcompanion) +install.packages(rcompanion) +hist(sqrt(octo_data$issue_mmt)) +qqnorm(1/octo_data$issue_mmt) +hist(1/octo_data$issue_mmt) +hist(log(octo_data$issue_mmt)) +hist(sqrt(octo_data$issue_mmt)) +hist(log(octo_data$issue_mmt)) +octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt) +sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + new.age.factor, data=octo_data) +summary(sqrt_issue_mmtmodel1) +summary(issue_mmtmodel1) +octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) +hist(octo_data$wiki_mmt) +wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data) +summary(wiki_mmtmodel1) +g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 5) +g3 +g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.05) +g3 +g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.05) + theme_bw() +g3 +g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw() +g3 +g2 <- ggplot(octo_data, aes(issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw() +g2 +g1 <- ggplot(octo_data, aes(sqrt_issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw() +g1 +g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw() +g3 +g2 <- ggplot(octo_data, aes(issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw() +g2 +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +source('powerAnalysis.R') #my little "lib" +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +library(texreg) #my little "lib" +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), +custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, +custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ), +custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'), +use.packages=FALSE, table=FALSE, ci.force = TRUE) diff --git a/R/GovRiskPower.R b/R/GovRiskPower.R index 548a99c..2a9982b 100644 --- a/R/GovRiskPower.R +++ b/R/GovRiskPower.R @@ -44,14 +44,14 @@ octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.5 table(octo_data$new.age) octo_data$new.age.factor <- as.factor(octo_data$new.age) hist(octo_data$new.age) +length(which(octo_data$underproduction_low < 0)) +median(octo_data$underproduction_mean) octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) mean(octo_data$mmt) hist(octo_data$mmt) head(octo_data) -octo_data <- octo_data[which(octo_data$issue_contrib_count <= octo_data$total_contrib),] -write.csv(octo_data, "new_octo.csv", row.names = FALSE) #TODO: there's an issue with calculating this but somehow not an issue with the wiki one octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) @@ -70,6 +70,8 @@ median(octo_data$wiki_mmt) qqnorm(octo_data$wiki_mmt) #left skewed data, need to transform +g4 <- ggplot(octo_data) +g4 #below are the models for the octo data, there should be analysis for each one octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data) @@ -84,6 +86,7 @@ summary(sqrt_issue_mmtmodel1) wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data) summary(wiki_mmtmodel1) + library(texreg) #my little "lib" texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, diff --git a/R/didAnalyses.R b/R/didAnalyses.R new file mode 100644 index 0000000..fa41543 --- /dev/null +++ b/R/didAnalyses.R @@ -0,0 +1,4 @@ +library(tidyverse) + +#set wd +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) diff --git a/R/0119-final-mmt.png b/R/plots/0119-final-mmt.png similarity index 100% rename from R/0119-final-mmt.png rename to R/plots/0119-final-mmt.png diff --git a/R/630_0119_final.png b/R/plots/630_0119_final.png similarity index 100% rename from R/630_0119_final.png rename to R/plots/630_0119_final.png diff --git a/R/cs497_final_plot.png b/R/plots/cs497_final_plot.png similarity index 100% rename from R/cs497_final_plot.png rename to R/plots/cs497_final_plot.png diff --git a/R/data_subset_agegroup.png b/R/plots/data_subset_agegroup.png similarity index 100% rename from R/data_subset_agegroup.png rename to R/plots/data_subset_agegroup.png diff --git a/R/final-mmt-plot.png b/R/plots/final-mmt-plot.png similarity index 100% rename from R/final-mmt-plot.png rename to R/plots/final-mmt-plot.png diff --git a/R/final-mmt-underprod-final-last.png b/R/plots/final-mmt-underprod-final-last.png similarity index 100% rename from R/final-mmt-underprod-final-last.png rename to R/plots/final-mmt-underprod-final-last.png diff --git a/R/formal-underprod-ggplot.png b/R/plots/formal-underprod-ggplot.png similarity index 100% rename from R/formal-underprod-ggplot.png rename to R/plots/formal-underprod-ggplot.png diff --git a/R/mess-mmt-ggplot.png b/R/plots/mess-mmt-ggplot.png similarity index 100% rename from R/mess-mmt-ggplot.png rename to R/plots/mess-mmt-ggplot.png diff --git a/R/mmt-underprod-ggplot.png b/R/plots/mmt-underprod-ggplot.png similarity index 100% rename from R/mmt-underprod-ggplot.png rename to R/plots/mmt-underprod-ggplot.png diff --git a/R/new_mmt_underprod_plot.png b/R/plots/new_mmt_underprod_plot.png similarity index 100% rename from R/new_mmt_underprod_plot.png rename to R/plots/new_mmt_underprod_plot.png diff --git a/R/newmmt-underprod-plot.png b/R/plots/newmmt-underprod-plot.png similarity index 100% rename from R/newmmt-underprod-plot.png rename to R/plots/newmmt-underprod-plot.png diff --git a/R/saner24_presentation.png b/R/plots/saner24_presentation.png similarity index 100% rename from R/saner24_presentation.png rename to R/plots/saner24_presentation.png diff --git a/R/temp-mmt-colors.png b/R/plots/temp-mmt-colors.png similarity index 100% rename from R/temp-mmt-colors.png rename to R/plots/temp-mmt-colors.png diff --git a/R/temp-temp.png b/R/plots/temp-temp.png similarity index 100% rename from R/temp-temp.png rename to R/plots/temp-temp.png diff --git a/R/EDA.R b/R/stale_scripts/EDA.R similarity index 100% rename from R/EDA.R rename to R/stale_scripts/EDA.R diff --git a/R/calculatePower.R b/R/stale_scripts/calculatePower.R similarity index 100% rename from R/calculatePower.R rename to R/stale_scripts/calculatePower.R diff --git a/R/justPowerSims.R b/R/stale_scripts/justPowerSims.R similarity index 100% rename from R/justPowerSims.R rename to R/stale_scripts/justPowerSims.R diff --git a/R/plotting_age.R b/R/stale_scripts/plotting_age.R similarity index 100% rename from R/plotting_age.R rename to R/stale_scripts/plotting_age.R diff --git a/R/powerAnalysis.R b/R/stale_scripts/powerAnalysis.R similarity index 100% rename from R/powerAnalysis.R rename to R/stale_scripts/powerAnalysis.R diff --git a/R/powerAnalysis.orig.R b/R/stale_scripts/powerAnalysis.orig.R similarity index 100% rename from R/powerAnalysis.orig.R rename to R/stale_scripts/powerAnalysis.orig.R