updating R org

2024-04-02 18:49:49 -05:00 · 2024-04-02 18:49:49 -05:00 · 66574803a6
commit 66574803a6
parent 80ff60b755
24 changed files with 262 additions and 255 deletions
--- a/R/.DS_Store
+++ b/R/.DS_Store
--- a/R/.Rhistory
+++ b/R/.Rhistory
@ -1,256 +1,3 @@
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project, y=total_community))+
-xlab("Age of the Project") +
-ylab("Underproduction Factor")
-g4
-g4 <- ggplot(data7, aes(x= (age_of_the_project /12), y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project, y=total_community))+
-xlab("Age of the Project") +
-ylab("Underproduction Factor")
-g4
-g4 <- ggplot(data7, aes(x=age_of_the_project, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project, y=total_community))+
-xlab("Age of the Project") +
-ylab("Underproduction Factor")
-g4
-g4 <- ggplot(data7, aes(x=age_of_the_project, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project, y=total_community))+
-xlab("Age of the Project") +
-ylab("Underproduction Factor")
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/12, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project, y=total_community))+
-xlab("Age of the Proje") +
-ylab("Underproduction Factor")
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/12, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/12, y=total_community))+
-xlab("Age of the Proje") +
-ylab("Underproduction Factor")
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/365, y=total_community))+
-xlab("Age of the Proje") +
-ylab("Underproduction Factor")
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/365, y=total_community))+
-xlab("Age of the Project (years)") +
-ylab("Contributor Community Population")
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/365, y=total_community, color=yellow))+
-xlab("Age of the Project (years)") +
-ylab("Contributor Community Population")
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/365, y=total_community))+
-xlab("Age of the Project (years)") +
-ylab("Contributor Community Population")
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/365, y=total_community))+
-xlab("Age of the Project (years)") +
-ylab("Contributor Community Population") +
-theme_bw()
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/365, y=total_community), color="black")+
-xlab("Age of the Project (years)") +
-ylab("Contributor Community Population") +
-theme_bw()
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/365, y=total_community), color="yellow")+
-xlab("Age of the Project (years)") +
-ylab("Contributor Community Population") +
-theme_bw()
-g4
-g4 <- ggplot(data7, aes(x=age_of_project/365, y=total_community)) +
-geom_point() +
-geom_smooth(mapping = aes(x=age_of_project/365, y=total_community), color="red")+
-xlab("Age of the Project (years)") +
-ylab("Contributor Community Population") +
-theme_bw()
-g4
-library(readr)
-data6 <-read_csv('../kk_final_commentlist.csv', show_col_types=FALSE)
-data6$total_community = data6$contributors + data6$collaborators
-median(data6$total_community)
-cor.test(data6$total_community, data6$age_of_project)
-library(readr)
-library(ggplot2)
-data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE)
-data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE)
-data1$old_milestones <- data1$milestones
-data1$new_milestones <- as.numeric(data1$milestones > 0) + 1
-data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
-data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age)
-data1$new.age.factor <- as.factor(data1$new.age)
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor),
-method='lm', formula= y~x)
-g2
-data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE)
-data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE)
-data2 <- read_csv('../inst_all_packages_full_results.csv')
-#d$nd <- to_logical(d$not.damaging, custom_true=c("Y"))
-#levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors")
-python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)])
-same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)])
-data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE)
-data1$old_milestones <- data1$milestones
-data1$new_milestones <- as.numeric(data1$milestones > 0) + 1
-data1$new.age.factor <- as.factor(data1$new.age)
-data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
-data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age)
-data1$new.age.factor <- as.factor(data1$new.age)
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor),
-method='lm', formula= y~x)
-g2
-data1 <- read_csv('../power_data_111023_mmt.csv',show_col_types = FALSE)
-data2 <- read_csv('../inst_all_packages_full_results.csv')
-#d$nd <- to_logical(d$not.damaging, custom_true=c("Y"))
-#levels(d$source) <- c("IP-based Editors", "New Editors", "Registered Editors", "Tor-based Editors")
-python_labeled <- as.numeric(data2$up.fac.mean[match(paste('python',tolower(data1$pkg), sep = "-"), data2$pkg)])
-same_labeled <- as.numeric(data2$up.fac.mean[match(tolower(data1$pkg), data2$pkg)])
-data1$up.fac.mean <- pmin(python_labeled, same_labeled, na.rm=TRUE)
-data1$old_milestones <- data1$milestones
-data1$new_milestones <- as.numeric(data1$milestones > 0) + 1
-# (2) - Run the model on the pilot data
-data1$formal.score <- data1$mmt / (data1$old_milestones/data1$age)
-data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
-data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age)
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x)
-g2
-data1$new.age <- as.numeric(cut(data1$age/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4)))
-data1$new.formal.score <- data1$mmt / (data1$new_milestones/data1$new.age)
-data1$new.age.factor <- as.factor(data1$new.age)
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x)
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-xlab("MMT") +
-ylab("Underproduction Factor") +
-theme_bw()
-g2
-g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-#geom_smooth( method="lm", formula=(y~x), colour = "orange")+
-geom_abline(intercept=coef(mmtmodel1)[1], slope=coef(mmtmodel1)[2], colour = "orange", size=1)+
-geom_errorbar(aes(ymin=y-yerr, ymax=y+yerr), width=0.09)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-theme_bw()
-g
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-theme_bw()
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
-theme_bw()
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_point() +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
-theme_bw() +
-theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
-theme_bw() +
-theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-scale_colour_manual(values=colors_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
-theme_bw() +
-theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-theme_bw() +
-theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-scale_colour_manual(values=color_legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
-theme_bw() +
-theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-scale_colour_manual( labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
-theme_bw() +
-theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-scale_colour_manual(values=legend, labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
-theme_bw() +
-theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
-method='lm', formula= y~x, se=FALSE)+
-labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
-scale_colour_manual(values=legend.values, labels=c("0-9y", "9-12y", "12-15y","15-16y")) +
-theme_bw() +
-theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
-g2
-g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
-geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
 method='lm', formula= y~x, se=FALSE)+
 labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
 theme_bw() +
@ -510,3 +257,256 @@ hist(contributing_data$new.age)
 119 / 528
 171/ 528
 162 / 528
+octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE)
+rm(list=ls())
+set.seed(424242)
+library(readr)
+library(ggplot2)
+library(tidyverse)
+readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
+octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE)
+# below this is the analysis for the octo data
+octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
+octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE)
+# below this is the analysis for the octo data
+octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
+table(octo_data$new.age)
+octo_data$new.age.factor <- as.factor(octo_data$new.age)
+hist(octo_data$new.age)
+octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
+mean(octo_data$mmt)
+hist(octo_data$mmt, probability = TRUE)
+head(octo_data)
+#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
+#i.e. needs to be a total contrib number that is not attached to the high level counts
+octo_data$issue_mmt <- (((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib))
+hist(octo_data$issue_mmt, probability = TRUE)
+#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
+#i.e. needs to be a total contrib number that is not attached to the high level counts
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt, probability = TRUE)
+max(octo_data$issue_mmt)
+max(octo_data$issue_mmt)
+median(octo_data$issue_mmt)
+median(octo_data$issue_mmt)
+min(octo_data$issue_mmt)
+hist(octo_data$total_contrib)
+mean(octo_data$total_contrib)
+median(octo_data$total_contrib)
+median(octo_data$contributors)
+median(octo_data$collaborators)
+median(octo_data$total_contrib)
+#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
+#i.e. needs to be a total contrib number that is not attached to the high level counts
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt, probability = TRUE)
+hist(octo_data$issue_mmt)
+octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$wiki_mmt)
+min(octo_data$wiki_mmt)
+median(octo_data$wiki_mmt)
+#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
+#i.e. needs to be a total contrib number that is not attached to the high level counts
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt)
+octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
+mean(octo_data$mmt)
+hist(octo_data$mmt)
+median(octo_data$total_contrib)
+#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
+#i.e. needs to be a total contrib number that is not attached to the high level counts
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt)
+max(octo_data$issue_mmt)
+maximum(octo_data$issue_mmt)
+octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$wiki_mmt)
+median(octo_data$wiki_mmt)
+#below are the models for the octo data, there should be analysis for each one
+octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
+summary(octo_mmtmodel1)
+#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
+#i.e. needs to be a total contrib number that is not attached to the high level counts
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt)
+maximum(octo_data$issue_mmt)
+typeof(octo_data$issue_mmt)
+length(octo_data$issue_mmt)
+#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
+#i.e. needs to be a total contrib number that is not attached to the high level counts
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt)
+length(octo_data$issue_mmt)
+sum(octo_data$issue_mmt > 2)
+length(octo_data$issue_mmt > 2)
+length(octo_data$issue_mmt > 2.0)
+median(octo_data$wiki_mmt)
+typeof(octo_data$issue_mmt)
+median(octo_data$issue_mmt, na.rm = TRUE)
+median(octo_data$issue_contrib_count)
+octo_data <- na.omit(octo_data$issue_contrib_count)
+median(octo_data$issue_contrib_count)
+octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE)
+# below this is the analysis for the octo data
+octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
+table(octo_data$new.age)
+octo_data$new.age.factor <- as.factor(octo_data$new.age)
+hist(octo_data$new.age)
+octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
+mean(octo_data$mmt)
+hist(octo_data$mmt)
+head(octo_data)
+median(octo_data$issue_contrib_count)
+octo_data <- na.omit(octo_data)
+median(octo_data$issue_contrib_count)
+#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
+#i.e. needs to be a total contrib number that is not attached to the high level counts
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt)
+median(octo_data$issue_mmt, na.rm = TRUE)
+length(octo_data$issue_mmt > 2.0)
+length(octo_data$issue_mmt > 2.0)
+length(octo_data$issue_mmt > 2)
+median(octo_data$issue_mmt)
+, na.rm = TRUE
+median(octo_data$issue_mmt, na.rm = TRUE)
+length(octo_data$issue_mmt > 2)
+length(octo_data$issue_mmt > 2)
+length(octo_data$issue_mmt > 2.0)
+max(octo_data$issue_mmt, na.rm = TRUE)
+octo_data$new_mmt <- (((octo_data$collaborators * 2)+ (octo_data$total_contrib - octo_data$collaborators)) / (octo_data$total_contrib))
+hist(octo_data$new_mmt)
+octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
+mean(octo_data$mmt)
+hist(octo_data$mmt)
+#TODO: there's an issue with calculating this but somehow not an issue with the wiki one
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt)
+octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$wiki_mmt)
+hist(octo_data$issue_mmt)
+length(octo_data$issue_mmt > 2.0)
+octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib)]
+octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]
+octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]$issue_contrib_count
+octo_data <- read_csv('../new_denom_032624_stripped.csv', show_col_types = FALSE)
+# below this is the analysis for the octo data
+octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
+table(octo_data$new.age)
+octo_data$new.age.factor <- as.factor(octo_data$new.age)
+hist(octo_data$new.age)
+octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
+mean(octo_data$mmt)
+hist(octo_data$mmt)
+head(octo_data)
+octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]$issue_contrib_count
+octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]
+octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]
+octo_data <- read_csv('../new_denom_032624_stripped.csv', show_col_types = FALSE)
+# below this is the analysis for the octo data
+octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
+table(octo_data$new.age)
+octo_data$new.age.factor <- as.factor(octo_data$new.age)
+hist(octo_data$new.age)
+octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
+mean(octo_data$mmt)
+hist(octo_data$mmt)
+head(octo_data)
+octo_data <- octo_data[which(octo_data$issue_contrib_count <= octo_data$total_contrib),]
+#TODO: there's an issue with calculating this but somehow not an issue with the wiki one
+octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$issue_mmt)
+max(octo_data$issue_mmt, na.rm = TRUE)
+length(octo_data$issue_mmt > 2.0)
+issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data=octo_data)
+summary(issue_mmtmodel1)
+wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data)
+summary(wiki_mmtmodel1)
+write.csv(octo_data, "new_octo.csv", row.names = FALSE)
+octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
+qqnorm(octo_data$issue_mmt)
+octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$wiki_mmt)
+median(octo_data$wiki_mmt)
+qqnorm(octo_data$wiki_mmt)
+qqnorm(octo_data$issue_mmt)
+qqnorm(octo_data$wiki_mmt)
+qqnorm(log(octo_data$issue_mmt))
+qqnorm(octo_data$issue_mmt)
+qqnorm(log(octo_data$issue_mmt))
+qqnorm(octo_data$issue_mmt)
+qqnorm(log(octo_data$issue_mmt))
+qqnorm(residuals(octo_data$issue_mmt))
+qqnorm(octo_data$issue_mmt)
+qqnorm(log(octo_data$issue_mmt))
+qqnorm(octo_data$issue_mmt)
+hist(log(octo_data$issue_mmt))
+hist(sqrt(octo_data$issue_mmt))
+#below are the models for the octo data, there should be analysis for each one
+octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
+summary(octo_mmtmodel1)
+#below are the models for the octo data, there should be analysis for each one
+octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
+summary(octo_mmtmodel1)
+# below this is the analysis for the octo data
+octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
+table(octo_data$new.age)
+octo_data$new.age.factor <- as.factor(octo_data$new.age)
+hist(octo_data$new.age)
+#below are the models for the octo data, there should be analysis for each one
+octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
+summary(octo_mmtmodel1)
+hist(sqrt(octo_data$issue_mmt))
+hist(sqrt(octo_data$issue_mmt))
+hist(octo_data$issue_mmt)
+#right skewed data, need to transform
+library(rcompanion)
+install.packages(rcompanion)
+hist(sqrt(octo_data$issue_mmt))
+qqnorm(1/octo_data$issue_mmt)
+hist(1/octo_data$issue_mmt)
+hist(log(octo_data$issue_mmt))
+hist(sqrt(octo_data$issue_mmt))
+hist(log(octo_data$issue_mmt))
+octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
+sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + new.age.factor, data=octo_data)
+summary(sqrt_issue_mmtmodel1)
+summary(issue_mmtmodel1)
+octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
+hist(octo_data$wiki_mmt)
+wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data)
+summary(wiki_mmtmodel1)
+g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 5)
+g3
+g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.05)
+g3
+g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.05) + theme_bw()
+g3
+g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
+g3
+g2 <- ggplot(octo_data, aes(issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
+g2
+g1 <- ggplot(octo_data, aes(sqrt_issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
+g1
+g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
+g3
+g2 <- ggplot(octo_data, aes(issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
+g2
+texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
+custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones'  ),
+custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'),
+use.packages=FALSE, table=FALSE, ci.force = TRUE)
+source('powerAnalysis.R') #my little "lib"
+texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
+custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones'  ),
+custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'),
+use.packages=FALSE, table=FALSE, ci.force = TRUE)
+library(texreg) #my little "lib"
+texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
+custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones'  ),
+custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'),
+use.packages=FALSE, table=FALSE, ci.force = TRUE)
+texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
+custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.'  ),
+custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
+use.packages=FALSE, table=FALSE, ci.force = TRUE)
--- a/R/GovRiskPower.R
+++ b/R/GovRiskPower.R
@ -44,14 +44,14 @@ octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.5
 table(octo_data$new.age)
 octo_data$new.age.factor <- as.factor(octo_data$new.age)
 hist(octo_data$new.age)
+length(which(octo_data$underproduction_low < 0))
+median(octo_data$underproduction_mean)

 octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
 mean(octo_data$mmt)
 hist(octo_data$mmt)
 head(octo_data)

-octo_data <- octo_data[which(octo_data$issue_contrib_count <= octo_data$total_contrib),]
-write.csv(octo_data, "new_octo.csv", row.names = FALSE)

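 #TODO: issue_mmt reduces to 1 + issue_contrib_count/total_contrib, so rows with issue_contrib_count > total_contrib push it above 2; the wiki one has not shown this problem -- confirm the input data excludes such rows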
 octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
@ -70,6 +70,8 @@ median(octo_data$wiki_mmt)
 qqnorm(octo_data$wiki_mmt)
 #left skewed data, need to transform

+g4 <- ggplot(octo_data)
+g4

 #below are the models for the octo data, there should be analysis for each one
 octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
@ -84,6 +86,7 @@ summary(sqrt_issue_mmtmodel1)
 wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data)
 summary(wiki_mmtmodel1)

+
 library(texreg) # provides texreg(), used below to print the regression table

 texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
--- a/R/didAnalyses.R
+++ b/R/didAnalyses.R
@ -0,0 +1,4 @@
+library(tidyverse)
+
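+# Set the working directory to this script's folder (uses rstudioapi; try() lets the script continue if that fails)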
+try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
--- a/R/plots/0119-final-mmt.png
+++ b/R/plots/0119-final-mmt.png
--- a/R/plots/630_0119_final.png
+++ b/R/plots/630_0119_final.png
--- a/R/plots/cs497_final_plot.png
+++ b/R/plots/cs497_final_plot.png
--- a/R/plots/data_subset_agegroup.png
+++ b/R/plots/data_subset_agegroup.png
--- a/R/plots/final-mmt-plot.png
+++ b/R/plots/final-mmt-plot.png
--- a/R/plots/final-mmt-underprod-final-last.png
+++ b/R/plots/final-mmt-underprod-final-last.png
--- a/R/plots/formal-underprod-ggplot.png
+++ b/R/plots/formal-underprod-ggplot.png
--- a/R/plots/mess-mmt-ggplot.png
+++ b/R/plots/mess-mmt-ggplot.png
--- a/R/plots/mmt-underprod-ggplot.png
+++ b/R/plots/mmt-underprod-ggplot.png
--- a/R/plots/new_mmt_underprod_plot.png
+++ b/R/plots/new_mmt_underprod_plot.png
--- a/R/plots/newmmt-underprod-plot.png
+++ b/R/plots/newmmt-underprod-plot.png
--- a/R/plots/saner24_presentation.png
+++ b/R/plots/saner24_presentation.png
--- a/R/plots/temp-mmt-colors.png
+++ b/R/plots/temp-mmt-colors.png
--- a/R/plots/temp-temp.png
+++ b/R/plots/temp-temp.png
--- a/R/stale_scripts/EDA.R
+++ b/R/stale_scripts/EDA.R
--- a/R/stale_scripts/calculatePower.R
+++ b/R/stale_scripts/calculatePower.R
--- a/R/stale_scripts/justPowerSims.R
+++ b/R/stale_scripts/justPowerSims.R
--- a/R/stale_scripts/plotting_age.R
+++ b/R/stale_scripts/plotting_age.R
--- a/R/stale_scripts/powerAnalysis.R
+++ b/R/stale_scripts/powerAnalysis.R
--- a/R/stale_scripts/powerAnalysis.orig.R
+++ b/R/stale_scripts/powerAnalysis.orig.R