# Descriptive statistics, age-group breakdowns, regressions and plots for the
# kk_final_* project datasets (data1..data7 below).
#
# NOTE: the original script started with rm(list = ls()); that call was dropped
# because every object used here is created below, and wiping the caller's
# workspace is a side effect a script should not have.
set.seed(424242)

library(readr)
library(ggplot2)
library(tidyverse)

# Load datasets ----
data1 <- read_csv("../kk_final_expanded_data_final.csv", show_col_types = FALSE)
data2 <- read_csv("../kk_final_octo_data_total.csv", show_col_types = FALSE)
data3 <- read_csv("../kk_final_doclist_roster.csv", show_col_types = FALSE)
data4 <- read_csv("../kk_final_rosterslist.csv", show_col_types = FALSE)
data5 <- read_csv("../final_data/kk_final_readme_roster.csv", show_col_types = FALSE)
data6 <- read_csv("../kk_final_commentlist.csv", show_col_types = FALSE)
data7 <- read_csv("../final_data/kk_final_octo.csv", show_col_types = FALSE)

# Getting data subset metadata ----
head(data1)
head(data2)
head(data3)
head(data4)

length(which(data2$underproduction_low < 0))
mean(data2$underproduction_mean)

length(which(data1$underproduction_low < 0))
median(data1$underproduction_mean)

# The count on the next line originally read data5 while the median beside it
# read data7; data5 has its own complete summary further down, so the count is
# taken from data7 to keep the pair on one dataset.
length(which(data7$underproduction_low < 0))
median(data7$underproduction_mean)

length(which(data3$underproduction_low < 0))
median(data3$underproduction_mean)
median(data3$age_of_project / 365)

length(which(data5$underproduction_low < 0))
median(data5$underproduction_mean)
median(data5$age_of_project / 365)

length(which(data6$underproduction_low < 0))
median(data6$underproduction_mean)
median(data6$age_of_project / 365)
median(data6$contributors)

length(which(data4$underproduction_low < 0))
mean(data4$underproduction_mean)

# MMT ----
# Algebraically this is collaborators / (contributors + collaborators); the
# original expression is kept as written. Rows where both counts are 0 yield NaN.
data1$mmt <- (((data1$collaborators * 2) + data1$contributors) /
  (data1$contributors + data1$collaborators)) - 1
mean(data1$mmt)
hist(data1$mmt, probability = TRUE)

# Age groups ----
# Adds `new.age` (numeric bin 1..4) and `new.age.factor` to a dataset.
# age_of_project / 365 is cut at 0, 9, 12, 15 and 17; values outside (0, 17]
# become NA. Returns the modified data frame.
add_age_group <- function(df) {
  df$new.age <- as.numeric(cut(
    df$age_of_project / 365,
    breaks = c(0, 9, 12, 15, 17),
    labels = c(1, 2, 3, 4)
  ))
  df$new.age.factor <- as.factor(df$new.age)
  df
}

# For each dataset: `ageN` holds the hard-coded share of each age bin, and
# `dNper` maps every row's bin to that share as a percentage (NA bins stay NA).
data1 <- add_age_group(data1)
table(data1$new.age)
hist(data1$new.age)
age1 <- c(0.39369, 0.239271, 0.2096806, 0.1573584)
d1label <- rep("Overall", length(data1$new.age.factor))
d1per <- c(39.37, 23.93, 20.97, 15.74)[data1$new.age]
d1per.factor <- as.factor(d1per)
# The original assigned `data5 <- (d1label)` here, replacing the README roster
# with a character vector and breaking the data5$project_name lookup at the end
# of the script; that assignment has been removed.

data2 <- add_age_group(data2)
table(data2$new.age)
hist(data2$new.age)
age2 <- c(0.5675676, 0.1981982, 0.1681682, 0.06606607)
d2label <- rep("Expanded Contrib.", length(data2$new.age.factor))
d2per <- c(56.76, 19.82, 16.82, 6.61)[data2$new.age]
d2per.factor <- as.factor(d2per)

data3 <- add_age_group(data3)
table(data3$new.age)
hist(data3$new.age)
age3 <- c(0.2556818, 0.2954545, 0.2405303, 0.2083333)
d3label <- rep("Contrib. Files", length(data3$new.age.factor))
d3per <- c(25.57, 29.55, 24.05, 20.83)[data3$new.age]
d3per.factor <- as.factor(d3per)

data4 <- add_age_group(data4)
table(data4$new.age)
hist(data4$new.age)
age4 <- c(0.5, 0.125, 0.125, 0.25)
# NOTE(review): this label carries an embedded line break after "Contrib. " in
# the source as received; it is preserved here -- confirm it is intentional.
d4label <- rep("Contrib. \nRosters", length(data4$new.age.factor))
# The original percentages (57.14, 14.29, 14.29, 28.57) summed to 114.29 and
# disagreed with age4 above; they now match age4.
d4per <- c(50, 12.5, 12.5, 25)[data4$new.age]
d4per.factor <- as.factor(d4per)

# Stack the four datasets for the age-composition bar chart.
all_per <- c(d1per.factor, d2per.factor, d3per.factor, d4per.factor)
all_persss <- c(d1per, d2per, d3per, d4per)
all_labels <- c(d1label, d2label, d3label, d4label)
all_age_groups <- c(
  data1$new.age.factor, data2$new.age.factor,
  data3$new.age.factor, data4$new.age.factor
)
d5 <- data.frame(
  labels = all_labels,
  age_groups = all_age_groups,
  per = all_per,
  persss = all_persss
)
d5 <- na.omit(d5)

# One unit per project, filled to 100% per dataset, coloured by age group.
g <- ggplot(
  d5,
  aes(fill = forcats::fct_rev(age_groups), y = 1, x = forcats::fct_rev(labels))
) +
  geom_bar(position = "fill", stat = "identity") +
  scale_fill_discrete(
    name = "Project Age Group",
    labels = c("15-16y", "12-15y", "9-12y", "0-9y"),
    guide = guide_legend(reverse = TRUE)
  ) +
  xlab("Dataset") +
  ylab("Age Grouping Percentage") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 0), legend.position = "top")
g

# Models ----
# 1 when the project has no milestones, 2 when it has at least one.
# (The original wrote to `sdata1`, an undefined object, so this column was
# never created on data1.)
data1$new_milestones <- as.numeric(data1$milestone_count > 0) + 1
data1$new.formal.score <- data1$mmt / (data1$new_milestones / data1$new.age)

mmtmodel1 <- lm(underproduction_mean ~ mmt + as.factor(new.age), data = data1)
summary(mmtmodel1)

agemodel1 <- lm(mmt ~ age_of_project, data = data1)
summary(agemodel1)

fsmodel2 <- lm(underproduction_mean ~ new.formal.score, data = data1)
summary(fsmodel2)

g <- ggplot(data1, aes(x = mmt, y = underproduction_mean)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()
g

# Linear fit of underproduction_mean on MMT, drawn separately per age group.
g3 <- ggplot(data1, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(
    mapping = aes(x = mmt, y = underproduction_mean, color = new.age.factor),
    method = "lm",
    formula = y ~ x
  ) +
  xlab("MMT") +
  ylab("Underproduction Factor") +
  theme_bw()
g3

# Community size vs. project age ----
head(data6)
quantile(data1$contributors + data1$collaborators, probs = c(0.025, 0.975))
data6$total_community <- data6$contributors + data6$collaborators

# NOTE(review): the 1.0 / 457.4 cut-offs are hard-coded; presumably they were
# read off the quantile() output above -- confirm. This also replaces the
# data7 loaded from kk_final_octo.csv with a trimmed copy of data6.
data7 <- subset(data6, total_community > 1.0 & total_community < 457.4)

g4 <- ggplot(data7, aes(x = age_of_project / 365, y = total_community)) +
  geom_point() +
  geom_smooth(
    mapping = aes(x = age_of_project / 365, y = total_community),
    color = "red"
  ) +
  xlab("Age of the Project (years)") +
  ylab("Contributor Community Population") +
  theme_bw()
g4

median(data6$total_community)
cor.test(data6$total_community, data6$age_of_project)
cor.test(data1$mmt, data1$new.age)

age_data <- subset(data1, !is.na(new.age))
g2 <- ggplot(age_data, aes(x = factor(new.age), y = mmt)) +
  geom_boxplot()
g2

# Underproduction terciles of data3 ----
# Rows are sorted by underproduction_mean (highest first) and split into
# thirds. The original hard-coded 1:176 / 177:352 / 353:528, i.e. it assumed
# exactly 528 rows; the sizes are now derived from nrow(), which gives the same
# split for 528 rows and puts any remainder in the last group.
d3u <- data3[order(data3$underproduction_mean, decreasing = TRUE), ]
tercile_size <- nrow(d3u) %/% 3
d3hu <- d3u[seq_len(tercile_size), ]
d3mu <- d3u[tercile_size + seq_len(tercile_size), ]
d3lu <- d3u[
  seq(2 * tercile_size + 1, length.out = nrow(d3u) - 2 * tercile_size),
]

median(d3hu$underproduction_mean)
median(d3hu$age_of_project / 365)

max(d3mu$underproduction_mean)
median(d3mu$underproduction_mean)
median(d3mu$age_of_project / 365)

max(d3lu$underproduction_mean)
median(d3lu$underproduction_mean)
median(d3lu$age_of_project / 365)

d3lunames <- as.vector(d3lu$project_name)

# Presence indicators ----
# 1 when the project also appears in data3 / data5 (matched on project_name),
# 0 otherwise. Naming the column inside mutate() replaces the original
# mutate() + rename() on the back-quoted expression text.
data1 <- data1 %>%
  mutate(has_file = as.numeric(project_name %in% data3$project_name))
cor.test(data1$has_file, data1$underproduction_mean)

data1 <- data1 %>%
  mutate(has_readme = as.numeric(project_name %in% data5$project_name))
cor.test(data1$has_readme, data1$underproduction_mean)