24_deb_pkg_gov/R/newAnalysis.R
2024-01-28 16:26:41 -06:00

144 lines
4.6 KiB
R

# Setup ----
# The workspace is deliberately NOT cleared with rm(list = ls()): that call
# destroys the caller's objects when this script is source()d and still does
# not give a clean session (attached packages and options survive). Run the
# script in a fresh R session instead.
# Fix the RNG seed so any stochastic step below is reproducible.
set.seed(424242)
library(readr)
library(ggplot2)

# Load the four datasets analysed below. The paths are relative, so the
# script must be run from a directory whose parent holds the CSV files.
# Further down, the datasets are labelled "Overall" (data1),
# "Expanded Contrib." (data2), "Contrib. Files" (data3) and
# "Contrib. Rosters" (data4).
data1 <- read_csv("../kk_final_expanded_data_final.csv", show_col_types = FALSE)
data2 <- read_csv("../kk_final_octo_data_total.csv", show_col_types = FALSE)
data3 <- read_csv("../kk_final_doclist_roster.csv", show_col_types = FALSE)
data4 <- read_csv("../kk_final_rosterslist.csv", show_col_types = FALSE)
# Dataset overview ----
# Preview the first rows of each table.
head(data1)
head(data2)
head(data3)
head(data4)
# For each dataset (data2 is reported first, then data1, data3, data4):
# the number of rows whose `underproduction_low` is below zero, followed by
# the mean of `underproduction_mean`. The mean is taken without na.rm, so a
# single missing value makes it NA.
sum(data2$underproduction_low < 0, na.rm = TRUE)
mean(data2$underproduction_mean)
sum(data1$underproduction_low < 0, na.rm = TRUE)
mean(data1$underproduction_mean)
sum(data3$underproduction_low < 0, na.rm = TRUE)
mean(data3$underproduction_mean)
sum(data4$underproduction_low < 0, na.rm = TRUE)
mean(data4$underproduction_mean)
# MMT score ----
# Per row of data1:
#   mmt = (2 * collaborators + contributors) / (contributors + collaborators) - 1
# A row with zero contributors and zero collaborators yields NaN (0 / 0).
data1$mmt <- with(
  data1,
  ((collaborators * 2 + contributors) / (contributors + collaborators)) - 1
)
# Average MMT over all rows (no na.rm, so NA/NaN rows propagate).
mean(data1$mmt)
# Density-scaled histogram of the MMT distribution.
hist(data1$mmt, probability = TRUE)
# Age groups for data1 (labelled "Overall") ----
# age_of_project is divided by 365 (presumably days -> years; confirm against
# the data dictionary) and binned into right-closed intervals
# (0,9], (9,12], (12,15], (15,17], coded 1..4. Values outside (0, 17]
# become NA.
data1$new.age <- as.numeric(
  cut(data1$age_of_project / 365, breaks = c(0, 9, 12, 15, 17), labels = FALSE)
)
table(data1$new.age)
data1$new.age.factor <- factor(data1$new.age)
hist(data1$new.age)
# Hard-coded share of rows per age group (presumably copied from the table
# above; re-check if the input file changes).
age1 <- c(0.39369, 0.239271, 0.2096806, 0.1573584)
# One dataset label per row, used when the datasets are stacked for plotting.
d1label <- rep("Overall", nrow(data1))
# Swap each group code for that group's percentage (age1 * 100, two
# decimals); rows with an NA group stay NA.
d1per <- c(39.37, 23.93, 20.97, 15.74)[data1$new.age]
d1per.factor <- factor(d1per)
data5 <- d1label
# Age groups for data2 (labelled "Expanded Contrib.") ----
# age_of_project is divided by 365 (presumably days -> years; confirm) and
# binned into right-closed intervals (0,9], (9,12], (12,15], (15,17],
# coded 1..4. Values outside (0, 17] become NA.
data2$new.age <- as.numeric(
  cut(data2$age_of_project / 365, breaks = c(0, 9, 12, 15, 17), labels = FALSE)
)
table(data2$new.age)
data2$new.age.factor <- factor(data2$new.age)
hist(data2$new.age)
# Hard-coded share of rows per age group (presumably copied from the table
# above; re-check if the input file changes).
age2 <- c(0.5675676, 0.1981982, 0.1681682, 0.06606607)
# One dataset label per row, used when the datasets are stacked for plotting.
d2label <- rep("Expanded Contrib.", nrow(data2))
# Swap each group code for that group's percentage (age2 * 100, two
# decimals); rows with an NA group stay NA.
d2per <- c(56.76, 19.82, 16.82, 6.61)[data2$new.age]
d2per.factor <- factor(d2per)
# Age groups for data3 (labelled "Contrib. Files") ----
# age_of_project is divided by 365 (presumably days -> years; confirm) and
# binned into right-closed intervals (0,9], (9,12], (12,15], (15,17],
# coded 1..4. Values outside (0, 17] become NA.
data3$new.age <- as.numeric(
  cut(data3$age_of_project / 365, breaks = c(0, 9, 12, 15, 17), labels = FALSE)
)
table(data3$new.age)
data3$new.age.factor <- factor(data3$new.age)
hist(data3$new.age)
# Hard-coded share of rows per age group (presumably copied from the table
# above; re-check if the input file changes).
age3 <- c(0.2556818, 0.2954545, 0.2405303, 0.2083333)
# One dataset label per row, used when the datasets are stacked for plotting.
d3label <- rep("Contrib. Files", nrow(data3))
# Swap each group code for that group's percentage (age3 * 100, two
# decimals); rows with an NA group stay NA.
d3per <- c(25.57, 29.55, 24.05, 20.83)[data3$new.age]
d3per.factor <- factor(d3per)
# Age groups for data4 (labelled "Contrib. Rosters") ----
# age_of_project is divided by 365 (presumably days -> years; confirm) and
# binned into right-closed intervals (0,9], (9,12], (12,15], (15,17],
# coded 1..4. Values outside (0, 17] become NA.
data4$new.age <- as.numeric(cut(data4$age_of_project / 365,
                                breaks = c(0, 9, 12, 15, 17),
                                labels = c(1, 2, 3, 4)))
table(data4$new.age)
data4$new.age.factor <- as.factor(data4$new.age)
hist(data4$new.age)
# Hard-coded share of rows per age group (presumably copied from the table
# above; re-check if the input file changes).
age4 <- c(0.5, 0.125, 0.125, 0.25)
# One dataset label per row, used when the datasets are stacked for plotting.
d4label <- rep("Contrib. Rosters", length(data4$new.age.factor))
# Percentage of each row's age group, derived from age4 so the two can never
# disagree. The previously hard-coded values (57.14, 14.29, 14.29, 28.57)
# summed to 114.29 and contradicted age4; they look like counts divided by 7
# instead of 8. NOTE(review): confirm age4 against table(data4$new.age).
# Indexing by the group code keeps NA groups as NA.
d4per <- round(100 * age4, 2)[data4$new.age]
d4per.factor <- as.factor(d4per)
# Stacked age-group chart across datasets ----
# Concatenate the per-dataset vectors in the order data1..data4.
# NOTE(review): c() on factors only yields a factor on R >= 4.1; older
# versions return integer codes.
all_per <- c(d1per.factor, d2per.factor, d3per.factor, d4per.factor)
all_persss <- c(d1per, d2per, d3per, d4per)
all_labels <- c(d1label, d2label, d3label, d4label)
all_age_groups <- c(
  data1$new.age.factor,
  data2$new.age.factor,
  data3$new.age.factor,
  data4$new.age.factor
)
# One row per project; rows with any NA (e.g. no age group) are dropped.
# `per` and `persss` are carried along but are not mapped in the plot below.
d5 <- na.omit(
  data.frame(
    labels = all_labels,
    age_groups = all_age_groups,
    per = all_per,
    persss = all_persss
  )
)
# Every row contributes y = 1 and position = "fill" rescales each bar to 1,
# so bar segments show the share of each age group within a dataset.
# Both factors are reversed; the legend labels are therefore listed from the
# highest age group to the lowest, and the legend itself is reversed again.
# NOTE(review): the top bin is cut at 17 years but labelled "15-16y".
g <- ggplot(
  d5,
  aes(
    x = forcats::fct_rev(labels),
    y = 1,
    fill = forcats::fct_rev(age_groups)
  )
) +
  geom_col(position = "fill") +
  scale_fill_discrete(
    name = "Project Age Group",
    labels = c("15-16y", "12-15y", "9-12y", "0-9y"),
    guide = guide_legend(reverse = TRUE)
  ) +
  labs(x = "Dataset", y = "Age Grouping Percentage") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 0), legend.position = "top")
g
# Formality score and regression models ----
# Milestone indicator: 1 when milestone_count is 0, 2 when it is positive.
# (This used to be assigned into `sdata1`, an object that does not exist, so
# the script stopped here and data1$new_milestones was never created.)
data1$new_milestones <- as.numeric(data1$milestone_count > 0) + 1
# Formality score: mmt divided by (milestone indicator / age-group code).
data1$new.formal.score <- data1$mmt / (data1$new_milestones / data1$new.age)
# Underproduction explained by MMT, controlling for age group (categorical).
mmtmodel1 <- lm(underproduction_mean ~ mmt + as.factor(new.age), data = data1)
summary(mmtmodel1)
# MMT explained by raw project age.
agemodel1 <- lm(mmt ~ age_of_project, data = data1)
summary(agemodel1)
# Underproduction explained by the formality score alone.
fsmodel2 <- lm(underproduction_mean ~ new.formal.score, data = data1)
summary(fsmodel2)
# Underproduction vs. MMT ----
# Scatter of every data1 row with a single linear fit on top. This
# overwrites the `g` object created for the age-group chart.
g <- ggplot(data1, aes(x = mmt, y = underproduction_mean)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw()
g
# Underproduction vs. MMT, one linear fit per age group ----
# Only the fitted lines are drawn (no points); colour distinguishes the age
# groups so their slopes can be compared side by side.
g3 <- ggplot(data1, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(
    aes(color = new.age.factor),
    method = "lm",
    formula = y ~ x
  ) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw()
g3
# MMT vs. age group ----
# Correlation test between MMT and the numeric age-group code (1..4).
cor.test(data1$mmt, data1$new.age)
# Keep only rows that were assigned an age group, then show the MMT
# distribution within each group.
age_data <- data1[!is.na(data1$new.age), ]
g2 <- ggplot(age_data, aes(x = factor(new.age), y = mmt)) +
  geom_boxplot()
g2