updating some octo analysis
This commit is contained in:
parent
cb5f29012d
commit
80ff60b755
998
R/.Rhistory
998
R/.Rhistory
File diff suppressed because it is too large
Load Diff
105
R/GovRiskPower.R
Normal file
105
R/GovRiskPower.R
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
rm(list=ls())
|
||||||
|
set.seed(424242)
|
||||||
|
|
||||||
|
library(readr)
|
||||||
|
library(ggplot2)
|
||||||
|
library(tidyverse)
|
||||||
|
|
||||||
|
#primary analysis for cross-sectional community metrics
|
||||||
|
# Load the four input tables used by the analyses below. Paths are relative
# to the current working directory; readr's column-type messages are silenced.
overall_data <- read_csv(
  "../final_data/deb_full_data.csv",
  show_col_types = FALSE
)
octo_data <- read_csv(
  "../final_data/deb_octo_data.csv",
  show_col_types = FALSE
)
readme_data <- read_csv(
  "../final_data/deb_readme_roster.csv",
  show_col_types = FALSE
)
contributing_data <- read_csv(
  "../final_data/deb_contribfile_roster.csv",
  show_col_types = FALSE
)
|
||||||
|
|
||||||
|
# MMT: collaborators are counted twice and contributors once, divided by the
# combined head count (algebraically 1 + the collaborator share).
# NOTE(review): a project with zero contributors and zero collaborators would
# give NaN here -- confirm the input never contains such rows.
overall_data$mmt <- with(
  overall_data,
  ((collaborators * 2) + contributors) / (contributors + collaborators)
)
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)

#age_vector <- overall_data$age_of_project/365
#quantile(age_vector)

# Bin project age (days / 365) into four right-closed intervals coded 1-4.
# Ages outside (0, 17] become NA. The interior break points presumably come
# from the commented-out quantile() call above -- TODO confirm.
overall_data$new.age <- as.numeric(
  cut(
    overall_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = FALSE
  )
)
table(overall_data$new.age)
overall_data$new.age.factor <- factor(overall_data$new.age)
hist(overall_data$new.age)
|
||||||
|
|
||||||
|
# Linear model of mean underproduction on MMT, with the age bin entered as a
# categorical control.
mmtmodel1 <- lm(
  underproduction_mean ~ mmt + new.age.factor,
  data = overall_data
)
summary(mmtmodel1)

# One smoothed underproduction-vs-MMT curve per age bin (no confidence band);
# x and y are inherited from the base aesthetics, the legend sits inside the
# panel near the top-right corner.
g4 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(aes(color = new.age.factor), se = FALSE) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw() +
  theme(
    legend.position = c(0.9, 0.9),
    legend.justification = c("right", "top")
  )
g4
|
||||||
|
|
||||||
|
# LaTeX regression table for the overall-dataset model only.
#
# The previous call also listed issue_mmtmodel1 and wiki_mmtmodel1, which are
# not created until the octo section further down, and it ran before
# library(texreg) is attached. In a fresh session it therefore stopped with
# "object not found" / "could not find function". The single model name and
# the five coefficient labels already describe exactly one model, so only
# mmtmodel1 is tabulated and the call is namespace-qualified.
#
# The five labels assume all four age bins occur in overall_data (intercept,
# mmt, and three age dummies) -- check table(overall_data$new.age) if texreg
# reports a length mismatch.
texreg::texreg(
  list(mmtmodel1),
  stars = NULL,
  digits = 2,
  custom.model.names = c("MMT (Overall Dataset)"),
  custom.coef.names = c("(Intercept)", "MMT", "Age-2", "Age-3", "Age-4"),
  use.packages = FALSE,
  table = TRUE,
  ci.force = TRUE
)
|
||||||
|
|
||||||
|
# below this is the analysis for the octo data
|
||||||
|
# Age bins for the octo subset: same break points as the overall dataset,
# right-closed intervals coded 1-4, NA outside (0, 17].
octo_data$new.age <- as.numeric(
  cut(
    octo_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = FALSE
  )
)
table(octo_data$new.age)
octo_data$new.age.factor <- factor(octo_data$new.age)
hist(octo_data$new.age)

# MMT for the octo subset: collaborators weighted twice, contributors once,
# over the combined head count.
octo_data$mmt <- with(
  octo_data,
  ((collaborators * 2) + contributors) / (contributors + collaborators)
)
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)
|
||||||
|
|
||||||
|
# Keep only rows whose issue-contributor count does not exceed the total
# contributor count (rows where the comparison is NA are dropped by which()),
# then save the filtered table to the working directory.
consistent_rows <- which(
  octo_data$issue_contrib_count <= octo_data$total_contrib
)
octo_data <- octo_data[consistent_rows, ]
write.csv(octo_data, "new_octo.csv", row.names = FALSE)

#TODO: there's an issue with calculating this but somehow not an issue with the wiki one
# Issue-based MMT: issue contributors weighted twice, all other contributors
# once, over the total.
# NOTE(review): rows with total_contrib == 0 pass the filter above and would
# yield NaN here -- possibly what the TODO refers to; confirm.
octo_data$issue_mmt <- with(
  octo_data,
  ((issue_contrib_count * 2) + (total_contrib - issue_contrib_count)) /
    (total_contrib)
)
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)

# Histograms of the raw and square-root-transformed issue MMT.
g2 <- ggplot(octo_data, aes(x = issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g2
g1 <- ggplot(octo_data, aes(x = sqrt_issue_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g1
|
||||||
|
#right skewed data, need to transform
|
||||||
|
|
||||||
|
# Wiki-based MMT: wiki contributors weighted twice, all other contributors
# once, over the total contributor count.
octo_data$wiki_mmt <- with(
  octo_data,
  ((wiki_contrib_count * 2) + (total_contrib - wiki_contrib_count)) /
    (total_contrib)
)

# Distribution checks: base histogram, fine-grained ggplot histogram,
# median, and a normal Q-Q plot.
hist(octo_data$wiki_mmt)
g3 <- ggplot(octo_data, aes(x = wiki_mmt)) +
  geom_histogram(binwidth = 0.01) +
  theme_bw()
g3
median(octo_data$wiki_mmt)
qqnorm(octo_data$wiki_mmt)
|
||||||
|
#left skewed data, need to transform
|
||||||
|
|
||||||
|
|
||||||
|
#below are the models for the octo data, there should be analysis for each one
|
||||||
|
# Four linear models on the octo subset. Each regresses mean underproduction
# on one MMT variant and controls for the categorical age bin.

# Overall MMT.
octo_mmtmodel1 <- lm(
  underproduction_mean ~ mmt + new.age.factor,
  data = octo_data
)
summary(octo_mmtmodel1)

# Issue-contributor MMT.
issue_mmtmodel1 <- lm(
  underproduction_mean ~ issue_mmt + new.age.factor,
  data = octo_data
)
summary(issue_mmtmodel1)

# Square-root-transformed issue-contributor MMT.
sqrt_issue_mmtmodel1 <- lm(
  underproduction_mean ~ sqrt_issue_mmt + new.age.factor,
  data = octo_data
)
summary(sqrt_issue_mmtmodel1)

# Wiki-contributor MMT.
wiki_mmtmodel1 <- lm(
  underproduction_mean ~ wiki_mmt + new.age.factor,
  data = octo_data
)
summary(wiki_mmtmodel1)
|
||||||
|
|
||||||
|
library(texreg) #my little "lib"
|
||||||
|
|
||||||
|
# Side-by-side LaTeX table (tabular only, no table float) for the three octo
# models, with confidence intervals forced in place of standard errors.
# NOTE(review): custom.coef.names is positional -- verify that the label order
# matches the row order texreg produces for the merged models.
texreg(
  list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1),
  stars = NULL,
  digits = 2,
  custom.model.names = c(
    "M1: MMT",
    "M2: issue contrib.",
    "M3: wiki_contrib."
  ),
  custom.coef.names = c(
    "(Intercept)", "MMT", "Issues", "Age-2", "Age-3", "Age-4", "Wiki"
  ),
  use.packages = FALSE,
  table = FALSE,
  ci.force = TRUE
)
|
||||||
|
|
||||||
|
#below here is the analysis for the readme.md data
|
||||||
|
# Age bins for the README roster: project age in years cut into four
# right-closed intervals coded 1-4 (NA outside (0, 17]).
readme_data$new.age <- as.numeric(
  cut(
    readme_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = FALSE
  )
)
table(readme_data$new.age)
readme_data$new.age.factor <- factor(readme_data$new.age)
hist(readme_data$new.age)
|
||||||
|
|
||||||
|
|
||||||
|
#below here is the analysis for the contributing.md files
|
||||||
|
# Age bins for the CONTRIBUTING-file roster: project age in years cut into
# four right-closed intervals coded 1-4 (NA outside (0, 17]).
contributing_data$new.age <- as.numeric(
  cut(
    contributing_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = FALSE
  )
)
table(contributing_data$new.age)
contributing_data$new.age.factor <- factor(contributing_data$new.age)
hist(contributing_data$new.age)
|
@ -68,9 +68,20 @@ cor.test(data1$mmt, data1$up.fac.mean)
|
|||||||
cor.test(data1$milestones, data1$up.fac.mean)
|
cor.test(data1$milestones, data1$up.fac.mean)
|
||||||
cor.test(data1$age, data1$up.fac.mean)
|
cor.test(data1$age, data1$up.fac.mean)
|
||||||
|
|
||||||
data1$new.age.factor <- as.factor(data1$new.age)
|
data1$new.age.factor <- factor(data1$new.age, levels=c(1,2,3,4), labels=c("0-9y", "9-12y", "12-15y","15-16y"))
|
||||||
#geom_abline(intercept=coef(mmtmodel1)[1], slope=coef(mmtmodel1)[2], colour = "orange")+
|
#geom_abline(intercept=coef(mmtmodel1)[1], slope=coef(mmtmodel1)[2], colour = "orange")+
|
||||||
|
|
||||||
|
|
||||||
|
g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
|
||||||
|
geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
|
||||||
|
method='lm', formula= y~x, se=FALSE)+
|
||||||
|
labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
|
||||||
|
theme_bw() +
|
||||||
|
theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
|
||||||
|
g2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
|
g <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
|
||||||
geom_point() +
|
geom_point() +
|
||||||
#geom_smooth( method="lm", formula=(y~x), colour = "orange")+
|
#geom_smooth( method="lm", formula=(y~x), colour = "orange")+
|
||||||
|
BIN
R/cs497_final_plot.png
Normal file
BIN
R/cs497_final_plot.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 96 KiB |
@ -3,11 +3,15 @@ set.seed(424242)
|
|||||||
|
|
||||||
library(readr)
|
library(readr)
|
||||||
library(ggplot2)
|
library(ggplot2)
|
||||||
|
library(tidyverse)
|
||||||
|
|
||||||
data1 <- read_csv('../kk_final_expanded_data_final.csv',show_col_types = FALSE)
|
data1 <- read_csv('../kk_final_expanded_data_final.csv',show_col_types = FALSE)
|
||||||
data2 <- read_csv('../kk_final_octo_data_total.csv',show_col_types = FALSE)
|
data2 <- read_csv('../kk_final_octo_data_total.csv',show_col_types = FALSE)
|
||||||
data3 <- read_csv('../kk_final_doclist_roster.csv',show_col_types = FALSE)
|
data3 <- read_csv('../kk_final_doclist_roster.csv',show_col_types = FALSE)
|
||||||
data4 <-read_csv('../kk_final_rosterslist.csv',show_col_types = FALSE)
|
data4 <-read_csv('../kk_final_rosterslist.csv',show_col_types = FALSE)
|
||||||
|
data5 <- read_csv('../final_data/kk_final_readme_roster.csv', show_col_types=FALSE)
|
||||||
|
data6 <-read_csv('../kk_final_commentlist.csv', show_col_types=FALSE)
|
||||||
|
data7 <- read_csv('../final_data/kk_final_octo.csv', show_col_types = FALSE)
|
||||||
#getting data subset metadata
|
#getting data subset metadata
|
||||||
|
|
||||||
head(data1)
|
head(data1)
|
||||||
@ -19,10 +23,24 @@ length(which(data2$underproduction_low < 0))
|
|||||||
mean(data2$underproduction_mean)
|
mean(data2$underproduction_mean)
|
||||||
|
|
||||||
length(which(data1$underproduction_low < 0))
|
length(which(data1$underproduction_low < 0))
|
||||||
mean(data1$underproduction_mean)
|
median(data1$underproduction_mean)
|
||||||
|
|
||||||
|
length(which(data5$underproduction_low < 0))
|
||||||
|
|
||||||
|
median(data7$underproduction_mean)
|
||||||
|
|
||||||
length(which(data3$underproduction_low < 0))
|
length(which(data3$underproduction_low < 0))
|
||||||
mean(data3$underproduction_mean)
|
median(data3$underproduction_mean)
|
||||||
|
median(data3$age_of_project / 365)
|
||||||
|
|
||||||
|
length(which(data5$underproduction_low < 0))
|
||||||
|
median(data5$underproduction_mean)
|
||||||
|
median(data5$age_of_project / 365)
|
||||||
|
|
||||||
|
length(which(data6$underproduction_low < 0))
|
||||||
|
median(data6$underproduction_mean)
|
||||||
|
median(data6$age_of_project / 365)
|
||||||
|
median(data6$contributors)
|
||||||
|
|
||||||
length(which(data4$underproduction_low < 0))
|
length(which(data4$underproduction_low < 0))
|
||||||
mean(data4$underproduction_mean)
|
mean(data4$underproduction_mean)
|
||||||
@ -136,8 +154,51 @@ g3 <- ggplot(data1, aes(x=mmt, y=underproduction_mean)) +
|
|||||||
theme_bw()
|
theme_bw()
|
||||||
g3
|
g3
|
||||||
|
|
||||||
|
|
||||||
|
head(data6)

# 2.5% / 97.5% quantiles of community size in data1.
# NOTE(review): the 1.0 / 457.4 cut-offs applied to data6 below look like they
# were copied from this output -- confirm.
quantile(data1$contributors + data1$collaborators, probs = c(0.025, 0.975))

# Community size = contributors + collaborators.
data6$total_community <- data6$contributors + data6$collaborators

# Drop the extremes before plotting.
# NOTE(review): this reassigns data7, which is also read from a CSV near the
# top of the script -- confirm nothing later expects the CSV contents.
data7 <- subset(data6, total_community > 1.0 & total_community < 457.4)

# Community size against project age in years, with a red smoothed trend.
g4 <- ggplot(data7, aes(x = age_of_project / 365, y = total_community)) +
  geom_point() +
  geom_smooth(color = "red") +
  labs(
    x = "Age of the Project (years)",
    y = "Contributor Community Population"
  ) +
  theme_bw()
g4

median(data6$total_community)
cor.test(data6$total_community, data6$age_of_project)
|
||||||
|
|
||||||
cor.test(data1$mmt, data1$new.age)
|
cor.test(data1$mmt, data1$new.age)
|
||||||
age_data <- subset(data1, !is.na(new.age))
|
age_data <- subset(data1, !is.na(new.age))
|
||||||
g2 <- ggplot(age_data, aes(x=factor(new.age), y=mmt))+
|
g2 <- ggplot(age_data, aes(x=factor(new.age), y=mmt))+
|
||||||
geom_boxplot()
|
geom_boxplot()
|
||||||
g2
|
g2
|
||||||
|
|
||||||
|
|
||||||
|
# Split data3 into high / medium / low underproduction terciles.
#
# Rows are sorted by underproduction_mean, highest first. The cut points are
# derived from the row count instead of the fixed 176 / 352 / 528 indices, so
# the split stays correct if data3 changes size. With fixed indices, fewer
# than 528 rows gave all-NA rows and more than 528 rows were silently dropped.
# For 528 rows the result is identical to the previous hard-coded ranges.
d3u <- data3[order(data3$underproduction_mean, decreasing = TRUE), ]
n_projects <- nrow(d3u)
tercile_ends <- floor(n_projects * c(1, 2) / 3)

d3hu <- d3u[seq_len(tercile_ends[1]), ]
d3mu <- d3u[
  seq(from = tercile_ends[1] + 1, length.out = tercile_ends[2] - tercile_ends[1]),
]
d3lu <- d3u[
  seq(from = tercile_ends[2] + 1, length.out = n_projects - tercile_ends[2]),
]

# Summary statistics per tercile (age converted from days to years).
median(d3hu$underproduction_mean)
median(d3hu$age_of_project / 365)

max(d3mu$underproduction_mean)
median(d3mu$underproduction_mean)
median(d3mu$age_of_project / 365)

max(d3lu$underproduction_mean)
median(d3lu$underproduction_mean)
median(d3lu$age_of_project / 365)

# Names of the projects in the lowest tercile.
d3lunames <- as.vector(d3lu$project_name)
|
||||||
|
|
||||||
|
|
||||||
|
# Flag (1 / 0) whether each data1 project also appears in data3, then
# correlate that flag with mean underproduction.
#
# The column is named directly in mutate(). The previous version created an
# unnamed column and renamed it by its deparsed expression text, which broke
# on any reformatting of that expression and failed on a second run, because
# rename() cannot target a column name that already exists.
data1 <- data1 %>%
  mutate(has_file = as.numeric(project_name %in% data3$project_name))
cor.test(data1$has_file, data1$underproduction_mean)

# Same flag and correlation for membership in data5.
data1 <- data1 %>%
  mutate(has_readme = as.numeric(project_name %in% data5$project_name))
cor.test(data1$has_readme, data1$underproduction_mean)
|
||||||
|
|
||||||
|
BIN
R/saner24_presentation.png
Normal file
BIN
R/saner24_presentation.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 53 KiB |
3843
deb_octo_data.csv
Normal file
3843
deb_octo_data.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
final_data/.DS_Store
vendored
Normal file
BIN
final_data/.DS_Store
vendored
Normal file
Binary file not shown.
File diff suppressed because it is too large
Load Diff
3064
new_denom_032624_stripped.csv
Normal file
3064
new_denom_032624_stripped.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user