updates cross-sectional analyses
This commit is contained in:
parent
34fdc67359
commit
de600e63a3
@ -20,10 +20,11 @@ hist(overall_data$mmt, probability = TRUE)
|
|||||||
overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||||
table(overall_data$new.age)
|
table(overall_data$new.age)
|
||||||
overall_data$new.age.factor <- as.factor(overall_data$new.age)
|
overall_data$new.age.factor <- as.factor(overall_data$new.age)
|
||||||
hist(overall_data$new.age)
|
overall_data$scaled_age <- scale(overall_data$age_of_project)
|
||||||
|
|
||||||
mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=overall_data)
|
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data=overall_data)
|
||||||
summary(mmtmodel1)
|
summary(mmtmodel1)
|
||||||
|
qqnorm(residuals(mmtmodel1))
|
||||||
|
|
||||||
#shows the cross-age downward slopes for all underproduction averages in the face of MMT
|
#shows the cross-age downward slopes for all underproduction averages in the face of MMT
|
||||||
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||||
@ -33,17 +34,14 @@ g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
|||||||
theme_bw() +
|
theme_bw() +
|
||||||
theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top"))
|
theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top"))
|
||||||
g4
|
g4
|
||||||
|
#clean octo data
|
||||||
texreg(list(mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
octo_data <- filter(octo_data, total_contrib != 0)
|
||||||
custom.model.names=c( 'MMT (Overall Dataset)'),
|
|
||||||
custom.coef.names=c('(Intercept)', 'MMT', 'Age-2', 'Age-3', 'Age-4'),
|
|
||||||
use.packages=FALSE, table=TRUE, ci.force = TRUE)
|
|
||||||
|
|
||||||
# below this is the analysis for the octo data
|
# below this is the analysis for the octo data
|
||||||
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||||
table(octo_data$new.age)
|
table(octo_data$new.age)
|
||||||
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
||||||
hist(octo_data$new.age)
|
octo_data$scaled_age <- scale(octo_data$age_of_project)
|
||||||
|
|
||||||
length(which(octo_data$underproduction_low < 0))
|
length(which(octo_data$underproduction_low < 0))
|
||||||
median(octo_data$underproduction_mean)
|
median(octo_data$underproduction_mean)
|
||||||
|
|
||||||
@ -53,7 +51,6 @@ hist(octo_data$mmt)
|
|||||||
head(octo_data)
|
head(octo_data)
|
||||||
|
|
||||||
|
|
||||||
#TODO: there's an issue with calculating this but somehow not an issue with the wiki one
|
|
||||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||||
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
|
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
|
||||||
g2 <- ggplot(octo_data, aes(issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
|
g2 <- ggplot(octo_data, aes(issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
|
||||||
@ -61,7 +58,6 @@ g2
|
|||||||
g1 <- ggplot(octo_data, aes(sqrt_issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
|
g1 <- ggplot(octo_data, aes(sqrt_issue_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
|
||||||
g1
|
g1
|
||||||
#right skewed data, need to transform
|
#right skewed data, need to transform
|
||||||
|
|
||||||
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
||||||
hist(octo_data$wiki_mmt)
|
hist(octo_data$wiki_mmt)
|
||||||
g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
|
g3 <- ggplot(octo_data, aes(wiki_mmt)) + geom_histogram(binwidth = 0.01) + theme_bw()
|
||||||
@ -69,40 +65,37 @@ g3
|
|||||||
median(octo_data$wiki_mmt)
|
median(octo_data$wiki_mmt)
|
||||||
qqnorm(octo_data$wiki_mmt)
|
qqnorm(octo_data$wiki_mmt)
|
||||||
#left skewed data, need to transform
|
#left skewed data, need to transform
|
||||||
|
typeof(octo_data$wiki_contrib_count)
|
||||||
g4 <- ggplot(octo_data)
|
sum(octo_data$total_contrib == 0)
|
||||||
g4
|
|
||||||
|
|
||||||
#below are the models for the octo data, there should be analysis for each one
|
#below are the models for the octo data, there should be analysis for each one
|
||||||
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
|
|
||||||
|
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
||||||
summary(octo_mmtmodel1)
|
summary(octo_mmtmodel1)
|
||||||
|
|
||||||
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data=octo_data)
|
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
||||||
summary(issue_mmtmodel1)
|
summary(issue_mmtmodel1)
|
||||||
|
qqnorm(residuals(issue_mmtmodel1))
|
||||||
|
|
||||||
sqrt_issue_mmtmodel1 <- lm(underproduction_mean ~ sqrt_issue_mmt + new.age.factor, data=octo_data)
|
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib, data=octo_data)
|
||||||
summary(sqrt_issue_mmtmodel1)
|
|
||||||
|
|
||||||
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data)
|
|
||||||
summary(wiki_mmtmodel1)
|
summary(wiki_mmtmodel1)
|
||||||
|
qqnorm(residuals(wiki_mmtmodel1))
|
||||||
|
|
||||||
library(texreg) #my little "lib"
|
library(texreg) #my little "lib"
|
||||||
|
|
||||||
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||||
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
|
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki contrib.' ),
|
||||||
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
|
custom.coef.names=c('(Intercept)', 'MMT', 'scaled_age', 'has readme', 'has contrib', 'Issue MMT', 'Wiki MMT'),
|
||||||
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||||
|
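#find the overlap between projects with octo data and projects with README or CONTRIBUTING files; NOTE(review): the lm() calls above already use has_readme/has_contrib, so confirm this block runs before them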
|
||||||
|
readme_did_roster <- read_csv("../final_data/deb_readme_did.csv", show_col_types = FALSE)
|
||||||
|
contrib_did_roster <- read_csv("../final_data/deb_contrib_did.csv", show_col_types = FALSE)
|
||||||
|
octo_data <- octo_data |>
|
||||||
|
mutate(has_readme = as.numeric(upstream_vcs_link %in% readme_did_roster$upstream_vcs_link)) |>
|
||||||
|
mutate(has_contrib = as.numeric(upstream_vcs_link %in% contrib_did_roster$upstream_vcs_link))
|
||||||
#below here is the analysis for the readme.md data
|
#below here is the analysis for the readme.md data
|
||||||
readme_data$new.age <- as.numeric(cut(readme_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
cor.test(octo_data$mmt, octo_data$has_readme)
|
||||||
table(readme_data$new.age)
|
cor.test(octo_data$mmt, octo_data$has_contrib)
|
||||||
readme_data$new.age.factor <- as.factor(readme_data$new.age)
|
cor.test(octo_data$has_readme, octo_data$has_contrib)
|
||||||
hist(readme_data$new.age)
|
issues_expansion <- lm(issue_mmt ~ has_contrib + scaled_age, data=octo_data)
|
||||||
|
summary(issues_expansion)
|
||||||
|
|
||||||
#below here is the analysis for the contributing.md files
|
|
||||||
contributing_data$new.age <- as.numeric(cut(contributing_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
|
||||||
table(contributing_data$new.age)
|
|
||||||
contributing_data$new.age.factor <- as.factor(contributing_data$new.age)
|
|
||||||
hist(contributing_data$new.age)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user