updates to DiD data cleaning
This commit is contained in:
parent
d3547b1f91
commit
9bf6755f84
872
R/.Rhistory
872
R/.Rhistory
@ -1,439 +1,3 @@
|
||||
method='lm', formula= y~x, se=FALSE)+
|
||||
labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
|
||||
g2
|
||||
data1$new.age.factor <- factor(data1$new.age, levels=c(1,2,3,4), labels=c("0-9y", "9-12y", "12-15y","15-16y"))
|
||||
g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor),
|
||||
method='lm', formula= y~x, se=FALSE)+
|
||||
labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
|
||||
g2
|
||||
library(readr)
|
||||
library(ggplot2)
|
||||
library(tidyverse)
|
||||
data7 <- read_csv('../final_data/kk_final_octo.csv', show_col_types = FALSE)
|
||||
median(data7$underproduction_mean)
|
||||
length(which(data7$underproduction_low < 0))
|
||||
364 / 3843
|
||||
data5 <- read_csv('../kk_final_readme_roster.csv', show_col_types=FALSE)
|
||||
data5 <- read_csv('..final_data/kk_final_readme_roster.csv', show_col_types=FALSE)
|
||||
data5 <- read_csv('../final_data/kk_final_readme_roster.csv', show_col_types=FALSE)
|
||||
length(which(data5$underproduction_low < 0))
|
||||
227/2695
|
||||
#primary analysis for cross-sectional community metrics
|
||||
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
|
||||
rm(list=ls())
|
||||
set.seed(424242)
|
||||
library(readr)
|
||||
library(ggplot2)
|
||||
library(tidyverse)
|
||||
#primary analysis for cross-sectional community metrics
|
||||
overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE)
|
||||
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
|
||||
# MMT score per project: collaborators are counted twice and contributors
# once, divided by the combined head count. A project with zero collaborators
# and zero contributors yields NaN (0 / 0).
overall_data$mmt <- (overall_data$collaborators * 2 + overall_data$contributors) /
  (overall_data$contributors + overall_data$collaborators)
# Summary and distribution of the score.
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)
|
||||
# Bin project age (days converted to years) into four ordered groups.
# NOTE(review): the interior cut points appear to be the quartiles reported by
# quantile(age_vector) for this data set, rounded to six decimals - confirm.
# cut() uses right-closed intervals here, so an age of exactly 0 or above 17
# years becomes NA.
age_vector <- overall_data$age_of_project / 365
quantile(age_vector)
overall_data$new.age <- as.numeric(cut(
  age_vector,
  breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
  labels = c(1, 2, 3, 4)
))
# Group sizes, used to check that the bins are roughly balanced.
table(overall_data$new.age)
# Factor copy of the bin index for use as a categorical predictor / colour.
overall_data$new.age.factor <- as.factor(overall_data$new.age)
# hist() needs the numeric bin index; it fails on the factor column.
hist(overall_data$new.age)
|
||||
1159/5105
|
||||
1391/5105
|
||||
1277/5105
|
||||
1276/510
|
||||
1276/5105
|
||||
#shows the cross-age downward slopes for all underproduction averages in the face of MMT
|
||||
# One linear fit of underproduction on MMT per age group, drawn with the
# default confidence ribbon. x and y are inherited from the plot-level mapping.
g3 <- ggplot(overall_data, aes(x = mmt, y = underproduction_mean)) +
  geom_smooth(
    aes(color = new.age.factor),
    method = "lm",
    formula = y ~ x
  ) +
  labs(x = "MMT", y = "Underproduction Factor") +
  theme_bw()
g3
|
||||
#shows the cross-age downward slopes for all underproduction averages in the face of MMT
|
||||
g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_point(mapping = aes(color=new.age.factor)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor),
|
||||
method='lm', formula= y~x) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw()
|
||||
g3
|
||||
#shows the cross-age downward slopes for all underproduction averages in the face of MMT
|
||||
g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor),
|
||||
method='lm', formula= y~x) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw()
|
||||
g3
|
||||
# Linear model of mean underproduction on MMT with age group as a categorical
# control. The response column in overall_data is underproduction_mean (the
# same column the plots of this data use), not up.fac.mean.
mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data = overall_data)
summary(mmtmodel1)
|
||||
#shows the cross-age downward slopes for all underproduction averages in the face of MMT
|
||||
g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor),
|
||||
method='lm', formula= y~x) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
|
||||
g3
|
||||
#shows the cross-age downward slopes for all underproduction averages in the face of MMT
|
||||
g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), formula= y~x) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
|
||||
g3
|
||||
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom"))
|
||||
g4
|
||||
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.0, 0.0), legend.justification = c("left", "bottom"))
|
||||
g4
|
||||
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.0, 0.0), legend.justification = c("right", "top"))
|
||||
g4
|
||||
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(1.0, 1.0), legend.justification = c("right", "top"))
|
||||
g4
|
||||
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.9, 1.0), legend.justification = c("right", "top"))
|
||||
g4
|
||||
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top"))
|
||||
g4
|
||||
g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) +
|
||||
geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), se=FALSE) +
|
||||
xlab("MMT") +
|
||||
ylab("Underproduction Factor") +
|
||||
theme_bw() +
|
||||
theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top"))
|
||||
g4
|
||||
min(overall_data$underproduction_mean)
|
||||
max(overall_data$underproduction_mean)
|
||||
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
|
||||
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
table(octo_data$new.age)
|
||||
999 / 3842
|
||||
1139/3842
|
||||
955/3842
|
||||
747/3842
|
||||
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
||||
hist(overall_data$new.age)
|
||||
hist(octo_data$new.age)
|
||||
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||
mean(overall_data$mmt)
|
||||
mean(octo_data$mmt)
|
||||
hist(octo_data$mmt, probability = TRUE)
|
||||
head(octo_data)
|
||||
octo_data$issue_mmt <- (((octo_data$issue_contrib_count * 2)+ octo_data$api_contrib_count) / (octo_data$api_contrib_count))
|
||||
hist(octo_data$issue_mmt, probability = TRUE)
|
||||
octo_data$issue_mmt <- (((octo_data$issue_contrib_count * 2)+ octo_data$api_contrib_count + octo_data$wiki_contrib_count + octo_data$file_contrib_count) / (octo_data$api_contrib_count + + octo_data$wiki_contrib_count + octo_data$issue_contrib_count + octo_data$file_contrib_count))
|
||||
hist(octo_data$issue_mmt, probability = TRUE)
|
||||
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
|
||||
summary(octo_mmtmodel1)
|
||||
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data=octo_data)
|
||||
summary(issue_mmtmodel1)
|
||||
# wiki_mmt mirrors issue_mmt: wiki contributors are weighted 2 and the other
# three contributor counts are weighted 1, divided by the sum of all four
# counts. (The numerator previously added wiki_contrib_count a second time in
# place of issue_contrib_count.)
octo_data$wiki_mmt <- (octo_data$wiki_contrib_count * 2 +
  octo_data$api_contrib_count +
  octo_data$issue_contrib_count +
  octo_data$file_contrib_count) /
  (octo_data$api_contrib_count +
    octo_data$wiki_contrib_count +
    octo_data$issue_contrib_count +
    octo_data$file_contrib_count)
hist(octo_data$wiki_mmt, probability = TRUE)
# The wiki model must use wiki_mmt as its predictor, not issue_mmt.
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data = octo_data)
summary(wiki_mmtmodel1)
|
||||
# Regression table for the three fitted models, with confidence intervals
# forced (ci.force = TRUE) and significance stars disabled (stars = NULL).
# NOTE(review): three models are passed but custom.model.names has one entry
# and custom.coef.names has five, while the models jointly contain the terms
# mmt, issue_mmt and wiki_mmt - presumably texreg needs one name per model and
# one label per distinct coefficient; confirm and extend both vectors.
texreg(list(mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||
custom.model.names=c( 'MMT (Overall Dataset)'),
|
||||
custom.coef.names=c('(Intercept)', 'MMT', 'Age-2', 'Age-3', 'Age-4'),
|
||||
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||
source('powerAnalysis.R') #my little "lib"
|
||||
texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||
custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ),
|
||||
custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'),
|
||||
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||
texreg(list(mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2,
|
||||
custom.model.names=c( 'MMT (Overall Dataset)'),
|
||||
custom.coef.names=c('(Intercept)', 'MMT', 'Age-2', 'Age-3', 'Age-4'),
|
||||
use.packages=FALSE, table=TRUE, ci.force = TRUE)
|
||||
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
|
||||
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
|
||||
#below here is the analysis for the readme data
|
||||
readme_data$new.age <- as.numeric(cut(readme_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
table(readme_data$new.age)
|
||||
readme_data$new.age.factor <- as.factor(readme_data$new.age)
|
||||
hist(readme_data$new.age)
|
||||
637 / 2694
|
||||
676 / 2694
|
||||
725 / 2694
|
||||
656 / 2694
|
||||
contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE)
|
||||
#below here is the analysis for the contributing.md files
|
||||
readme_data$new.age <- as.numeric(cut(readme_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
table(readme_data$new.age)
|
||||
readme_data$new.age.factor <- as.factor(readme_data$new.age)
|
||||
#below here is the analysis for the contributing.md files
|
||||
contributing_data$new.age <- as.numeric(cut(contributing_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
table(contributing_data$new.age)
|
||||
contributing_data$new.age.factor <- as.factor(contributing_data$new.age)
|
||||
hist(contributing_data$new.age)
|
||||
76/528
|
||||
119 / 528
|
||||
171/ 528
|
||||
162 / 528
|
||||
octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE)
|
||||
rm(list=ls())
|
||||
set.seed(424242)
|
||||
library(readr)
|
||||
library(ggplot2)
|
||||
library(tidyverse)
|
||||
readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE)
|
||||
octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE)
|
||||
# below this is the analysis for the octo data
|
||||
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE)
|
||||
# below this is the analysis for the octo data
|
||||
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
table(octo_data$new.age)
|
||||
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
||||
hist(octo_data$new.age)
|
||||
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||
mean(octo_data$mmt)
|
||||
hist(octo_data$mmt, probability = TRUE)
|
||||
head(octo_data)
|
||||
#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
|
||||
#i.e. needs to be a total contrib number that is not attached to the high level counts
|
||||
octo_data$issue_mmt <- (((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib))
|
||||
hist(octo_data$issue_mmt, probability = TRUE)
|
||||
#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
|
||||
#i.e. needs to be a total contrib number that is not attached to the high level counts
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt, probability = TRUE)
|
||||
max(octo_data$issue_mmt)
|
||||
max(octo_data$issue_mmt)
|
||||
median(octo_data$issue_mmt)
|
||||
median(octo_data$issue_mmt)
|
||||
min(octo_data$issue_mmt)
|
||||
hist(octo_data$total_contrib)
|
||||
mean(octo_data$total_contrib)
|
||||
median(octo_data$total_contrib)
|
||||
median(octo_data$contributors)
|
||||
median(octo_data$collaborators)
|
||||
median(octo_data$total_contrib)
|
||||
#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
|
||||
#i.e. needs to be a total contrib number that is not attached to the high level counts
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt, probability = TRUE)
|
||||
hist(octo_data$issue_mmt)
|
||||
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$wiki_mmt)
|
||||
min(octo_data$wiki_mmt)
|
||||
median(octo_data$wiki_mmt)
|
||||
#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
|
||||
#i.e. needs to be a total contrib number that is not attached to the high level counts
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt)
|
||||
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||
mean(octo_data$mmt)
|
||||
hist(octo_data$mmt)
|
||||
median(octo_data$total_contrib)
|
||||
#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
|
||||
#i.e. needs to be a total contrib number that is not attached to the high level counts
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt)
|
||||
max(octo_data$issue_mmt)
|
||||
# Largest issue_mmt value; base R's function is max(), and na.rm = TRUE keeps
# missing values from turning the result into NA.
max(octo_data$issue_mmt, na.rm = TRUE)
|
||||
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$wiki_mmt)
|
||||
median(octo_data$wiki_mmt)
|
||||
#below are the models for the octo data, there should be analysis for each one
|
||||
octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data)
|
||||
summary(octo_mmtmodel1)
|
||||
#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
|
||||
#i.e. needs to be a total contrib number that is not attached to the high level counts
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt)
|
||||
# Largest issue_mmt value; base R's function is max(), and na.rm = TRUE keeps
# missing values from turning the result into NA.
max(octo_data$issue_mmt, na.rm = TRUE)
|
||||
typeof(octo_data$issue_mmt)
|
||||
length(octo_data$issue_mmt)
|
||||
#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
|
||||
#i.e. needs to be a total contrib number that is not attached to the high level counts
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt)
|
||||
length(octo_data$issue_mmt)
|
||||
# Count the rows whose issue_mmt exceeds 2. length() of a comparison only
# returns the length of the vector, so the count comes from summing the logical
# vector; na.rm = TRUE ignores rows where issue_mmt is missing.
sum(octo_data$issue_mmt > 2, na.rm = TRUE)
|
||||
median(octo_data$wiki_mmt)
|
||||
typeof(octo_data$issue_mmt)
|
||||
median(octo_data$issue_mmt, na.rm = TRUE)
|
||||
median(octo_data$issue_contrib_count)
# Drop the rows with a missing issue_contrib_count. Filtering by row keeps
# octo_data a data frame; na.omit() on the single column would replace the
# whole table with a bare vector.
octo_data <- octo_data[!is.na(octo_data$issue_contrib_count), ]
median(octo_data$issue_contrib_count)
|
||||
octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE)
|
||||
# below this is the analysis for the octo data
|
||||
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
table(octo_data$new.age)
|
||||
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
||||
hist(octo_data$new.age)
|
||||
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||
mean(octo_data$mmt)
|
||||
hist(octo_data$mmt)
|
||||
head(octo_data)
|
||||
median(octo_data$issue_contrib_count)
|
||||
octo_data <- na.omit(octo_data)
|
||||
median(octo_data$issue_contrib_count)
|
||||
#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts
|
||||
#i.e. needs to be a total contrib number that is not attached to the high level counts
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt)
|
||||
median(octo_data$issue_mmt, na.rm = TRUE)
|
||||
length(octo_data$issue_mmt > 2.0)
|
||||
length(octo_data$issue_mmt > 2.0)
|
||||
length(octo_data$issue_mmt > 2)
|
||||
# Median issue_mmt; na.rm = TRUE guards against missing values in the column.
median(octo_data$issue_mmt, na.rm = TRUE)
|
||||
length(octo_data$issue_mmt > 2)
|
||||
length(octo_data$issue_mmt > 2)
|
||||
length(octo_data$issue_mmt > 2.0)
|
||||
max(octo_data$issue_mmt, na.rm = TRUE)
|
||||
octo_data$new_mmt <- (((octo_data$collaborators * 2)+ (octo_data$total_contrib - octo_data$collaborators)) / (octo_data$total_contrib))
|
||||
hist(octo_data$new_mmt)
|
||||
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||
mean(octo_data$mmt)
|
||||
hist(octo_data$mmt)
|
||||
#TODO: there's an issue with calculating this but somehow not an issue with the wiki one
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt)
|
||||
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$wiki_mmt)
|
||||
hist(octo_data$issue_mmt)
|
||||
length(octo_data$issue_mmt > 2.0)
|
||||
# Rows where the issue contributor count exceeds the total contributor count.
# With issue_mmt = (2 * issue + (total - issue)) / total these are exactly the
# rows that push issue_mmt above 2. The trailing comma makes this a row
# selection; without it the index would be applied to columns.
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib), ]
# The offending issue counts on their own.
octo_data$issue_contrib_count[which(octo_data$issue_contrib_count > octo_data$total_contrib)]
|
||||
octo_data <- read_csv('../new_denom_032624_stripped.csv', show_col_types = FALSE)
|
||||
# below this is the analysis for the octo data
|
||||
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
table(octo_data$new.age)
|
||||
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
||||
hist(octo_data$new.age)
|
||||
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||
mean(octo_data$mmt)
|
||||
hist(octo_data$mmt)
|
||||
head(octo_data)
|
||||
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]$issue_contrib_count
|
||||
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]
|
||||
octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]
|
||||
octo_data <- read_csv('../new_denom_032624_stripped.csv', show_col_types = FALSE)
|
||||
# below this is the analysis for the octo data
|
||||
octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
|
||||
table(octo_data$new.age)
|
||||
octo_data$new.age.factor <- as.factor(octo_data$new.age)
|
||||
hist(octo_data$new.age)
|
||||
octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators))
|
||||
mean(octo_data$mmt)
|
||||
hist(octo_data$mmt)
|
||||
head(octo_data)
|
||||
# Keep only the rows whose issue contributor count does not exceed the total
# contributor count; which() also discards rows where the comparison is NA.
octo_data <- octo_data[
  with(octo_data, which(issue_contrib_count <= total_contrib)),
]
|
||||
#TODO: there's an issue with calculating this but somehow not an issue with the wiki one
|
||||
octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$issue_mmt)
|
||||
# Sanity check after filtering: the maximum should no longer exceed 2, and the
# number of rows above 2 should be zero. sum() counts the TRUE values, whereas
# length() would only report the number of rows.
max(octo_data$issue_mmt, na.rm = TRUE)
sum(octo_data$issue_mmt > 2.0, na.rm = TRUE)
|
||||
issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data=octo_data)
|
||||
summary(issue_mmtmodel1)
|
||||
wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data)
|
||||
summary(wiki_mmtmodel1)
|
||||
write.csv(octo_data, "new_octo.csv", row.names = FALSE)
|
||||
octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE)
|
||||
qqnorm(octo_data$issue_mmt)
|
||||
octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib)
|
||||
hist(octo_data$wiki_mmt)
|
||||
median(octo_data$wiki_mmt)
|
||||
qqnorm(octo_data$wiki_mmt)
|
||||
qqnorm(octo_data$issue_mmt)
|
||||
qqnorm(octo_data$wiki_mmt)
|
||||
qqnorm(log(octo_data$issue_mmt))
|
||||
qqnorm(octo_data$issue_mmt)
|
||||
qqnorm(log(octo_data$issue_mmt))
|
||||
qqnorm(octo_data$issue_mmt)
|
||||
qqnorm(log(octo_data$issue_mmt))
|
||||
# Normal Q-Q plot of model residuals. residuals() expects a fitted model
# object, not a data column.
# NOTE(review): issue_mmtmodel1 is presumably the intended model - confirm.
qqnorm(residuals(issue_mmtmodel1))
|
||||
@ -510,3 +74,439 @@ texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits
|
||||
custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ),
|
||||
custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'),
|
||||
use.packages=FALSE, table=FALSE, ci.force = TRUE)
|
||||
glimpse(readme_df)
|
||||
library(tidyverse)
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
contributing_df <- read_csv("../final_data/deb_contrib_did.csv")
|
||||
glimpse(readme_df)
|
||||
head(readme_df)
|
||||
colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
||||
glimpse(readme_df)
|
||||
col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "after_all_cnt", "before_mrg_cnt", "after_mrg_cnt", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
||||
readme_df <- readme_df[,col_order]
|
||||
glimpse(readme_df)
|
||||
# Parse a bracketed, comma-separated count string into a numeric vector:
# strip the square brackets, split on ", ", then convert the pieces to numbers.
# The sample string must be defined before it is parsed.
test <- "[0, 0, 0, 0]"
test_counts <- as.numeric(strsplit(gsub("[][]", "", test), ", ", fixed = TRUE)[[1]])
test_counts
|
||||
# Add a list column holding the split pieces of before_all_cnt. The column is
# referenced unquoted (a quoted name would be treated as a literal string) and
# the result is assigned back so that the new column persists.
readme_df <- readme_df %>%
  mutate(cnt_before_all = str_split(gsub("[][]", "", before_all_cnt), ", "))
|
||||
head(readme_df$before_all_cnt)
|
||||
head(readme_df$cnt_before_all)
|
||||
readme_df %>% mutate(cnt_before_all = str_split(gsub("[][]","", "before_all_count"), ", "))
|
||||
head(readme_df$cnt_before_all)
|
||||
View(readme_df)
|
||||
View(readme_df)
|
||||
readme_df$cnt_before_all
|
||||
readme_df %>% mutate(cnt_before_all = str_split(gsub("[][]","", "before_all_count"), ", "))
|
||||
str_split(gsub("[][]","", readme_df$before_all_count), ", ")
|
||||
#str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
readme_df %>% mutate(cnt_before_all = str_split(gsub("[][]","", "before_all_cnt"), ", "))
|
||||
readme_df$cnt_before_all
|
||||
#str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
readme_df %>% mutate("cnt_before_all" = str_split(gsub("[][]","", "before_all_cnt"), ", "))
|
||||
readme_df$cnt_before_all
|
||||
str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
readme_df$cnt_before_all
|
||||
readme_df$cnt_after_all <- str_split(gsub("[][]","", readme_df$after_all_cnt), ", ")
|
||||
readme_df$cnt_after_all
|
||||
readme_df$cnt_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_cnt), ", ")
|
||||
readme_df$cnt_before_mrg
|
||||
readme_df$cnt_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_cnt), ", ")
|
||||
readme_df$cnt_after_mrg
|
||||
#TODO: figure out if one needs to expand the data into a different dataframe, and if so how
|
||||
# Remove the raw string count columns now that parsed cnt_* columns exist.
# Unary minus cannot negate a character vector, so the columns are excluded
# by name instead.
drop <- c("before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt")
readme_df <- readme_df[, !(names(readme_df) %in% drop)]
|
||||
View(readme_df)
|
||||
library(tidyverse)
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
||||
col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "after_all_cnt", "before_mrg_cnt", "after_mrg_cnt", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
||||
readme_df <- readme_df[,col_order]
|
||||
glimpse(readme_df)
|
||||
readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
readme_df$cnt_before_all <- as.numeric(readme_df$cnt_before_all)
|
||||
View(readme_df)
|
||||
readme_df$cnt_before_all
|
||||
type(readme_df$cnt_before_all)
|
||||
typeof(readme_df$cnt_before_all)
|
||||
typeof(readme_df$cnt_before_all[0])
|
||||
readme_df$cnt_before_all <- unlist(str_split(gsub("[][]","", readme_df$before_all_cnt), ", "))
|
||||
readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
typeof(readme_df$cnt_before_all)
|
||||
typeof(readme_df$cnt_before_all[[0]])
|
||||
typeof(readme_df$cnt_before_all[0])
|
||||
sapply(readme_df, class)
|
||||
readme_df[,lapply(readme_df, unlist)]
|
||||
readme_df[,lapply(readme_df$cnt, unlist)]
|
||||
readme_df[,lapply(readme_df$cnt_before_all, unlist)]
|
||||
typeof(readme_df$cnt_before_all[0])
|
||||
View(readme_df)
|
||||
View(readme_df)
|
||||
readme_df$cnt_before_all <- as.numeric(str_split(gsub("[][]","", readme_df$before_all_cnt), ", "))
|
||||
readme_df$cnt_before_all <- as.numeric(str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")[[1]])
|
||||
readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
typeof(readme_df$cnt_before_all[0])
|
||||
typeof(readme_df$cnt_before_all[0][0])
|
||||
readme_df$cnt_before_all[0]
|
||||
unlist(readme_df$cnt_before_all[0])
|
||||
readme_df$cnt_before_all[0]
|
||||
readme_df$cnt_before_all
|
||||
test <- readme_df$cnt_before_all
|
||||
test
|
||||
as.numeric(test)
|
||||
test[0]
|
||||
test[1]
|
||||
as.numeric(test[1])
|
||||
unlist(test[1])
|
||||
as.numeric(unlist(test[1]))
|
||||
test2 <- as.numeric(unlist(test))
|
||||
test2
|
||||
print(entry)
|
||||
for (entry in test) {
|
||||
print(entry)
|
||||
}
|
||||
print(as.numeric(unlist(entry)))
|
||||
for (entry in test) {
|
||||
print(as.numeric(unlist(entry)))
|
||||
}
|
||||
test_two <- append(test_two, as.numeric(unlist(entry)))
|
||||
print(as.numeric(unlist(entry)))
|
||||
for (entry in test) {
|
||||
test_two <- append(test_two, as.numeric(unlist(entry)))
|
||||
print(as.numeric(unlist(entry)))
|
||||
}
|
||||
test_two <- c()
|
||||
for (entry in test) {
|
||||
test_two <- append(test_two, as.numeric(unlist(entry)))
|
||||
print(as.numeric(unlist(entry)))
|
||||
}
|
||||
readme_df$cnt_before_all <- as.numeric(readme_df$cnt_before_all)
|
||||
test_two
|
||||
readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
iterator <- 0
|
||||
for (entry in test) {
|
||||
readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
print(as.numeric(unlist(entry)))
|
||||
iterator <- iterator + 1
|
||||
}
|
||||
View(readme_df)
|
||||
library(tidyverse)
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
||||
col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "after_all_cnt", "before_mrg_cnt", "after_mrg_cnt", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
||||
readme_df <- readme_df[,col_order]
|
||||
glimpse(readme_df)
|
||||
head(readme_df)
|
||||
#this has to happen on the analysis side of things for a given row, it cannot happen on the storage side
|
||||
#this is a conversation of whether or not the data should be saved in terms of
|
||||
readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
# test <- readme_df$cnt_before_all
|
||||
# as.numeric(unlist(test[1]))
|
||||
# test_two <- c()
|
||||
# iterator <- 0
|
||||
# for (entry in test) {
|
||||
# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
# print(as.numeric(unlist(entry)))
|
||||
# iterator <- iterator + 1
|
||||
# }
|
||||
# test_two
|
||||
readme_df$cnt_after_all <- str_split(gsub("[][]","", readme_df$after_all_cnt), ", ")
|
||||
readme_df$cnt_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_cnt), ", ")
|
||||
readme_df$cnt_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_cnt), ", ")
|
||||
drop <- c("before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt")
|
||||
readme_df = readme_df[,!(names(readme_df) %in% drop)]
|
||||
#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step
|
||||
new_test <- head(readme_df, 1)
|
||||
View(new_test)
|
||||
write.csv(readme_df, "r_readme_did.csv", row.names=FALSE)
|
||||
# as.numeric(unlist(test[1]))
|
||||
# test_two <- c()
|
||||
# iterator <- 0
|
||||
# for (entry in test) {
|
||||
# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
# print(as.numeric(unlist(entry)))
|
||||
# iterator <- iterator + 1
|
||||
# }
|
||||
# test_two
|
||||
#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step
|
||||
new_test <- head(readme_df, 1)
|
||||
View(new_test)
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count"))
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count")
|
||||
longer
|
||||
View(longer)
|
||||
longer |> unnest(count)
|
||||
new_longer <- longer |> unnest(count)
|
||||
View(new_longer)
|
||||
longer
|
||||
View(new_longer)
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(as.numeric(unlist(count)))
|
||||
longer
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count))
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count)
|
||||
longer
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count) |>
|
||||
as.numeric(unlist(count))
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count) |>
|
||||
unlist(count)
|
||||
View(new_longer)
|
||||
new_longer
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count) |>
|
||||
unlist(count) |>
|
||||
as.numeric(count)
|
||||
View(new_longer)
|
||||
new_longer
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count) |>
|
||||
unlist(count) |>
|
||||
as.numeric(count)
|
||||
longer
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count) |>
|
||||
unlist(count)
|
||||
longer
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count)
|
||||
longer
|
||||
View(longer)
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count) |>
|
||||
as.numeric(count)
|
||||
longer
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("cnt"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count)
|
||||
longer
|
||||
library(tidyverse)
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
#preprocessing for readme_df
|
||||
colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
||||
col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
||||
readme_df <- readme_df[,col_order]
|
||||
readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
readme_df$cnt_after_all <- str_split(gsub("[][]","", readme_df$after_all_cnt), ", ")
|
||||
readme_df$cnt_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_cnt), ", ")
|
||||
readme_df$cnt_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_cnt), ", ")
|
||||
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
||||
library(tidyverse)
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
#preprocessing for readme_df
|
||||
colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
||||
col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
||||
readme_df <- readme_df[,col_order]
|
||||
readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")
|
||||
library(tidyverse)
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
contributing_df <- read_csv("../final_data/deb_contrib_did.csv")
|
||||
#preprocessing for readme_df
|
||||
colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
||||
col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
||||
readme_df <- readme_df[,col_order]
|
||||
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
|
||||
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
|
||||
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
|
||||
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
|
||||
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
||||
readme_df = readme_df[,!(names(readme_df) %in% drop)]
|
||||
# as.numeric(unlist(test[1]))
|
||||
# test_two <- c()
|
||||
# iterator <- 0
|
||||
# for (entry in test) {
|
||||
# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
# print(as.numeric(unlist(entry)))
|
||||
# iterator <- iterator + 1
|
||||
# }
|
||||
# test_two
|
||||
#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step
|
||||
new_test <- head(readme_df, 1)
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("ct"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count)
|
||||
longer
|
||||
View(longer)
|
||||
library(tidyverse)
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
contributing_df <- read_csv("../final_data/deb_contrib_did.csv")
|
||||
#preprocessing for readme_df
|
||||
colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
||||
col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
||||
readme_df <- readme_df[,col_order]
|
||||
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
|
||||
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
|
||||
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
|
||||
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
|
||||
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
||||
readme_df = readme_df[,!(names(readme_df) %in% drop)]
|
||||
# as.numeric(unlist(test[1]))
|
||||
# test_two <- c()
|
||||
# iterator <- 0
|
||||
# for (entry in test) {
|
||||
# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
# print(as.numeric(unlist(entry)))
|
||||
# iterator <- iterator + 1
|
||||
# }
|
||||
# test_two
|
||||
#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step
|
||||
new_test <- head(readme_df, 1)
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("ct"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count)
|
||||
longer
|
||||
View(longer)
|
||||
longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window)))
|
||||
library(plyr)
|
||||
longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window)))
|
||||
View(longer)
|
||||
library(plyr)
|
||||
library(tidyverse)
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
contributing_df <- read_csv("../final_data/deb_contrib_did.csv")
|
||||
#preprocessing for readme_df
|
||||
colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new")
|
||||
col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new")
|
||||
readme_df <- readme_df[,col_order]
|
||||
readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ")
|
||||
readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ")
|
||||
readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ")
|
||||
readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ")
|
||||
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
|
||||
readme_df = readme_df[,!(names(readme_df) %in% drop)]
|
||||
# as.numeric(unlist(test[1]))
|
||||
# test_two <- c()
|
||||
# iterator <- 0
|
||||
# for (entry in test) {
|
||||
# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
# print(as.numeric(unlist(entry)))
|
||||
# iterator <- iterator + 1
|
||||
# }
|
||||
# test_two
|
||||
#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step
|
||||
new_test <- head(readme_df, 1)
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("ct"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count)
|
||||
longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window)))
|
||||
View(longer)
|
||||
longer <- ddply(longer, strsplit("window", split="_")[-1], transform, week=seq(from=0, by=1, length.out=length(window)))
|
||||
longer <- ddply(longer, strsplit(window, split="_")[-1], transform, week=seq(from=0, by=1, length.out=length(window)))
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("ct"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count) |>
|
||||
add_column(rel = gsub("^.*_", "", window))
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("ct"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count) |>
|
||||
add_column(rel = gsub("^.*_", "", "window"))
|
||||
View(longer)
|
||||
longer$rel <- gsub("^.*_", "", longer$window)
|
||||
View(longer)
|
||||
# as.numeric(unlist(test[1]))
|
||||
# test_two <- c()
|
||||
# iterator <- 0
|
||||
# for (entry in test) {
|
||||
# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
# print(as.numeric(unlist(entry)))
|
||||
# iterator <- iterator + 1
|
||||
# }
|
||||
# test_two
|
||||
#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step
|
||||
new_test <- head(readme_df, 1)
|
||||
new_testr$observation_type <- gsub("^.*_", "", new_test$window)
|
||||
# as.numeric(unlist(test[1]))
|
||||
# test_two <- c()
|
||||
# iterator <- 0
|
||||
# for (entry in test) {
|
||||
# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
|
||||
# print(as.numeric(unlist(entry)))
|
||||
# iterator <- iterator + 1
|
||||
# }
|
||||
# test_two
|
||||
#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step
|
||||
new_test <- head(readme_df, 1)
|
||||
longer <- new_test |>
|
||||
pivot_longer(cols = starts_with("ct"),
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count)
|
||||
longer$observation_type <- gsub("^.*_", "", longer$window)
|
||||
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
|
||||
View(longer)
|
||||
head(longer)
|
||||
sapply(longer, class)
|
||||
|
@ -1,5 +1,7 @@
|
||||
library(plyr)
|
||||
library(tidyverse)
|
||||
|
||||
|
||||
#set wd, read in data
|
||||
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
|
||||
readme_df <- read_csv("../final_data/deb_readme_did.csv")
|
||||
@ -36,4 +38,5 @@ longer <- new_test |>
|
||||
names_to = "window",
|
||||
values_to = "count") |>
|
||||
unnest(count)
|
||||
longer
|
||||
longer$observation_type <- gsub("^.*_", "", longer$window)
|
||||
longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
|
||||
|
Loading…
Reference in New Issue
Block a user