diff --git a/R/.Rhistory b/R/.Rhistory index 4db6dfa..9187d92 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,439 +1,3 @@ -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -data1$new.age.factor <- factor(data1$new.age, levels=c(1,2,3,4), labels=c("0-9y", "9-12y", "12-15y","15-16y")) -g2 <- ggplot(data1, aes(x=mmt, y=up.fac.mean)) + -geom_smooth(mapping = aes(x=mmt, y=up.fac.mean, color=new.age.factor), -method='lm', formula= y~x, se=FALSE)+ -labs(x="MMT", y="Mean Underproduction Factor", color = "Project Age Group") + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g2 -library(readr) -library(ggplot2) -library(tidyverse) -data7 <- read_csv('../final_data/kk_final_octo.csv', show_col_types = FALSE) -median(data7$underproduction_mean) -length(which(data7$underproduction_low < 0)) -364 / 3843 -data5 <- read_csv('../kk_final_readme_roster.csv', show_col_types=FALSE) -data5 <- read_csv('..final_data/kk_final_readme_roster.csv', show_col_types=FALSE) -data5 <- read_csv('../final_data/kk_final_readme_roster.csv', show_col_types=FALSE) -length(which(data5$underproduction_low < 0)) -227/2695 -#primary analysis for cross-sectional community metrics -overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE) -rm(list=ls()) -set.seed(424242) -library(readr) -library(ggplot2) -library(tidyverse) -#primary analysis for cross-sectional community metrics -overall_data <- read_csv('../final_data/deb_full_data.csv',show_col_types = FALSE) -octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) -overall_data$mmt <- (((oveall_data1$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) -overall_data$mmt <- (((overall_data1$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) -overall_data$mmt <- (((overall_data$collaborators * 2)+ overall_data$contributors) / (overall_data$contributors + overall_data$collaborators)) -mean(overall_data1$mmt) -mean(overall_data$mmt) -hist(overall_data$mmt, probability = TRUE) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) -table(data1$new.age) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data1$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age) -hist(overall_data$new.age.factor) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7,11,13,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age.factor) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7,11,13,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age.factor) -hist(overall_data$new.age) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,8,11,13,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,8,11,15,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,8,11,14,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7,11,14,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7,10,14,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7,10,13,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age) -data1$new.age <- as.numeric(cut(data1$age_of_project/365, breaks=c(0,9,12,15,17), labels=c(1,2,3,4))) -age_vector <- overall_data$age_of_project/365 -order(age_vector) -order(age_vector) -quartile(age_vector) -quantile(age_vector) -overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(overall_data$new.age) -overall_data$new.age.factor <- as.factor(overall_data$new.age) -hist(overall_data$new.age) -1159/5105 -1391/5105 -1277/5105 -1276/510 -1276/5105 -#shows the cross-age downward slopes for all underproduction averages in the face of MMT -g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), -method='lm', formula= y~x) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() -g3 -#shows the cross-age downward slopes for all underproduction averages in the face of MMT -g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_point(mapping = aes(color=new.age.factor)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), -method='lm', formula= y~x) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() -g3 -#shows the cross-age downward slopes for all underproduction averages in the face of MMT -g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), -method='lm', formula= y~x) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() -g3 -mmtmodel1 <- lm(up.fac.mean ~ mmt + new.age.factor, data=overall_data) -mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=overall_data) -summary(mmtmodel1) -#shows the cross-age downward slopes for all underproduction averages in the face of MMT -g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), -method='lm', formula= y~x) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g3 -#shows the cross-age downward slopes for all underproduction averages in the face of MMT -g3 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), formula= y~x) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g3 -g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(0.05, 0.05), legend.justification = c("left", "bottom")) -g4 -g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(0.0, 0.0), legend.justification = c("left", "bottom")) -g4 -g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(0.0, 0.0), legend.justification = c("right", "top")) -g4 -g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(1.0, 1.0), legend.justification = c("right", "top")) -g4 -g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(0.9, 1.0), legend.justification = c("right", "top")) -g4 -g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor)) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top")) -g4 -g4 <- ggplot(overall_data, aes(x=mmt, y=underproduction_mean)) + -geom_smooth(mapping = aes(x=mmt, y=underproduction_mean, color=new.age.factor), se=FALSE) + -xlab("MMT") + -ylab("Underproduction Factor") + -theme_bw() + -theme(legend.position = c(0.9, 0.9), legend.justification = c("right", "top")) -g4 -min(overall_data$underproduction_mean) -max(overall_data$underproduction_mean) -octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) -octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(octo_data$new.age) -999 / 3842 -1139/3842 -955/3842 -747/3842 -octo_data$new.age.factor <- as.factor(octo_data$new.age) -hist(overall_data$new.age) -hist(octo_data$new.age) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(overall_data$mmt) -mean(octo_data$mmt) -hist(octo_data$mmt, probability = TRUE) -head(octo_data) -octo_data$issue_mmt <- (((octo_data$issue_contrib_count * 2)+ octo_data$api_contrib_count) / (octo_data$api_contrib_count)) -hist(octo_data$issue_mmt, probability = TRUE) -octo_data$issue_mmt <- (((octo_data$issue_contrib_count * 2)+ octo_data$api_contrib_count + octo_data$wiki_contrib_count + octo_data$file_contrib_count) / (octo_data$api_contrib_count + + octo_data$wiki_contrib_count + octo_data$issue_contrib_count + octo_data$file_contrib_count)) -hist(octo_data$issue_mmt, probability = TRUE) -octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data) -summary(octo_mmtmodel1) -issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data=octo_data) -summary(issue_mmtmodel1) -octo_data$wiki_mmt <- (((octo_data$wiki_contrib_count * 2)+ octo_data$api_contrib_count + octo_data$wiki_contrib_count + octo_data$file_contrib_count) / (octo_data$api_contrib_count + + octo_data$wiki_contrib_count + octo_data$issue_contrib_count + octo_data$file_contrib_count)) -hist(octo_data$wiki_mmt, probability = TRUE) -wiki_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data=octo_data) -summary(wiki_mmtmodel1) -wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data) -summary(wiki_mmtmodel1) -texreg(list(mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, -custom.model.names=c( 'MMT (Overall Dataset)'), -custom.coef.names=c('(Intercept)', 'MMT', 'Age-2', 'Age-3', 'Age-4'), -use.packages=FALSE, table=FALSE, ci.force = TRUE) -source('powerAnalysis.R') #my little "lib" -texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, -custom.model.names=c( 'M1: augm. formality','M2: MMT', 'M3: milestones' ), -custom.coef.names=c('(Intercept)', 'Augmented formality', 'MMT', 'Age-2', 'Age-3', 'Age-4', 'Milestones'), -use.packages=FALSE, table=FALSE, ci.force = TRUE) -texreg(list(mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits=2, -custom.model.names=c( 'MMT (Overall Dataset)'), -custom.coef.names=c('(Intercept)', 'MMT', 'Age-2', 'Age-3', 'Age-4'), -use.packages=FALSE, table=TRUE, ci.force = TRUE) -readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE) -readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE) -#below here is the analysis for the readme data -readme_data$new.age <- as.numeric(cut(readme_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(readme_data$new.age) -readme_data$new.age.factor <- as.factor(readme_data$new.age) -hist(readme_data$new.age) -637 / 2694 -676 / 2694 -725 / 2694 -656 / 2694 -contributing_data <- read_csv("../final_data/deb_contribfile_roster.csv", show_col_types = FALSE) -#below here is the analysis for the contributing.md files -readme_data$new.age <- as.numeric(cut(readme_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(readme_data$new.age) -readme_data$new.age.factor <- as.factor(readme_data$new.age) -#below here is the analysis for the contributing.md files -contributing_data$new.age <- as.numeric(cut(contributing_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(contributing_data$new.age) -contributing_data$new.age.factor <- as.factor(contributing_data$new.age) -hist(contributing_data$new.age) -76/528 -119 / 528 -171/ 528 -162 / 528 -octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE) -rm(list=ls()) -set.seed(424242) -library(readr) -library(ggplot2) -library(tidyverse) -readme_data <- read_csv("../final_data/deb_readme_roster.csv", show_col_types = FALSE) -octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE) -# below this is the analysis for the octo data -octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE) -# below this is the analysis for the octo data -octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(octo_data$new.age) -octo_data$new.age.factor <- as.factor(octo_data$new.age) -hist(octo_data$new.age) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(octo_data$mmt) -hist(octo_data$mmt, probability = TRUE) -head(octo_data) -#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts -#i.e. needs to be a total contrib number that is not attached to the high level counts -octo_data$issue_mmt <- (((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib)) -hist(octo_data$issue_mmt, probability = TRUE) -#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts -#i.e. needs to be a total contrib number that is not attached to the high level counts -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt, probability = TRUE) -max(octo_data$issue_mmt) -max(octo_data$issue_mmt) -median(octo_data$issue_mmt) -median(octo_data$issue_mmt) -min(octo_data$issue_mmt) -hist(octo_data$total_contrib) -mean(octo_data$total_contrib) -median(octo_data$total_contrib) -median(octo_data$contributors) -median(octo_data$collaborators) -median(octo_data$total_contrib) -#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts -#i.e. needs to be a total contrib number that is not attached to the high level counts -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt, probability = TRUE) -hist(octo_data$issue_mmt) -octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$wiki_mmt) -min(octo_data$wiki_mmt) -median(octo_data$wiki_mmt) -#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts -#i.e. needs to be a total contrib number that is not attached to the high level counts -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(octo_data$mmt) -hist(octo_data$mmt) -median(octo_data$total_contrib) -#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts -#i.e. needs to be a total contrib number that is not attached to the high level counts -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt) -max(octo_data$issue_mmt) -maximum(octo_data$issue_mmt) -octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$wiki_mmt) -median(octo_data$wiki_mmt) -#below are the models for the octo data, there should be analysis for each one -octo_mmtmodel1 <- lm(underproduction_mean ~ mmt + new.age.factor, data=octo_data) -summary(octo_mmtmodel1) -#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts -#i.e. needs to be a total contrib number that is not attached to the high level counts -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt) -maximum(octo_data$issue_mmt) -typeof(octo_data$issue_mmt) -length(octo_data$issue_mmt) -#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts -#i.e. needs to be a total contrib number that is not attached to the high level counts -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt) -length(octo_data$issue_mmt) -sum(octo_data$issue_mmt > 2) -length(octo_data$issue_mmt > 2) -length(octo_data$issue_mmt > 2.0) -median(octo_data$wiki_mmt) -typeof(octo_data$issue_mmt) -median(octo_data$issue_mmt, na.rm = TRUE) -median(octo_data$issue_contrib_count) -octo_data <- na.omit(octo_data$issue_contrib_count) -median(octo_data$issue_contrib_count) -octo_data <- read_csv('../new_denom_032624.csv', show_col_types = FALSE) -# below this is the analysis for the octo data -octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(octo_data$new.age) -octo_data$new.age.factor <- as.factor(octo_data$new.age) -hist(octo_data$new.age) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(octo_data$mmt) -hist(octo_data$mmt) -head(octo_data) -median(octo_data$issue_contrib_count) -octo_data <- na.omit(octo_data) -median(octo_data$issue_contrib_count) -#TODO: the counts here aren't unique, need to go back and calculate so that no overlap between the counts -#i.e. needs to be a total contrib number that is not attached to the high level counts -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt) -median(octo_data$issue_mmt, na.rm = TRUE) -length(octo_data$issue_mmt > 2.0) -length(octo_data$issue_mmt > 2.0) -length(octo_data$issue_mmt > 2) -median(octo_data$issue_mmt) -, na.rm = TRUE -median(octo_data$issue_mmt, na.rm = TRUE) -length(octo_data$issue_mmt > 2) -length(octo_data$issue_mmt > 2) -length(octo_data$issue_mmt > 2.0) -max(octo_data$issue_mmt, na.rm = TRUE) -octo_data$new_mmt <- (((octo_data$collaborators * 2)+ (octo_data$total_contrib - octo_data$collaborators)) / (octo_data$total_contrib)) -hist(octo_data$new_mmt) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(octo_data$mmt) -hist(octo_data$mmt) -#TODO: there's an issue with calculating this but somehow not an issue with the wiki one -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt) -octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$wiki_mmt) -hist(octo_data$issue_mmt) -length(octo_data$issue_mmt > 2.0) -octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib)] -octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),] -octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]$issue_contrib_count -octo_data <- read_csv('../new_denom_032624_stripped.csv', show_col_types = FALSE) -# below this is the analysis for the octo data -octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(octo_data$new.age) -octo_data$new.age.factor <- as.factor(octo_data$new.age) -hist(octo_data$new.age) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(octo_data$mmt) -hist(octo_data$mmt) -head(octo_data) -octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),]$issue_contrib_count -octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),] -octo_data[which(octo_data$issue_contrib_count > octo_data$total_contrib),] -octo_data <- read_csv('../new_denom_032624_stripped.csv', show_col_types = FALSE) -# below this is the analysis for the octo data -octo_data$new.age <- as.numeric(cut(octo_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4))) -table(octo_data$new.age) -octo_data$new.age.factor <- as.factor(octo_data$new.age) -hist(octo_data$new.age) -octo_data$mmt <- (((octo_data$collaborators * 2)+ octo_data$contributors) / (octo_data$contributors + octo_data$collaborators)) -mean(octo_data$mmt) -hist(octo_data$mmt) -head(octo_data) -octo_data <- octo_data[which(octo_data$issue_contrib_count <= octo_data$total_contrib),] -#TODO: there's an issue with calculating this but somehow not an issue with the wiki one -octo_data$issue_mmt <- ((octo_data$issue_contrib_count * 2) + (octo_data$total_contrib - octo_data$issue_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$issue_mmt) -max(octo_data$issue_mmt, na.rm = TRUE) -length(octo_data$issue_mmt > 2.0) -issue_mmtmodel1 <- lm(underproduction_mean ~ issue_mmt + new.age.factor, data=octo_data) -summary(issue_mmtmodel1) -wiki_mmtmodel1 <- lm(underproduction_mean ~ wiki_mmt + new.age.factor, data=octo_data) -summary(wiki_mmtmodel1) -write.csv(octo_data, "new_octo.csv", row.names = FALSE) -octo_data <- read_csv('../final_data/deb_octo_data.csv', show_col_types = FALSE) -qqnorm(octo_data$issue_mmt) -octo_data$wiki_mmt <- ((octo_data$wiki_contrib_count * 2) + (octo_data$total_contrib - octo_data$wiki_contrib_count)) / (octo_data$total_contrib) -hist(octo_data$wiki_mmt) -median(octo_data$wiki_mmt) -qqnorm(octo_data$wiki_mmt) -qqnorm(octo_data$issue_mmt) -qqnorm(octo_data$wiki_mmt) -qqnorm(log(octo_data$issue_mmt)) -qqnorm(octo_data$issue_mmt) -qqnorm(log(octo_data$issue_mmt)) qqnorm(octo_data$issue_mmt) qqnorm(log(octo_data$issue_mmt)) qqnorm(residuals(octo_data$issue_mmt)) @@ -510,3 +74,439 @@ texreg(list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1), stars=NULL, digits custom.model.names=c( 'M1: MMT','M2: issue contrib.', 'M3: wiki_contrib.' ), custom.coef.names=c('(Intercept)', 'MMT', 'Issues', 'Age-2', 'Age-3', 'Age-4', 'Wiki'), use.packages=FALSE, table=FALSE, ci.force = TRUE) +glimpse(readme_df) +library(tidyverse) +#set wd, read in data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +contributing_df <- read_csv("../final_data/deb_contrib_did.csv") +glimpse(readme_df) +head(readme_df) +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +glimpse(readme_df) +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "after_all_cnt", "before_mrg_cnt", "after_mrg_cnt", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +glimpse(readme_df) +#TODO: turn character type into vector of numbers +str_split(test, ", ") +test <- "[0, 0, 0, 0]" +#TODO: turn character type into vector of numbers +str_split(test, ", ") +#TODO: turn character type into vector of numbers +str_split(gsub("[][]","", test), ", ") +readme_df %>% add_column(cnt_before_all = str_split(gsub("[][]","", before_all_count), ", ")) +readme_df %>% mutate(cnt_before_all = str_split(gsub("[][]","", before_all_count), ", ")) +readme_df %>% mutate("cnt_before_all" = str_split(gsub("[][]","", "before_all_count"), ", ")) +head(readme_df$before_all_cnt) +head(readme_df$cnt_before_all) +readme_df %>% mutate(cnt_before_all = str_split(gsub("[][]","", "before_all_count"), ", ")) +head(readme_df$cnt_before_all) +View(readme_df) +View(readme_df) +readme_df$cnt_before_all +readme_df %>% mutate(cnt_before_all = str_split(gsub("[][]","", "before_all_count"), ", ")) +str_split(gsub("[][]","", readme_df$before_all_count), ", ") +#str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +readme_df %>% mutate(cnt_before_all = str_split(gsub("[][]","", "before_all_cnt"), ", ")) +readme_df$cnt_before_all +#str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +readme_df %>% mutate("cnt_before_all" = str_split(gsub("[][]","", "before_all_cnt"), ", ")) +readme_df$cnt_before_all +str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +readme_df$cnt_before_all +readme_df$cnt_after_all <- str_split(gsub("[][]","", readme_df$after_all_cnt), ", ") +readme_df$cnt_after_all +readme_df$cnt_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_cnt), ", ") +readme_df$cnt_before_mrg +readme_df$cnt_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_cnt), ", ") +readme_df$cnt_after_mrg +#TODO: figure out if one needs to expand the data into a different dataframe, and if so how +readme_df <- subset(readme_df, select = -c("before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt")) +drop <- c("before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +View(readme_df) +library(tidyverse) +#set wd, read in data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "after_all_cnt", "before_mrg_cnt", "after_mrg_cnt", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +glimpse(readme_df) +readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +readme_df$cnt_before_all <- as.numeric(readme_df$cnt_before_all) +View(readme_df) +readme_df$cnt_before_all +type(readme_df$cnt_before_all) +typeof(readme_df$cnt_before_all) +typeof(readme_df$cnt_before_all[0]) +readme_df$cnt_before_all <- unlist(str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")) +readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +typeof(readme_df$cnt_before_all) +typeof(readme_df$cnt_before_all[[0]]) +typeof(readme_df$cnt_before_all[0]) +sapply(readme_df, class) +readme_df[,lapply(readme_df, unlist)] +readme_df[,lapply(readme_df$cnt, unlist)] +readme_df[,lapply(readme_df$cnt_before_all, unlist)] +typeof(readme_df$cnt_before_all[0]) +View(readme_df) +View(readme_df) +readme_df$cnt_before_all <- as.numeric(str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")) +readme_df$cnt_before_all <- as.numeric(str_split(gsub("[][]","", readme_df$before_all_cnt), ", ")[[1]]) +readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +typeof(readme_df$cnt_before_all[0]) +typeof(readme_df$cnt_before_all[0][0]) +readme_df$cnt_before_all[0] +unlist(readme_df$cnt_before_all[0]) +readme_df$cnt_before_all[0] +readme_df$cnt_before_all +test <- readme_df$cnt_before_all +test +as.numeric(test) +test[0] +test[1] +as.numeric(test[1]) +unlist(test[1]) +as.numeric(unlist(test[1])) +test2 <- as.numeric(unlist(test)) +test2 +print(entry) +for (entry in test) { +print(entry) +} +print(as.numeric(unlist(entry))) +for (entry in test) { +print(as.numeric(unlist(entry))) +} +test_two <- append(test_two, as.numeric(unlist(entry))) +print(as.numeric(unlist(entry))) +for (entry in test) { +test_two <- append(test_two, as.numeric(unlist(entry))) +print(as.numeric(unlist(entry))) +} +test_two <- c() +for (entry in test) { +test_two <- append(test_two, as.numeric(unlist(entry))) +print(as.numeric(unlist(entry))) +} +readme_df$cnt_before_all <- as.numeric(readme_df$cnt_before_all) +test_two +readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +iterator <- 0 +for (entry in test) { +readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +print(as.numeric(unlist(entry))) +iterator <- iterator + 1 +} +View(readme_df) +library(tidyverse) +#set wd, read in data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_cnt", "after_all_cnt", "before_mrg_cnt", "after_mrg_cnt", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +glimpse(readme_df) +head(readme_df) +#this has to happen on the analysis side of things for a given row, it cannot happen on the storage side +#this is a conversation of whether or not the data should be saved in terms of +readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +# test <- readme_df$cnt_before_all +# as.numeric(unlist(test[1])) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +readme_df$cnt_after_all <- str_split(gsub("[][]","", readme_df$after_all_cnt), ", ") +readme_df$cnt_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_cnt), ", ") +readme_df$cnt_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_cnt), ", ") +drop <- c("before_all_cnt", "before_mrg_cnt", "after_all_cnt", "after_mrg_cnt") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +new_test <- head(readme_df, 1) +View(new_test) +write.csv(readme_df, "r_readme_did.csv", row.names=FALSE) +# as.numeric(unlist(test[1])) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +new_test <- head(readme_df, 1) +View(new_test) +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count")) +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") +longer +View(longer) +longer |> unnest(count) +new_longer <- longer |> unnest(count) +View(new_longer) +longer +View(new_longer) +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(as.numeric(unlist(count))) +longer +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count)) +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +as.numeric(unlist(count)) +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +unlist(count) +View(new_longer) +new_longer +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +unlist(count) |> +as.numeric(count) +View(new_longer) +new_longer +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +unlist(count) |> +as.numeric(count) +longer +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +unlist(count) +longer +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer +View(longer) +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +as.numeric(count) +longer +longer <- new_test |> +pivot_longer(cols = starts_with("cnt"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer +library(tidyverse) +#set wd, read in data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +#preprocessing for readme_df +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +readme_df$cnt_after_all <- str_split(gsub("[][]","", readme_df$after_all_cnt), ", ") +readme_df$cnt_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_cnt), ", ") +readme_df$cnt_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_cnt), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +library(tidyverse) +#set wd, read in data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +#preprocessing for readme_df +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") +library(tidyverse) +#set wd, read in data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +contributing_df <- read_csv("../final_data/deb_contrib_did.csv") +#preprocessing for readme_df +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") +readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") +readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") +readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +# as.numeric(unlist(test[1])) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +new_test <- head(readme_df, 1) +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer +View(longer) +library(tidyverse) +#set wd, read in data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +contributing_df <- read_csv("../final_data/deb_contrib_did.csv") +#preprocessing for readme_df +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") +readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") +readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") +readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +# as.numeric(unlist(test[1])) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +new_test <- head(readme_df, 1) +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer +View(longer) +longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window))) +library(plyr) +longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window))) +View(longer) +library(plyr) +library(tidyverse) +#set wd, read in data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +contributing_df <- read_csv("../final_data/deb_contrib_did.csv") +#preprocessing for readme_df +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") +readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") +readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") +readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +# as.numeric(unlist(test[1])) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +new_test <- head(readme_df, 1) +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window))) +View(longer) +longer <- ddply(longer, strsplit("window", split="_")[-1], transform, week=seq(from=0, by=1, length.out=length(window))) +longer <- ddply(longer, strsplit(window, split="_")[-1], transform, week=seq(from=0, by=1, length.out=length(window))) +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +add_column(rel = gsub("^.*_", "", window)) +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) |> +add_column(rel = gsub("^.*_", "", "window")) +View(longer) +longer$rel <- gsub("^.*_", "", longer$window) +View(longer) +# as.numeric(unlist(test[1])) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +new_test <- head(readme_df, 1) +new_testr$observation_type <- gsub("^.*_", "", new_test$window) +# as.numeric(unlist(test[1])) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +new_test <- head(readme_df, 1) +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +View(longer) +head(longer) +sapply(longer, class) diff --git a/R/didCleaning.R b/R/didCleaning.R index b9e5d15..2722467 100644 --- a/R/didCleaning.R +++ b/R/didCleaning.R @@ -1,5 +1,7 @@ +library(plyr) library(tidyverse) + #set wd, read in data try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) readme_df <- read_csv("../final_data/deb_readme_did.csv") @@ -36,4 +38,5 @@ longer <- new_test |> names_to = "window", values_to = "count") |> unnest(count) -longer +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))