# Primary analysis for cross-sectional community metrics.
#
# Reads the project-level CSVs, derives the "mmt" ratios, and fits linear
# models of `underproduction_mean` (and of the mmt ratios themselves).
# Bare expressions such as `summary(...)` are kept so that their results
# print when the script is run interactively or through Rscript.
#
# The script deliberately does not call rm(list = ls()): wiping the caller's
# workspace is a surprising side effect. Run it in a fresh R session instead.

set.seed(424242)

library(readr)
library(ggplot2)
library(tidyverse)
library(texreg) # my little "lib"
library(chron)

# Helpers ----

# Ratio used for every "mmt" variable in this script: members of
# `double_weighted` count twice, members of `single_weighted` count once,
# and the sum is divided by `total` (by default the size of both groups).
mmt_score <- function(double_weighted, single_weighted,
                      total = double_weighted + single_weighted) {
  ((double_weighted * 2) + single_weighted) / total
}

# Add 0/1 columns `has_readme` / `has_contrib` telling whether each row's
# `upstream_vcs_link` occurs in the corresponding roster.
add_doc_flags <- function(df, readme_roster, contrib_roster) {
  df |>
    mutate(
      has_readme = as.numeric(
        upstream_vcs_link %in% readme_roster$upstream_vcs_link
      ),
      has_contrib = as.numeric(
        upstream_vcs_link %in% contrib_roster$upstream_vcs_link
      )
    )
}

# Split "date time" stamps on the space into a two-column character matrix
# (column 1 = date part, column 2 = time part). Missing stamps, or stamps
# without a time part, yield NA in the affected cell instead of relying on
# data.frame recycling.
split_event_datetime <- function(event_date) {
  # as.character() on date-time objects may omit the time part depending on
  # the R version and the values involved, so date-times are formatted
  # explicitly; other inputs are coerced as before.
  stamps <- if (inherits(event_date, "POSIXt")) {
    format(event_date, "%Y-%m-%d %H:%M:%S")
  } else {
    as.character(event_date)
  }
  pieces <- strsplit(stamps, " ", fixed = TRUE)
  cbind(
    vapply(pieces, `[`, character(1), 1L),
    vapply(pieces, `[`, character(1), 2L)
  )
}

# Add `formatted_event_time` (a chron date-time built from the split stamp)
# and `event_delta` (days between that time and July 6, 2020).
add_event_delta <- function(df, date_time_parts) {
  df |>
    mutate(
      formatted_event_time = chron(
        dates. = date_time_parts[, 1],
        times. = date_time_parts[, 2],
        format = c("y-m-d", "h:m:s")
      )
    ) |>
    mutate(
      event_delta = difftime(
        as.chron("2020-07-06"), formatted_event_time,
        units = "days"
      )
    )
}

# Load data ----

overall_data <- read_csv(
  "../final_data/deb_full_data.csv",
  show_col_types = FALSE
)
octo_data <- read_csv(
  "../final_data/deb_octo_data.csv",
  show_col_types = FALSE
)
readme_data <- read_csv(
  "../final_data/deb_readme_roster.csv",
  show_col_types = FALSE
)
contributing_data <- read_csv(
  "../final_data/deb_contribfile_roster.csv",
  show_col_types = FALSE
)

# The basic stuff for the overall data ----

# NOTE(review): a row with contributors + collaborators == 0 gives 0 / 0 = NaN
# here, which would turn the mean below into NaN -- confirm no such rows exist.
overall_data$mmt <- mmt_score(
  overall_data$collaborators,
  overall_data$contributors
)
mean(overall_data$mmt)
hist(overall_data$mmt, probability = TRUE)

# some new variables around age
#overall_data$new.age <- as.numeric(cut(overall_data$age_of_project/365, breaks=c(0,7.524197,10.323056,13.649367,17), labels=c(1,2,3,4)))
#table(overall_data$new.age)
#overall_data$new.age.factor <- as.factor(overall_data$new.age)

# scale() returns a one-column matrix; as.numeric() stores a plain vector in
# the tibble. The fitted models are the same either way.
overall_data$scaled_age <- as.numeric(scale(overall_data$age_of_project))

# model
mmtmodel1 <- lm(underproduction_mean ~ mmt + scaled_age, data = overall_data)
summary(mmtmodel1)
qqnorm(residuals(mmtmodel1))

# Octo data ----

# clean octo data: issue_mmt and wiki_mmt below divide by total_contrib
octo_data <- filter(octo_data, total_contrib != 0)

# below this is the analysis for the octo data
# age in years, binned into four groups coded 1-4 (ages outside (0, 17]
# become NA)
octo_data$new.age <- as.numeric(
  cut(
    octo_data$age_of_project / 365,
    breaks = c(0, 7.524197, 10.323056, 13.649367, 17),
    labels = c(1, 2, 3, 4)
  )
)
table(octo_data$new.age)
octo_data$new.age.factor <- as.factor(octo_data$new.age)
octo_data$scaled_age <- as.numeric(scale(octo_data$age_of_project))

# NOTE(review): the filter above is on total_contrib, while this ratio divides
# by contributors + collaborators -- confirm that sum cannot be zero here.
octo_data$mmt <- mmt_score(octo_data$collaborators, octo_data$contributors)
mean(octo_data$mmt)
hist(octo_data$mmt)
head(octo_data)

# getting the mmt-equivalent for both issue activity as well as wiki contrib
# activity
octo_data$issue_mmt <- mmt_score(
  octo_data$issue_contrib_count,
  octo_data$total_contrib - octo_data$issue_contrib_count,
  total = octo_data$total_contrib
)
# right skewed data, need to transform
octo_data$sqrt_issue_mmt <- sqrt(octo_data$issue_mmt)
octo_data$wiki_mmt <- mmt_score(
  octo_data$wiki_contrib_count,
  octo_data$total_contrib - octo_data$wiki_contrib_count,
  total = octo_data$total_contrib
)
hist(octo_data$wiki_mmt)

# Governance-document indicators ----

# getting some of the information in about whether projects have specific files
readme_did_roster <- read_csv(
  "../final_data/deb_readme_did.csv",
  show_col_types = FALSE
)
contrib_did_roster <- read_csv(
  "../final_data/deb_contrib_did.csv",
  show_col_types = FALSE
)
octo_data <- add_doc_flags(octo_data, readme_did_roster, contrib_did_roster)
overall_data <- add_doc_flags(
  overall_data, readme_did_roster, contrib_did_roster
)

# Models for the octo data ----

# below are the models for the octo data, there should be analysis for each one
octo_mmtmodel1 <- lm(
  underproduction_mean ~ mmt + scaled_age + has_readme + has_contrib,
  data = octo_data
)
summary(octo_mmtmodel1)

issue_mmtmodel1 <- lm(
  underproduction_mean ~ issue_mmt + scaled_age + has_readme + has_contrib,
  data = octo_data
)
summary(issue_mmtmodel1)
qqnorm(residuals(issue_mmtmodel1))

wiki_mmtmodel1 <- lm(
  underproduction_mean ~ wiki_mmt + scaled_age + has_readme + has_contrib,
  data = octo_data
)
summary(wiki_mmtmodel1)
qqnorm(residuals(wiki_mmtmodel1))

# these next three are looking at mmt as an outcome of other factors
mmt_outcome_model <- lm(
  mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib),
  data = octo_data
)
summary(mmt_outcome_model)

all_mmt_outcome_model <- lm(
  mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib),
  data = overall_data
)
summary(all_mmt_outcome_model)

govdoc_issuesmmt <- lm(
  issue_mmt ~ scaled_age + as.factor(has_readme) + as.factor(has_contrib),
  data = octo_data
)
summary(govdoc_issuesmmt)

# LaTeX table of the three underproduction models; the custom coefficient
# names follow the order in which terms first appear across M1, M2, M3.
texreg(
  list(octo_mmtmodel1, issue_mmtmodel1, wiki_mmtmodel1),
  stars = NULL,
  digits = 2,
  custom.model.names = c("M1: MMT", "M2: issue contrib.", "M3: wiki contrib."),
  custom.coef.names = c(
    "(Intercept)", "MMT", "scaled_age", "has readme", "has contrib",
    "Issue MMT", "Wiki MMT"
  ),
  use.packages = FALSE,
  table = FALSE,
  ci.force = TRUE
)

# README / CONTRIBUTING analysis ----

# below here is the analysis for the readme.md data
cor.test(octo_data$mmt, octo_data$has_readme)
cor.test(octo_data$mmt, octo_data$has_contrib)
cor.test(octo_data$has_readme, octo_data$has_contrib)

# using the groupings and estimates from the ranef coefficients from D as data
# pulling in the group data for the ranef coefficients
rm_grouping <- read_csv("051224_readme_grouped.csv", show_col_types = FALSE)
contrib_grouping <- read_csv(
  "051224_contrib_grouped.csv",
  show_col_types = FALSE
)

rm_grouping <- rm_grouping |>
  rename(upstream_vcs_link = level) |>
  mutate(factored_group = as.factor(ranef_grouping))
contrib_grouping <- contrib_grouping |>
  rename(upstream_vcs_link = level) |>
  mutate(factored_group = as.factor(ranef_grouping))

grouped_rm <- left_join(rm_grouping, overall_data, by = "upstream_vcs_link")
grouped_contrib <- left_join(
  contrib_grouping, overall_data,
  by = "upstream_vcs_link"
)

# Same files as readme_did_roster / contrib_did_roster, so the tables loaded
# above are reused instead of being read from disk a second time.
rm_did <- readme_did_roster
contrib_did <- contrib_did_roster
grouped_rm <- left_join(grouped_rm, rm_did, by = "upstream_vcs_link")
grouped_contrib <- left_join(
  grouped_contrib, contrib_did,
  by = "upstream_vcs_link"
)

# How long each project has had the document ----

# also looking at how long each project has had a specific governance document
# calculate in terms of July 6, 2020 (when underproduction metrics were
# collected)
dtparts <- split_event_datetime(grouped_rm$event_date)

# exploratory checks, kept from the original script
thetimes <- chron(
  dates. = dtparts[, 1],
  times. = dtparts[, 2],
  format = c("y-m-d", "h:m:s")
)
typeof(thetimes)
how_long_has_file <- difftime(
  as.Date("2020-07-06"), as.Date(grouped_rm$event_date),
  units = "days"
)

grouped_rm <- add_event_delta(grouped_rm, dtparts)

# now doing it for the contrib_data
contrib_dtparts <- split_event_datetime(grouped_contrib$event_date)
grouped_contrib <- add_event_delta(grouped_contrib, contrib_dtparts)

# Grouping models ----

# test with linear model, there should be an interaction between how long the
# project has had a document and its grouping, no?
grouping_model_rm <- lm(
  underproduction_mean ~ event_delta * factored_group + mmt + scaled_age,
  data = grouped_rm
)
summary(grouping_model_rm)
qqnorm(residuals(grouping_model_rm))

grouping_model_contrib <- lm(
  underproduction_mean ~ event_delta * factored_group + mmt + scaled_age,
  data = grouped_contrib
)
summary(grouping_model_contrib)
qqnorm(residuals(grouping_model_contrib))