diff --git a/R/.Rhistory b/R/.Rhistory index 54e4a59..adb1424 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,293 +1,3 @@ -contrib_readme_model <- load(file = "final_models/0623_pop_rm_contrib.rda") -contrib_readme_model <- load(file = "final_models/0623_pop_rm_contrib.rda") -contrib_readme_model <- load("final_models/0623_pop_rm_contrib.rda") -contrib_readme_model <- source("final_models/0623_pop_rm_contrib.rda") -contrib_readme_model <- readRDS("final_models/0623_pop_rm_contrib.rda") -contrib_readme_model <- readRDS("final_models/0623_pop_contrib_collab.rda") -collab_readme_model <- readRDS("final_models/0623_pop_rm_collab.rda") -texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, -custom.model.names=c( 'collab','contrib.' ), -custom.coef.names=c('(Intercept)', 'after_introduction'), -use.packages=FALSE, table=FALSE, ci.force = TRUE) -library(texreg) -texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, -custom.model.names=c( 'collab','contrib.' ), -custom.coef.names=c('(Intercept)', 'after_introduction'), -use.packages=FALSE, table=FALSE, ci.force = TRUE) -texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, -custom.model.names=c( 'collab','contrib.' ), -custom.coef.names=c('(Intercept)', 'after_introduction' 'etc'), -texreg(list(collab_readme_model, contrib_readme_model), stars=NULL, digits=2, -custom.model.names=c( 'collab','contrib.' ), -custom.coef.names=c('(Intercept)', 'after_introduction', 'etc'), -use.packages=FALSE, table=FALSE, ci.force = TRUE) -library(tidyverse) -library(plyr) -library(stringr) -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -#load in data -contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") -readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") -#some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = ends_with("new"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -mutate(after_doc = as.numeric(str_detect(window, "after"))) |> -mutate(is_collab = as.numeric(str_detect(window, "collab"))) -return(longer) -} -expanded_readme_data <- expand_timeseries(readme_df[1,]) -for (i in 2:nrow(readme_df)){ -expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) -} -expanded_contrib_data <- expand_timeseries(contrib_df[1,]) -for (i in 2:nrow(contrib_df)){ -expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) -} -expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) -expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) -expanded_readme_data$logcount <- log(expanded_readme_data$count) -expanded_contrib_data$logcount <- log(expanded_contrib_data$count) -#breaking out the types of population counts -collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] -contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] -collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] -contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] -simple_collab_readme_model <- glm.nb(count ~ as.factor(after_doc), data=collab_pop_readme) -summary(simple_collab_readme_model) -anova(simple_collab_readme_model, collab_readme_model) -summary(collab_readme_model) -#load in data -full_df <- read_csv("../final_data/deb_full_data.csv") -contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") -contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link") -View(contrib_df) -View(contrib_df) -readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") -readme_df <- merge(full_df, readme_df, by="upstream_vcs_link") -# age is calculated against December 11, 2023 -contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) -View(contrib_df) -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) -View(contrib_df) -View(contrib_df) -View(readme_df) -readme_df <- readme_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) -View(readme_df) -collab_readme_model_plus <- glmer.nb(log1pcount ~ as.factor(after_doc) + event_date + (after_doc| upstream_vcs_link), data=collab_pop_readme) -#some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = ends_with("new"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -mutate(after_doc = as.numeric(str_detect(window, "after"))) |> -mutate(is_collab = as.numeric(str_detect(window, "collab"))) -return(longer) -} -expanded_readme_data <- expand_timeseries(readme_df[1,]) -for (i in 2:nrow(readme_df)){ -expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) -} -expanded_contrib_data <- expand_timeseries(contrib_df[1,]) -for (i in 2:nrow(contrib_df)){ -expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) -} -expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) -expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) -expanded_readme_data$logcount <- log(expanded_readme_data$count) -expanded_contrib_data$logcount <- log(expanded_contrib_data$count) -#breaking out the types of population counts -collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] -contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] -collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] -contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] -library(tidyverse) -library(plyr) -library(stringr) -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -#load in data -full_df <- read_csv("../final_data/deb_full_data.csv") -contrib_df <- read_csv("../final_data/deb_contrib_pop_change.csv") -View(contrib_df) -readme_df <- read_csv("../final_data/deb_readme_pop_change.csv") -contrib_df <- merge(full_df, contrib_df, by="upstream_vcs_link") -readme_df <- merge(full_df, readme_df, by="upstream_vcs_link") -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) + -mutate(event_date_days = as.Date("2024-06-24") - event_date) + -readme_df <- readme_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) + -mutate(event_date_days = as.Date("2024-06-24") - event_date) -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) + -mutate(event_date_days = diff.Date(as.Date("2023-12-11"),event_date, units = "days")) -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) |> -mutate(event_date_days = diff.Date(as.Date("2023-12-11"),event_date, units = "days")) -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) |> -mutate(event_date_days = diff.Date(as.Date("2024-06-24"),event_date, units = "days")) -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) |> -mutate(event_date_days = diff.Date(as.Date("2024-06-24"),as.Date(event_date), units = "days")) -View(contrib_df) -View(contrib_df) -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) |> -mutate(event_date_days = diff.Date(as.Date("2024-06-24"),as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), units = "days")) -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) |> -mutate(event_date_days = -as.numeric( -difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S") -as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), -# age is calculated against December 11, 2023 -contrib_df <- contrib_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) |> -mutate(event_date_days = -as.numeric( -difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), -as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), -units = "days"))) -View(contrib_df) -readme_df <- readme_df |> -mutate(start_date = as.Date("2023-12-11") - age_of_project) |> -mutate(event_date_days = -as.numeric( -difftime(as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), -as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), -units = "days"))) -#some expansion needs to happens for each project -expand_timeseries <- function(project_row) { -longer <- project_row |> -pivot_longer(cols = ends_with("new"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -mutate(after_doc = as.numeric(str_detect(window, "after"))) |> -mutate(is_collab = as.numeric(str_detect(window, "collab"))) -return(longer) -} -expanded_readme_data <- expand_timeseries(readme_df[1,]) -for (i in 2:nrow(readme_df)){ -expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) -} -expanded_contrib_data <- expand_timeseries(contrib_df[1,]) -for (i in 2:nrow(contrib_df)){ -expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) -} -expanded_readme_data$log1pcount <- log1p(expanded_readme_data$count) -expanded_contrib_data$log1pcount <- log1p(expanded_contrib_data$count) -expanded_readme_data$logcount <- log(expanded_readme_data$count) -expanded_contrib_data$logcount <- log(expanded_contrib_data$count) -#breaking out the types of population counts -collab_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 1),] -contrib_pop_readme <- expanded_readme_data[which(expanded_readme_data$is_collab == 0),] -collab_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 1),] -contrib_pop_contrib <- expanded_contrib_data[which(expanded_contrib_data$is_collab == 0),] -#import models -library(lme4) -library(optimx) -collab_readme_model_plus <- glmer.nb(log1pcount ~ as.factor(after_doc) + event_date_days + (after_doc| upstream_vcs_link), data=collab_pop_readme) -anova(collab_readme_model_plus, collab_readme_model) -collab_readme_model <- readRDS("final_models/0623_pop_rm_collab.rda") -anova(collab_readme_model_plus, collab_readme_model) -saveRDS(collab_readme_model, "final_models/0623_pop_rm_collab_better.rda") -summary(collab_readme_model_plus) -summary(collab_readme_model) -library(tidyverse) -#things to get: -# - delete old age column -# - normal age, in date -# - age from today in days -# - delta between first commit and document in days -#README Document updates -#loading in new ages -####RDD CSV -first_commit_df <- read_csv("../062424_did_first_commit_readme.csv") -first_commit_df_2 <- read_csv("../062424_did_first_commit_readme_2.csv") -first_commit_df <- rbind(first_commit_df, first_commit_df_2) -# need to first do an rbind with this data and the second file -# check with the head of the file/size of the file -old_rdd_readme <- read_csv("../final_data/deb_readme_did.csv") -old_rdd_readme <- merge(old_rdd_readme, first_commit_df, by="upstream_vcs_link") -new_rm_data <- old_rdd_readme |> -select(-age_of_project) |> -mutate(first_commit_dt = as.POSIXct(first_commit, -format = "%a %b %d %H:%M:%S %Y %z")) |> -mutate(age_in_days = -as.numeric( -difftime( -as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), -first_commit_dt, -units = "days"))) |> -mutate (event_gap = -as.numeric( -difftime( -as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), -first_commit_dt, -units = "days"))) -View(old_rdd_readme) -new_rm_data <- old_rdd_readme |> -select(-c(age_of_project)) |> -mutate(first_commit_dt = as.POSIXct(first_commit, -format = "%a %b %d %H:%M:%S %Y %z")) |> -mutate(age_in_days = -as.numeric( -difftime( -as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), -first_commit_dt, -units = "days"))) |> -mutate (event_gap = -as.numeric( -difftime( -as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), -first_commit_dt, -units = "days"))) -new_rm_data <- old_rdd_readme |> -mutate(first_commit_dt = as.POSIXct(first_commit, -format = "%a %b %d %H:%M:%S %Y %z")) |> -mutate(age_in_days = -as.numeric( -difftime( -as.POSIXct("2024-06-24 00:00:00", format = "%Y-%m-%d %H:%M:%S"), -first_commit_dt, -units = "days"))) |> -mutate (event_gap = -as.numeric( -difftime( -as.POSIXct(event_date, format = "%Y-%m-%d %H:%M:%S"), -first_commit_dt, -units = "days"))) -View(new_rm_data) -new_rm_data <- new_rm_data |> -select(-age_of_project) -new_rm_data$age_of_project = NULL -head(new_rm_data) -write.csv(new_rm_data, file = "../final_data/deb_readme_did_updated.csv", row.names = FALSE) -old_pop_readme <- merge(old_pop_readme, first_commit_df, by="upstream_vcs_link") -####PopChange CSV -old_pop_readme <- read_csv("../final_data/deb_readme_pop_change.csv") -old_pop_readme <- merge(old_pop_readme, first_commit_df, by="upstream_vcs_link") new_pop_data <- old_pop_readme |> mutate(first_commit_dt = as.POSIXct(first_commit, format = "%a %b %d %H:%M:%S %Y %z")) |> @@ -510,3 +220,293 @@ summary(contrib_contrib_model) # optCtrl=list(maxfun=2e5)), nAGQ=0, data=all_actions_data) all_gmodel <- readRDS("0512_contrib_all.rda") summary(all_gmodel) +#all_gmodel <- glmer.nb(log1p_count ~ D * week_offset + scaled_project_age + scaled_event_gap + (D * week_offset | upstream_vcs_link), +# control=glmerControl(optimizer="bobyqa", +# optCtrl=list(maxfun=2e5)), nAGQ=0, data=all_actions_data) +all_gmodel <- readRDS("0512_contrib_all.rda") +#identifying the quartiles of effect for D +test_condvals <- broom.mixed::tidy(all_gmodel, effects = "ran_vals", conf.int = TRUE) +test_glmer_ranef_D <- test_condvals [which(test_condvals $term == "D"),] +has_zero <- function(estimate, low, high){ +return(ifelse((low < 0),ifelse((high > 0), 1, 0), 2)) +} +test_glmer_ranef_D <- test_glmer_ranef_D |> +mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |> +mutate(rank = rank(estimate)) +g <- test_glmer_ranef_D |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +theme_bw() +library(tidyverse) +g <- test_glmer_ranef_D |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +theme_bw() +g +test_glmer_ranef_D <- test_glmer_ranef_D |> +mutate(ranef_grouping = has_zero(estimate, conf.low, conf.high)) |> +mutate(rank = rank(estimate)) +g <- test_glmer_ranef_D |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +theme_bw() +g +library(tidyverse) +library(plyr) +library(gridExtra) +library(ggpubr) +# script for the analysis of document readability metrics +# readability metrics will be studied controlled by their length +# gaughan@u.northwestern.edu +# loading in the data +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +source("~/Desktop/git/24_deb_gov/R/documentReadabilityAnalysis.R", echo=TRUE) +source("~/Desktop/git/24_deb_gov/R/documentReadabilityAnalysis.R", echo=TRUE) +source("~/Desktop/git/24_deb_gov/R/documentReadabilityAnalysis.R", echo=TRUE) +aggregate(readme_df[, 3:10], list(readme_df$subdir), median) +readme_df <- read_csv("../text_analysis/dwo_readability_readme.csv") +aggregate(readme_df[, 3:10], list(readme_df$subdir), median) +aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median) +aggregate(readme_df[, 3:10], list(readme_df$subdir), median) +readme_df <- read_csv("../text_analysis/dwo_readability_readme.csv") +contributing_df <- read_csv("../text_analysis/dwo_readability_contributing.csv") +#getting basic stats for the readme readability +median(readme_df$flesch_reading_ease) +median(readme_df$linsear_write_formula) +readme_rdd <- readRDS("final_models/0624_readme_all_rdd.rda") +contrib_rdd <- readRDS("final_models/0624_contrib_all_rdd.rda") +textreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, table=FALSE, ci.force = TRUE)) +textreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, table=FALSE, ci.force = TRUE) +reg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, table=FALSE, ci.force = TRUE) +texreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, table=FALSE, ci.force = TRUE) +library(texreg) +texreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, table=FALSE, ci.force = TRUE) +summary(readme) +summary(readme_rdd) +texreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, +custom.model.names=c( 'README','CONTRIBUTING'), +custom.coef.names=c('(Intercept)', 'Indtroduction', 'Week (Time)', 'Project Age', 'Introduction:Week', 'Event Gap'), +table=FALSE, ci.force = TRUE) +readme_groupings <- read.csv('../final_data/deb_readme_interaction_groupings.csv') +contrib_groupings <- read.csv('../final_data/deb_contrib_interaction_groupings.csv') +View(readme_groupings) +library(tidyverse) +readme_g <- readme_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +theme_bw() +readme_g +subdirColors <- +setNames( c('firebrick1', 'forestgreen', 'cornflowerblue') +, c(0,1,2) ) +readme_g <- readme_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors) + +theme_bw() +readme_g +contrib_groupings <- read.csv('../final_data/deb_contrib_interaction_groupings.csv') +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors) + +theme_bw() +contrib_g +grid.arrange(readme_g, contrib_g, nrow = 1) +library(gridExtra) +grid.arrange(readme_g, contrib_g, nrow = 1) +readme_g <- readme_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors) + +guides(fill="none", color="none")+ +theme_bw() +readme_g +grid.arrange(readme_g, contrib_g, nrow = 1) +grid.arrange(contrib_g, readme_g, nrow = 1) +contrib_g <- contrib_groupings |> +ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + +geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + +scale_color_manual(values = subdirColors) + +theme_bw() + +theme(legend.position = "top") +grid.arrange(contrib_g, readme_g, nrow = 1) +library(jtools) +plot_summs(readme_rdd, contrib_rdd) +?plot_summs +plot_summs(readme_rdd, contrib_rdd, plot.distributions = TRUE) +col_order <- c("upstream_vcs_link", "age_in_days", "first_commit", "first_commit_dt", "event_gap", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +contrib_df <- contrib_df[,col_order] +contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ") +contrib_df <- read_csv('../final_data/deb_contrib_did.csv') +col_order <- c("upstream_vcs_link", "age_in_days", "first_commit", "first_commit_dt", "event_gap", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +contrib_df <- contrib_df[,col_order] +contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ") +contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ") +contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ") +contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +contrib_df = contrib_df[,!(names(contrib_df) %in% drop)] +# 2 some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer[which(longer$observation_type == "all"),] +return(longer) +} +expanded_data <- expand_timeseries(contrib_df[1,]) +library(plyr) +contrib_df <- read_csv('../final_data/deb_contrib_did.csv') +col_order <- c("upstream_vcs_link", "age_in_days", "first_commit", "first_commit_dt", "event_gap", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +contrib_df <- contrib_df[,col_order] +contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ") +contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ") +contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ") +contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +contrib_df = contrib_df[,!(names(contrib_df) %in% drop)] +# 2 some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer[which(longer$observation_type == "all"),] +return(longer) +} +expanded_data <- expand_timeseries(contrib_df[1,]) +for (i in 2:nrow(contrib_df)){ +expanded_data <- rbind(expanded_data, expand_timeseries(contrib_df[i,])) +} +View(expand_timeseries) +View(expanded_data) +window_num <- 8 +windowed_data <- expanded_data |> +filter(week >= (27 - window_num) & week <= (27 + window_num)) |> +mutate(D = ifelse(week > 27, 1, 0)) +windowed_data$week_offset <- windowed_data$week - 27 +View(windowed_data) +time_plot <- expanded_data |> +ggplot(aes(x=week_offset, y=count)) +time_plot +time_plot <- windowed_data |> +ggplot(aes(x=week_offset, y=count)) +time_plot +time_plot <- windowed_data |> +ggplot(aes(x=week_offset, y=count)) + +geom_point() +time_plot +time_plot <- windowed_data |> +ggplot(aes(x=week_offset, y=median(count))) + +geom_point() +time_plot +time_plot <- windowed_data |> +ggplot(aes(x=week_offset, y=mean(count))) + +geom_point() +time_plot +time_plot <- windowed_data |> +ggplot(aes(x=week_offset, y=count)) + +geom_point() +time_plot +all_actions_data <- windowed_data[which(windowed_data$observation_type == "all"),] +all_actions_data$log1p_count <- log1p(all_actions_data$count) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count)) + +geom_point() +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count)) + +geom_smooth()+ +geom_point() +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count)) + +geom_smooth() +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count)) + +geom_smooth() + +theme_bw() +time_plot +windowed_readme_data$week_offset <- windowed_readme_data$week - 27 +all_actions_readme_data <- windowed_readme_data[which(windowed_readme_data$observation_type == "all"),] +source("~/Desktop/git/24_deb_gov/R/gam_plot_documents.R") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +geom_smooth() + +theme_bw() +time_plot +View(expanded_readme_data) +mean(all_actions_readme_data$event_gap) +mean(median$event_gap) +median(all_actions_readme_data$event_gap) +mean(all_actions_readme_data$event_gap) +mean(all_actions_contrib_data$event_gap) +median(all_actions_contrib_data$event_gap) +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +geom_smooth() + +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +geom_smooth() + +geom_vline(x=0) +theme_bw() + +theme(legend.position = "top") +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +geom_smooth() + +geom_vline(x=0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +geom_smooth() + +geom_vline(x=0)+ +theme_bw() + +theme(legend.position = "top") +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +geom_smooth() + +geom_vline(0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +time_plot <- all_actions_data |> +ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + +geom_smooth() + +geom_vline(xintercept = 0)+ +theme_bw() + +theme(legend.position = "top") +time_plot +#looking at event gap +document_event_gap <- ggplot(all_actions_data, aes(x=event_gap, group=as.factor(document_type))) + +geom_density(aes(color = as.factor(document_type), fill=as.factor(document_type)), alpha=0.2, position="identity") + +theme_bw() +document_event_gap +#looking at event gap +document_event_gap <- ggplot(all_actions_data, aes(x=scale(event_gap), group=as.factor(document_type))) + +geom_density(aes(color = as.factor(document_type), fill=as.factor(document_type)), alpha=0.2, position="identity") + +theme_bw() +document_event_gap +#looking at event gap +mean(all_actions_readme_data$event_gap) +sd(all_actions_readme_data$event_gap) +mean(all_actions_contrib_data$event_gap) +sd(all_actions_contrib_data$event_gap) +mode(all_actions_contrib_data$event_gap) +mean(all_actions_contrib_data$event_gap) diff --git a/R/d_working_readability_plot.png b/R/d_working_readability_plot.png new file mode 100644 index 0000000..b5f973d Binary files /dev/null and b/R/d_working_readability_plot.png differ diff --git a/R/documentReadabilityAnalysis.R b/R/documentReadabilityAnalysis.R index bee6c23..47622f9 100644 --- a/R/documentReadabilityAnalysis.R +++ b/R/documentReadabilityAnalysis.R @@ -7,10 +7,11 @@ library(ggpubr) # gaughan@u.northwestern.edu # loading in the data try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../text_analysis/draft_readability_readme.csv") -contributing_df <- read_csv("../text_analysis/draft_readability_contributing.csv") +readme_df <- read_csv("../text_analysis/dwo_readability_readme.csv") +contributing_df <- read_csv("../text_analysis/dwo_readability_contributing.csv") head(readme_df) aggregate(readme_df[, 3:10], list(readme_df$subdir), median) +aggregate(contributing_df[, 3:10], list(contributing_df$subdir), median) #getting basic stats for the readme readability median(readme_df$flesch_reading_ease) median(readme_df$linsear_write_formula) diff --git a/R/draft_Rdd_groupings_plot.png b/R/draft_Rdd_groupings_plot.png new file mode 100644 index 0000000..52ab7ad Binary files /dev/null and b/R/draft_Rdd_groupings_plot.png differ diff --git a/R/draft_average_over_time_plot.png b/R/draft_average_over_time_plot.png new file mode 100644 index 0000000..f4a051a Binary files /dev/null and b/R/draft_average_over_time_plot.png differ diff --git a/R/draft_averages_plot.png b/R/draft_averages_plot.png new file mode 100644 index 0000000..66f94ba Binary files /dev/null and b/R/draft_averages_plot.png differ diff --git a/R/draft_rdd_ranef_plot.png b/R/draft_rdd_ranef_plot.png new file mode 100644 index 0000000..52ab7ad Binary files /dev/null and b/R/draft_rdd_ranef_plot.png differ diff --git a/R/dwo_working_readability.png b/R/dwo_working_readability.png new file mode 100644 index 0000000..41aa09c Binary files /dev/null and b/R/dwo_working_readability.png differ diff --git a/R/gam_plot_documents.R b/R/gam_plot_documents.R new file mode 100644 index 0000000..380b2d5 --- /dev/null +++ b/R/gam_plot_documents.R @@ -0,0 +1,70 @@ +#trying to make a time plot showing the over-time shift +library(plyr) +contrib_df <- read_csv('../final_data/deb_contrib_did.csv') +readme_df <- read_csv("../final_data/deb_readme_did.csv") +col_order <- c("upstream_vcs_link", "age_in_days", "first_commit", "first_commit_dt", "event_gap", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +#first contrib +contrib_df <- contrib_df[,col_order] +contrib_df$ct_before_all <- str_split(gsub("[][]","", contrib_df$before_all_ct), ", ") +contrib_df$ct_after_all <- str_split(gsub("[][]","", contrib_df$after_all_ct), ", ") +contrib_df$ct_before_mrg <- str_split(gsub("[][]","", contrib_df$before_mrg_ct), ", ") +contrib_df$ct_after_mrg <- str_split(gsub("[][]","", contrib_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +contrib_df = contrib_df[,!(names(contrib_df) %in% drop)] +#then readme +readme_df <- readme_df[,col_order] +readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") +readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") +readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") +readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +# 2 some expansion needs to happens for each project +expand_timeseries <- function(project_row) { + longer <- project_row |> + pivot_longer(cols = starts_with("ct"), + names_to = "window", + values_to = "count") |> + unnest(count) + longer$observation_type <- gsub("^.*_", "", longer$window) + longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) + longer$count <- as.numeric(longer$count) + #longer <- longer[which(longer$observation_type == "all"),] + return(longer) +} +expanded_contrib_data <- expand_timeseries(contrib_df[1,]) +for (i in 2:nrow(contrib_df)){ + expanded_contrib_data <- rbind(expanded_contrib_data, expand_timeseries(contrib_df[i,])) +} +expanded_readme_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ + expanded_readme_data <- rbind(expanded_readme_data, expand_timeseries(readme_df[i,])) +} +window_num <- 8 +windowed_contrib_data <- expanded_contrib_data |> + filter(week >= (27 - window_num) & week <= (27 + window_num)) |> + mutate(D = ifelse(week > 27, 1, 0)) +windowed_readme_data <- expanded_readme_data |> + filter(week >= (27 - window_num) & week <= (27 + window_num)) |> + mutate(D = ifelse(week > 27, 1, 0)) + +windowed_contrib_data$week_offset <- windowed_contrib_data$week - 27 +all_actions_contrib_data <- windowed_contrib_data[which(windowed_contrib_data$observation_type == "all"),] +all_actions_contrib_data$document_type <- rep("contributing", length(all_actions_contrib_data$count)) +windowed_readme_data$week_offset <- windowed_readme_data$week - 27 +all_actions_readme_data <- windowed_readme_data[which(windowed_readme_data$observation_type == "all"),] +all_actions_readme_data$document_type <- rep("readme", length(all_actions_readme_data$count)) +all_actions_data <- rbind(all_actions_contrib_data, all_actions_readme_data) +all_actions_data$log1p_count <- log1p(all_actions_data$count) +time_plot <- all_actions_data |> + ggplot(aes(x=week_offset, y=log1p_count, color=factor(document_type))) + + geom_smooth() + + geom_vline(xintercept = 0)+ + theme_bw() + + theme(legend.position = "top") +time_plot +#looking at event gap +mean(all_actions_readme_data$event_gap) +sd(all_actions_readme_data$event_gap) +mean(all_actions_contrib_data$event_gap) +sd(all_actions_contrib_data$event_gap) diff --git a/R/model_presentation.R b/R/model_presentation.R new file mode 100644 index 0000000..bdb7887 --- /dev/null +++ b/R/model_presentation.R @@ -0,0 +1,42 @@ +library(tidyverse) +library(texreg) + +readme_rdd <- readRDS("final_models/0624_readme_all_rdd.rda") +contrib_rdd <- readRDS("final_models/0624_contrib_all_rdd.rda") +summary(readme_rdd) + +texreg(list(readme_rdd, contrib_rdd), stars=NULL, digits=3, use.packages=FALSE, + custom.model.names=c( 'README','CONTRIBUTING'), + custom.coef.names=c('(Intercept)', 'Indtroduction', 'Week (Time)', 'Project Age', 'Introduction:Week', 'Event Gap'), + table=FALSE, ci.force = TRUE) + +readme_groupings <- read.csv('../final_data/deb_readme_interaction_groupings.csv') +contrib_groupings <- read.csv('../final_data/deb_contrib_interaction_groupings.csv') + +subdirColors <- + setNames( c('firebrick1', 'forestgreen', 'cornflowerblue') + , c(0,1,2) ) + +readme_g <- readme_groupings |> + ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + + geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + + scale_color_manual(values = subdirColors) + + guides(fill="none", color="none")+ + theme_bw() +readme_g + + +contrib_g <- contrib_groupings |> + ggplot(aes(x=rank, y=estimate, col = as.factor(ranef_grouping))) + + geom_linerange(aes(ymin= conf.low, ymax= conf.high)) + + scale_color_manual(values = subdirColors) + + theme_bw() + + theme(legend.position = "top") +contrib_g + +library(gridExtra) +grid.arrange(contrib_g, readme_g, nrow = 1) +library(jtools) +plot_summs(readme_rdd, contrib_rdd, plot.distributions = TRUE) +?plot_summs +