# ---- Window of 10 weeks either side of week 26 ------------------------------
# `longer` is expected to exist already (one project expanded to one row per
# week and observation type); week 26 is the cutoff used throughout.
window_num <- 10
longer <- longer |>
  filter(abs(week - 26) <= window_num)

# testing out analysis below
# Raw weekly counts of the "all" series, cutoff marked.
longer |>
  subset(observation_type == "all") |>
  ggplot(aes(x = week, y = count)) +
  geom_point() +
  geom_vline(xintercept = 26)

# Linear fit with its own intercept and slope on each side of the cutoff;
# D is 1 from week 26 onwards and 0 before it.
longer |>
  subset(observation_type == "all") |>
  mutate(D = as.numeric(week >= 26)) |>
  lm(formula = count ~ D * I(week - 26)) |>
  summary()

# Same points coloured by D, with the default smoother drawn per side.
longer |>
  subset(observation_type == "all") |>
  select(count, week) |>
  mutate(D = factor(as.numeric(week >= 26))) |>
  ggplot(aes(x = week, y = count, color = D)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  geom_vline(xintercept = 26)

# Abandoned in-place unpacking of the list-columns, kept for reference:
# test_two <- c()
# iterator <- 0
# for (entry in test) {
#   readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
#   print(as.numeric(unlist(entry)))
#   iterator <- iterator + 1
# }
# test_two
# Yes, need to expand the dataframe, but again, for the sake of clarity, do not
# want to until analysis step
# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design

# ---- Rebuild `longer` from project row 697 ----------------------------------
new_test <- readme_df[697, ]
longer <- unnest(
  pivot_longer(
    new_test,
    cols = starts_with("ct"),
    names_to = "window",
    values_to = "count"
  ),
  count
)
# Whatever follows the last underscore of the column name is the series type.
longer$observation_type <- gsub("^.*_", "", longer$window)
# Number the rows of each series from 0 in their existing order.
longer <- ddply(
  longer, "observation_type", transform,
  week = seq(from = 0, by = 1, length.out = length(observation_type))
)
longer$count <- as.numeric(longer$count)

# ---- Window of 27 weeks -----------------------------------------------------
window_num <- 27
longer <- longer |>
  filter(abs(week - 26) <= window_num)

# testing out analysis below
longer |>
  subset(observation_type == "all") |>
  ggplot(aes(x = week, y = count)) +
  geom_point() +
  geom_vline(xintercept = 26)

longer |>
  subset(observation_type == "all") |>
  mutate(D = as.numeric(week >= 26)) |>
  lm(formula = count ~ D * I(week - 26)) |>
  summary()

longer |>
  subset(observation_type == "all") |>
  select(count, week) |>
  mutate(D = factor(as.numeric(week >= 26))) |>
  ggplot(aes(x = week, y = count, color = D)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  geom_vline(xintercept = 26)

# ---- Window of 13 weeks -----------------------------------------------------
window_num <- 13
longer <- longer |>
  filter(abs(week - 26) <= window_num)

# testing out analysis below
longer |>
  subset(observation_type == "all") |>
  ggplot(aes(x = week, y = count)) +
  geom_point() +
  geom_vline(xintercept = 26)

longer |>
  subset(observation_type == "all") |>
  mutate(D = as.numeric(week >= 26)) |>
  lm(formula = count ~ D * I(week - 26)) |>
  summary()

longer |>
  subset(observation_type == "all") |>
  select(count, week) |>
  mutate(D = factor(as.numeric(week >= 26))) |>
  ggplot(aes(x = week, y = count, color = D)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  geom_vline(xintercept = 26)

# Same smoothed plot, now with the smoother's interval band switched on.
longer |>
  subset(observation_type == "all") |>
  select(count, week) |>
  mutate(D = factor(as.numeric(week >= 26))) |>
  ggplot(aes(x = week, y = count, color = D)) +
  geom_point() +
  geom_smooth(se = TRUE) +
  geom_vline(xintercept = 26)

# testing out analysis below
# Reference line drawn at 25.5; the indicator D still switches at week 26.
longer |>
  subset(observation_type == "all") |>
  ggplot(aes(x = week, y = count)) +
  geom_point() +
  geom_vline(xintercept = 25.5)

longer |>
  subset(observation_type == "all") |>
  mutate(D = as.numeric(week >= 26)) |>
  lm(formula = count ~ D * I(week - 26)) |>
  summary()

longer |>
  subset(observation_type == "all") |>
  select(count, week) |>
  mutate(D = factor(as.numeric(week >= 26))) |>
  ggplot(aes(x = week, y = count, color = D)) +
  geom_point() +
  geom_smooth(se = TRUE) +
  geom_vline(xintercept = 25.5)

# testing out analysis below
# Reference line back at week 26.
longer |>
  subset(observation_type == "all") |>
  ggplot(aes(x = week, y = count)) +
  geom_point() +
  geom_vline(xintercept = 26)

longer |>
  subset(observation_type == "all") |>
  mutate(D = as.numeric(week >= 26)) |>
  lm(formula = count ~ D * I(week - 26)) |>
  summary()

longer |>
  subset(observation_type == "all") |>
  select(count, week) |>
  mutate(D = factor(as.numeric(week >= 26))) |>
  ggplot(aes(x = week, y = count, color = D)) +
  geom_point() +
  geom_smooth(se = TRUE) +
  geom_vline(xintercept = 26)
# ---- Imbens-Kalyanaraman bandwidth exploration ------------------------------
# The rdd package is attached once, under its real name. `library(rdd-package)`
# cannot work: R reads the argument as the expression `rdd - package`.
library(rdd)

# Abandoned in-place unpacking of the list-columns, kept for reference:
# test_two <- c()
# iterator <- 0
# for (entry in test) {
#   readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
#   print(as.numeric(unlist(entry)))
#   iterator <- iterator + 1
# }
# test_two
# Yes, need to expand the dataframe, but again, for the sake of clarity, do not
# want to until analysis step
# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design

#' Expand one project row into a long, per-week table.
#'
#' Every column whose name starts with "ct" is pivoted into a `window` /
#' `count` pair and its list entries are unnested into one row each. The text
#' after the last underscore of `window` becomes `observation_type`; within
#' each observation type the rows are numbered from 0, in their existing
#' order, to give `week`.
#'
#' @param project_row A one-row slice of `readme_df` that still carries the
#'   `ct_*` list-columns.
#' @return A data frame with the remaining columns of `project_row` plus
#'   `window`, `count` (numeric), `observation_type` and `week`.
expand_timeseries <- function(project_row) {
  longer <- project_row |>
    pivot_longer(
      cols = starts_with("ct"),
      names_to = "window",
      values_to = "count"
    ) |>
    unnest(count)
  longer$observation_type <- gsub("^.*_", "", longer$window)
  longer <- ddply(
    longer, "observation_type", transform,
    week = seq(from = 0, by = 1, length.out = length(observation_type))
  )
  longer$count <- as.numeric(longer$count)
  longer
}

# Spot checks on hand-picked projects. These calls pool every observation type
# of the project, exactly as they were typed in the session.
new_test <- readme_df[697, ]
longer <- expand_timeseries(new_test)
# longer <- longer %>%
#   filter(week >= (26 - window_num) & week <= (26 + window_num))
IKbandwidth(
  longer$week, longer$count,
  cutpoint = 26, verbose = FALSE, kernel = "triangular"
)

# testing out analysis below
# Raw weekly counts of the "all" series, cutoff (week 26) marked.
longer |>
  subset(observation_type == "all") |>
  ggplot(aes(x = week, y = count)) +
  geom_point() +
  geom_vline(xintercept = 26)

# Linear fit with its own intercept and slope on each side of the cutoff.
longer |>
  subset(observation_type == "all") |>
  mutate(D = as.numeric(week >= 26)) |>
  lm(formula = count ~ D * I(week - 26)) |>
  summary()

longer |>
  subset(observation_type == "all") |>
  select(count, week) |>
  mutate(D = factor(as.numeric(week >= 26))) |>
  ggplot(aes(x = week, y = count, color = D)) +
  geom_point() +
  geom_smooth(se = TRUE) +
  geom_vline(xintercept = 26)

# The session asked for `readme_df[0, ]` here. Row 0 selects nothing in R
# (rows are 1-based), so the first project is probed instead.
# NOTE(review): assumes "first row" was the intent -- confirm.
new_test <- readme_df[1, ]
longer <- expand_timeseries(new_test)
IKbandwidth(
  longer$week, longer$count,
  cutpoint = 26, verbose = FALSE, kernel = "triangular"
)

new_test <- readme_df[3, ]
longer <- expand_timeseries(new_test)
IKbandwidth(
  longer$week, longer$count,
  cutpoint = 26, verbose = FALSE, kernel = "triangular"
)

new_test <- readme_df[9, ]
longer <- expand_timeseries(new_test)
IKbandwidth(
  longer$week, longer$count,
  cutpoint = 26, verbose = FALSE, kernel = "triangular"
)

#' Imbens-Kalyanaraman optimal bandwidth for one project's "all" series.
#'
#' Replaces four successive definitions from the session: pooled over all
#' observation types, restricted to "all", then with an error fallback of 8,
#' and finally 9. The fallback is now a parameter whose default is the last
#' value used.
#'
#' @param project_row A one-row slice of `readme_df`.
#' @param fallback Value returned when `IKbandwidth()` signals an error.
#'   NOTE(review): the fallback enters the mean/median/table below like a real
#'   estimate -- confirm that is intended.
#' @return The bandwidth reported by `IKbandwidth()` (cutpoint 26, triangular
#'   kernel), or `fallback`.
get_optimal_window <- function(project_row, fallback = 9) {
  longer <- expand_timeseries(project_row)
  # this below line makes the code specific to the all-commits data
  longer <- longer[which(longer$observation_type == "all"), ]
  tryCatch(
    # Imbens-Kalyanaraman Optimal Bandwidth Calculation
    IKbandwidth(
      longer$week, longer$count,
      cutpoint = 26, verbose = FALSE, kernel = "triangular"
    ),
    error = function(e) fallback
  )
}

# First pass over every project, with the fallback of 8 used at that point.
bandwidths <- vapply(
  seq_len(nrow(readme_df)),
  function(i) get_optimal_window(readme_df[i, ], fallback = 8),
  numeric(1)
)
mean(bandwidths)
median(bandwidths)
# `mode(bandwidths)` only reports the storage mode ("numeric"); the frequency
# table is what shows the most common bandwidth.
table(bandwidths)

# ---- this is the file with the lmer multi-level rddAnalysis -----------------
# plyr is attached before tidyverse: attaching it after dplyr masks dplyr
# verbs such as mutate() and summarise() with the plyr versions.
library(plyr)
library(tidyverse)

# 0 loading the readme data in
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../final_data/deb_readme_did.csv")

# 1 preprocessing
# Names are assigned by position, then the columns are put in before/after
# pairs.
colnames(readme_df) <- c(
  "upstream_vcs_link", "event_date", "event_hash",
  "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct",
  "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new"
)
col_order <- c(
  "upstream_vcs_link", "event_date", "event_hash",
  "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct",
  "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new"
)
readme_df <- readme_df[, col_order]
# Strip the square brackets and split on ", " so each cell becomes a character
# vector (a list-column) named ct_*.
readme_df$ct_before_all <- str_split(gsub("[][]", "", readme_df$before_all_ct), ", ")
readme_df$ct_after_all <- str_split(gsub("[][]", "", readme_df$after_all_ct), ", ")
readme_df$ct_before_mrg <- str_split(gsub("[][]", "", readme_df$before_mrg_ct), ", ")
readme_df$ct_after_mrg <- str_split(gsub("[][]", "", readme_df$after_mrg_ct), ", ")
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
readme_df <- readme_df[, !(names(readme_df) %in% drop)]

# 2 some expansion needs to happen for each project
# Expand every row, then bind once instead of growing with rbind() in a loop.
expanded_data <- dplyr::bind_rows(
  lapply(
    seq_len(nrow(readme_df)),
    function(i) expand_timeseries(readme_df[i, ])
  )
)
if (interactive()) View(expanded_data)

# this just gets the optimal bandwidth window for each project
bandwidths <- vapply(
  seq_len(nrow(readme_df)),
  function(i) get_optimal_window(readme_df[i, ]),
  numeric(1)
)
mean(bandwidths) # 8.574233
median(bandwidths) # 8.363088
table(bandwidths)

# filter out the timewindows
# Keep window_num weeks either side of week 26 and flag the post-event weeks.
# (The session first ran this pipeline twice without storing the result.)
window_num <- 8
expanded_data <- expanded_data |>
  filter(week >= (26 - window_num) & week <= (26 + window_num)) |>
  mutate(D = as.numeric(week >= 26))

# 3 rdd in lmer analysis
# Dead ends from the session, not repeated here:
#   * `+ upstream_vcs_link` as a plain term: lmer() requires a random-effects
#     term in the formula;
#   * `data = expanded_data[which(longer$observation_type == "all"), ]`: the
#     row index came from the single-project `longer`, not from
#     `expanded_data`;
#   * `(1 | as.factor(upstream_vcs_link))`: superseded by the plain grouping
#     term below.
library(lme4)
draft_model <- lmer(
  count ~ D * I(week - 26) + (1 | upstream_vcs_link),
  REML = FALSE,
  data = subset(expanded_data, observation_type == "all")
)
summary(draft_model)

# 2 some expansion needs to happen for each project
#' Expand one project row into a long, per-week table.
#'
#' Every column whose name starts with "ct" is pivoted into a `window` /
#' `count` pair and its list entries are unnested into one row each. The text
#' after the last underscore of `window` becomes `observation_type`; within
#' each observation type the rows are numbered from 0, in their existing
#' order, to give `week`.
#'
#' @param project_row A one-row slice of `readme_df` that still carries the
#'   `ct_*` list-columns.
#' @return A data frame with the remaining columns of `project_row` plus
#'   `window`, `count` (numeric), `observation_type` and `week`.
expand_timeseries <- function(project_row) {
  longer <- project_row |>
    pivot_longer(
      cols = starts_with("ct"),
      names_to = "window",
      values_to = "count"
    ) |>
    unnest(count)
  longer$observation_type <- gsub("^.*_", "", longer$window)
  longer <- ddply(
    longer, "observation_type", transform,
    week = seq(from = 0, by = 1, length.out = length(observation_type))
  )
  longer$count <- as.numeric(longer$count)
  # longer <- longer[which(longer$observation_type == "all"),]
  longer
}

# Expand every project and bind once. The earlier `for (i in 2:nrow(...))`
# loop copied the growing frame on each pass and, for a single-row input,
# counted down (2, 1) instead of doing nothing.
expanded_data <- dplyr::bind_rows(
  lapply(
    seq_len(nrow(readme_df)),
    function(i) expand_timeseries(readme_df[i, ])
  )
)

# filter out the timewindows
# Keep window_num weeks either side of week 26; D is 1 from week 26 onwards.
window_num <- 8
expanded_data <- expanded_data |>
  filter(week >= (26 - window_num) & week <= (26 + window_num)) |>
  mutate(D = as.numeric(week >= 26))

# 3 rdd in lmer analysis
# rdd: https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
# lmer: https://www.youtube.com/watch?v=LzAwEKrn2Mc
library(lme4)

# Random intercept per project; separate level and slope either side of the
# cutoff. Fitted on the all-commits series.
draft_model <- lmer(
  count ~ D * I(week - 26) + (1 | upstream_vcs_link),
  REML = FALSE,
  data = subset(expanded_data, observation_type == "all")
)
summary(draft_model)

# Second pass: D now switches strictly after week 26, so week 26 itself
# counts as pre-event. The window filter is already applied and is not
# repeated.
expanded_data <- expanded_data |>
  mutate(D = as.numeric(week > 26))

draft_model <- lmer(
  count ~ D * I(week - 26) + (1 | upstream_vcs_link),
  REML = FALSE,
  data = subset(expanded_data, observation_type == "all")
)
summary(draft_model)
if (interactive()) View(expanded_data)

# One model per observation type.
draft_all_model <- lmer(
  count ~ D * I(week - 26) + (1 | upstream_vcs_link),
  REML = FALSE,
  data = subset(expanded_data, observation_type == "all")
)
summary(draft_all_model)

draft_mrg_model <- lmer(
  count ~ D * I(week - 26) + (1 | upstream_vcs_link),
  REML = FALSE,
  data = subset(expanded_data, observation_type == "mrg")
)
summary(draft_mrg_model)

# All-commits model refitted with REML; this is the version left in
# `draft_all_model`.
draft_all_model <- lmer(
  count ~ D * I(week - 26) + (1 | upstream_vcs_link),
  REML = TRUE,
  data = subset(expanded_data, observation_type == "all")
)
summary(draft_all_model)