diff --git a/R/.Rhistory b/R/.Rhistory index f484971..a8a84cc 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,488 +1,51 @@ -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count)) -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -as.numeric(unlist(count)) -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -unlist(count) -View(new_longer) -new_longer -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -unlist(count) |> -as.numeric(count) -View(new_longer) -new_longer -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -unlist(count) |> -as.numeric(count) -longer -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -unlist(count) -longer -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer -View(longer) -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -as.numeric(count) -longer -longer <- new_test |> -pivot_longer(cols = starts_with("cnt"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer -library(tidyverse) -#set wd, read in data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -#preprocessing for readme_df -colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") -readme_df$cnt_after_all <- str_split(gsub("[][]","", readme_df$after_all_cnt), ", ") -readme_df$cnt_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_cnt), ", ") -readme_df$cnt_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_cnt), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -library(tidyverse) -#set wd, read in data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -#preprocessing for readme_df -colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$cnt_before_all <- str_split(gsub("[][]","", readme_df$before_all_cnt), ", ") -library(tidyverse) -#set wd, read in data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -contributing_df <- read_csv("../final_data/deb_contrib_did.csv") -#preprocessing for readme_df -colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") -readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") -readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") -readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -readme_df = readme_df[,!(names(readme_df) %in% drop)] -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- head(readme_df, 1) -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer -View(longer) -library(tidyverse) -#set wd, read in data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -contributing_df <- read_csv("../final_data/deb_contrib_did.csv") -#preprocessing for readme_df -colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") -readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") -readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") -readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -readme_df = readme_df[,!(names(readme_df) %in% drop)] -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- head(readme_df, 1) -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer -View(longer) -longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window))) -library(plyr) -longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window))) -View(longer) -library(plyr) -library(tidyverse) -#set wd, read in data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -contributing_df <- read_csv("../final_data/deb_contrib_did.csv") -#preprocessing for readme_df -colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") -readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") -readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") -readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -readme_df = readme_df[,!(names(readme_df) %in% drop)] -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- head(readme_df, 1) -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer <- ddply(longer, "window", transform, t=seq(from=0, by=1, length.out=length(window))) -View(longer) -longer <- ddply(longer, strsplit("window", split="_")[-1], transform, week=seq(from=0, by=1, length.out=length(window))) -longer <- ddply(longer, strsplit(window, split="_")[-1], transform, week=seq(from=0, by=1, length.out=length(window))) -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -add_column(rel = gsub("^.*_", "", window)) -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) |> -add_column(rel = gsub("^.*_", "", "window")) -View(longer) -longer$rel <- gsub("^.*_", "", longer$window) -View(longer) -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- head(readme_df, 1) -new_testr$observation_type <- gsub("^.*_", "", new_test$window) -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- head(readme_df, 1) -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -View(longer) -head(longer) -sapply(longer, class) -library(plyr) -library(tidyverse) -#set wd, read in data -try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) -readme_df <- read_csv("../final_data/deb_readme_did.csv") -contributing_df <- read_csv("../final_data/deb_contrib_did.csv") -#preprocessing for readme_df -colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") -col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") -readme_df <- readme_df[,col_order] -readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") -readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") -readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") -readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") -drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") -readme_df = readme_df[,!(names(readme_df) %in% drop)] -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- head(readme_df, 1) -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -View(longer) -#testing out analysis below -longer[which(longer$observation_type == all)] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -#testing out analysis below -longer[which(longer$observation_type == "all")] |> -ggplot(aes(x = week, y = count)) + -geom_point() + geom_vline(xintercept = 26) +window <- 26 +longer <- longer %>% +filter(week >= (26 - window) & week <= (26 + window)) +window_num <- 26 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) #testing out analysis below longer[which(longer$observation_type == "all"),] |> ggplot(aes(x = week, y = count)) + geom_point() + geom_vline(xintercept = 26) -View(readme_df) -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- readme_df[5,] -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -#testing out analysis below -longer[which(longer$observation_type == "all"),] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -View(readme_df) -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- readme_df[76,] -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -#testing out analysis below -longer[which(longer$observation_type == "all"),] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- readme_df[77,] -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -#testing out analysis below -longer[which(longer$observation_type == "all"),] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- readme_df[143,] -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -#testing out analysis below -longer[which(longer$observation_type == "all"),] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- readme_df[185,] -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -#testing out analysis below -longer[which(longer$observation_type == "all"),] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -# as.numeric(unlist(test[1])) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -new_test <- readme_df[231,] -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -#testing out analysis below -longer[which(longer$observation_type == "all"),] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -longer[which(longer$observation_type == "all"),] |> -mutate(D = ifelse(week >= 26, 1, 0)) |> -lm(count ~ D + I(week - 26)) |> -summary() -longer[which(longer$observation_type == "all"),] |> -mutate(D = ifelse(week >= 26, 1, 0)) |> -lm(count ~ D * I(week - 26)) |> -summary() longer[which(longer$observation_type == "all"),] |> mutate(D = ifelse(week >= 26, 1, 0)) |> lm(formula = count ~ D * I(week - 26)) |> summary() longer[which(longer$observation_type == "all"),] |> select(count, week) |> -mutate(D = ifelse(week >= 26, 1, 0)) |> -ggplot(aes(x = week, y = count, color = D)) + -geom_point() + -geom_smooth(method = "lm") -longer[which(longer$observation_type == "all"),] |> -select(count, week) |> mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> ggplot(aes(x = week, y = count, color = D)) + geom_point() + -geom_smooth(method = "lm") -longer[which(longer$observation_type == "all"),] |> -select(count, week) |> -mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> -ggplot(aes(x = week, y = count, color = D)) + -geom_point() + -geom_smooth() -longer[which(longer$observation_type == "all"),] |> -select(count, week) |> -mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> -ggplot(aes(x = week, y = count, color = D)) + -geom_point() + -geom_smooth() -longer[which(longer$observation_type == "all"),] |> -select(count, week) |> -mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> -ggplot(aes(x = week, y = count, color = D)) + -geom_point() + -geom_smooth(aes(x = week, y = count, color = D)) -longer[which(longer$observation_type == "all"),] |> -select(count, week) |> -mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> -ggplot(aes(x = week, y = count, color = D)) + -geom_point() + -geom_smooth() -sapply(longer, class) +geom_smooth(se = FALSE) + +geom_vline(xintercept = 26) +window_num <- 27 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design +new_test <- readme_df[450,] +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) longer$count <- as.numeric(longer$count) -sapply(longer, class) +window_num <- 27 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) #testing out analysis below longer[which(longer$observation_type == "all"),] |> ggplot(aes(x = week, y = count)) + @@ -497,16 +60,453 @@ select(count, week) |> mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> ggplot(aes(x = week, y = count, color = D)) + geom_point() + -geom_smooth() +geom_smooth(se = FALSE) + +geom_vline(xintercept = 26) +window_num <- 20 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) +#testing out analysis below +longer[which(longer$observation_type == "all"),] |> +ggplot(aes(x = week, y = count)) + +geom_point() + +geom_vline(xintercept = 26) +longer[which(longer$observation_type == "all"),] |> +mutate(D = ifelse(week >= 26, 1, 0)) |> +lm(formula = count ~ D * I(week - 26)) |> +summary() longer[which(longer$observation_type == "all"),] |> select(count, week) |> mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> ggplot(aes(x = week, y = count, color = D)) + geom_point() + -geom_smooth(se = False) +geom_smooth(se = FALSE) + +geom_vline(xintercept = 26) +window_num <- 4 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) +#testing out analysis below +longer[which(longer$observation_type == "all"),] |> +ggplot(aes(x = week, y = count)) + +geom_point() + +geom_vline(xintercept = 26) +longer[which(longer$observation_type == "all"),] |> +mutate(D = ifelse(week >= 26, 1, 0)) |> +lm(formula = count ~ D * I(week - 26)) |> +summary() longer[which(longer$observation_type == "all"),] |> select(count, week) |> mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> ggplot(aes(x = week, y = count, color = D)) + geom_point() + -geom_smooth(se = FALSE) +geom_smooth(se = FALSE) + +geom_vline(xintercept = 26) +window_num <- 10 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design +new_test <- readme_df[450,] +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +window_num <- 10 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) +#testing out analysis below +longer[which(longer$observation_type == "all"),] |> +ggplot(aes(x = week, y = count)) + +geom_point() + +geom_vline(xintercept = 26) +longer[which(longer$observation_type == "all"),] |> +mutate(D = ifelse(week >= 26, 1, 0)) |> +lm(formula = count ~ D * I(week - 26)) |> +summary() +longer[which(longer$observation_type == "all"),] |> +select(count, week) |> +mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> +ggplot(aes(x = week, y = count, color = D)) + +geom_point() + +geom_smooth(se = FALSE) + +geom_vline(xintercept = 26) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design +new_test <- readme_df[697,] +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +window_num <- 27 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) +#testing out analysis below +longer[which(longer$observation_type == "all"),] |> +ggplot(aes(x = week, y = count)) + +geom_point() + +geom_vline(xintercept = 26) +longer[which(longer$observation_type == "all"),] |> +mutate(D = ifelse(week >= 26, 1, 0)) |> +lm(formula = count ~ D * I(week - 26)) |> +summary() +longer[which(longer$observation_type == "all"),] |> +select(count, week) |> +mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> +ggplot(aes(x = week, y = count, color = D)) + +geom_point() + +geom_smooth(se = FALSE) + +geom_vline(xintercept = 26) +window_num <- 13 +longer <- longer %>% +filter(week >= (26 - window_num) & week <= (26 + window_num)) +#testing out analysis below +longer[which(longer$observation_type == "all"),] |> +ggplot(aes(x = week, y = count)) + +geom_point() + +geom_vline(xintercept = 26) +longer[which(longer$observation_type == "all"),] |> +mutate(D = ifelse(week >= 26, 1, 0)) |> +lm(formula = count ~ D * I(week - 26)) |> +summary() +longer[which(longer$observation_type == "all"),] |> +select(count, week) |> +mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> +ggplot(aes(x = week, y = count, color = D)) + +geom_point() + +geom_smooth(se = FALSE) + +geom_vline(xintercept = 26) +longer[which(longer$observation_type == "all"),] |> +select(count, week) |> +mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> +ggplot(aes(x = week, y = count, color = D)) + +geom_point() + +geom_smooth(se = TRUE) + +geom_vline(xintercept = 26) +#testing out analysis below +longer[which(longer$observation_type == "all"),] |> +ggplot(aes(x = week, y = count)) + +geom_point() + +geom_vline(xintercept = 25.5) +longer[which(longer$observation_type == "all"),] |> +mutate(D = ifelse(week >= 26, 1, 0)) |> +lm(formula = count ~ D * I(week - 26)) |> +summary() +longer[which(longer$observation_type == "all"),] |> +select(count, week) |> +mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> +ggplot(aes(x = week, y = count, color = D)) + +geom_point() + +geom_smooth(se = TRUE) + +geom_vline(xintercept = 25.5) +#testing out analysis below +longer[which(longer$observation_type == "all"),] |> +ggplot(aes(x = week, y = count)) + +geom_point() + +geom_vline(xintercept = 26) +longer[which(longer$observation_type == "all"),] |> +mutate(D = ifelse(week >= 26, 1, 0)) |> +lm(formula = count ~ D * I(week - 26)) |> +summary() +longer[which(longer$observation_type == "all"),] |> +select(count, week) |> +mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> +ggplot(aes(x = week, y = count, color = D)) + +geom_point() + +geom_smooth(se = TRUE) + +geom_vline(xintercept = 26) +library(rdd-package) +library(rdd) +library(rdd) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design +new_test <- readme_df[697,] +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer %>% +# filter(week >= (26 - window_num) & week <= (26 + window_num)) +IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") +#testing out analysis below +longer[which(longer$observation_type == "all"),] |> +ggplot(aes(x = week, y = count)) + +geom_point() + +geom_vline(xintercept = 26) +longer[which(longer$observation_type == "all"),] |> +mutate(D = ifelse(week >= 26, 1, 0)) |> +lm(formula = count ~ D * I(week - 26)) |> +summary() +longer[which(longer$observation_type == "all"),] |> +select(count, week) |> +mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> +ggplot(aes(x = week, y = count, color = D)) + +geom_point() + +geom_smooth(se = TRUE) + +geom_vline(xintercept = 26) +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design +new_test <- readme_df[0,] +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer %>% +# filter(week >= (26 - window_num) & week <= (26 + window_num)) +IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design +new_test <- readme_df[3,] +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer %>% +# filter(week >= (26 - window_num) & week <= (26 + window_num)) +IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") +# test_two <- c() +# iterator <- 0 +# for (entry in test) { +# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) +# print(as.numeric(unlist(entry))) +# iterator <- iterator + 1 +# } +# test_two +#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step +# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design +new_test <- readme_df[9,] +longer <- new_test |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer %>% +# filter(week >= (26 - window_num) & week <= (26 + window_num)) +IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") +get_optimal_window <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +optimal_bandwidth <- IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") +return(optimal_bandwidth) +} +bandwidths <- c() +for (i in 1:nrow(readme_df)){ +bandwidths <- c(bandwidths, get_optimal_window(readme_df[i,])) +} +bandwidths +mean(bandwidths) +median(bandwidths) +get_optimal_window <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +longer <- longer[which(longer$observation_type == "all"),] +optimal_bandwidth <- IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") +return(optimal_bandwidth) +} +bandwidths <- c() +for (i in 1:nrow(readme_df)){ +bandwidths <- c(bandwidths, get_optimal_window(readme_df[i,])) +} +mean(bandwidths) +median(bandwidths) +bandwidths <- c() +for (i in 1:nrow(readme_df)){ +bandwidth <- get_optimal_window(readme_df[i,]) +bandwidths <- c(bandwidths, bandwidth) +} +mean(bandwidths) +median(bandwidths) +get_optimal_window <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#this below line makes the code specific to the all-commits data +longer <- longer[which(longer$observation_type == "all"),] +result <- tryCatch({ +optimal_bandwidth <- IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") +return(optimal_bandwidth) +}, error = function(e){ +return(8) +}) +} +bandwidths <- c() +for (i in 1:nrow(readme_df)){ +bandwidth <- get_optimal_window(readme_df[i,]) +bandwidths <- c(bandwidths, bandwidth) +} +mean(bandwidths) +median(bandwidths) +mode(bandwidths) +table(bandwidths) +mean(bandwidths) # +median(bandwidths) +# this is the file with the lmer multi-level rddAnalysis +# 0 loading the readme data in +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +# 1 preprocessing +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") +readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") +readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") +readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +# 2 some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer[which(longer$observation_type == "all"),] +return(longer) +} +expanded_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,])) +} +View(expanded_data) +View(expanded_data) +View(expanded_data) +View(expanded_data) +View(expanded_data) +get_optimal_window <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#this below line makes the code specific to the all-commits data +longer <- longer[which(longer$observation_type == "all"),] +result <- tryCatch({ +#Imbens-Kalyanaraman Optimal Bandwidth Calculation +optimal_bandwidth <- IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") +return(optimal_bandwidth) +}, error = function(e){ +return(9) +}) +} +#this just gets the optimal bandwith window for each project and then appends to lists +bandwidths <- c() +for (i in 1:nrow(readme_df)){ +bandwidth <- get_optimal_window(readme_df[i,]) +bandwidths <- c(bandwidths, bandwidth) +} +mean(bandwidths) #8.574233 +median(bandwidths) #8.363088 +table(bandwidths) +#filter out the timewindows +window_num <- 8 +expanded_data |> +filter(week >= (26 - window_num) & week <= (26 + window_num)) +expanded_data |> +filter(week >= (26 - window_num) & week <= (26 + window_num)) +# 3 rdd in lmer analysis +library(lme4) +draft_model <- lmer(count ~ D * I(week - 26) + upstream_vcs_link, data=expanded_data[which(longer$observation_type == "all"),]) +expanded_data |> +filter(week >= (26 - window_num) & week <= (26 + window_num)) |> +mutate(D = ifelse(week >= 26, 1, 0)) +# 3 rdd in lmer analysis +library(lme4) +draft_model <- lmer(count ~ D * I(week - 26) + upstream_vcs_link, data=expanded_data[which(longer$observation_type == "all"),]) +summary(draft_model) +View(expanded_data) +#filter out the timewindows +window_num <- 8 +expanded_data <- expanded_data |> +filter(week >= (26 - window_num) & week <= (26 + window_num)) |> +mutate(D = ifelse(week >= 26, 1, 0)) +draft_model <- lmer(count ~ D * I(week - 26) + upstream_vcs_link, data=expanded_data[which(longer$observation_type == "all"),]) +summary(draft_model) +draft_model <- lmer(count ~ D * I(week - 26) + upstream_vcs_link, REML=FALSE, data=expanded_data[which(longer$observation_type == "all"),]) +draft_model <- lmer(count ~ D * I(week - 26) + upstream_vcs_link, REML=FALSE, data=expanded_data[which(longer$observation_type == "all"),]) +draft_model <- lmer(count ~ D * I(week - 26) + (1|upstream_vcs_link), REML=FALSE, data=expanded_data[which(longer$observation_type == "all"),]) +summary(draft_model) diff --git a/R/didAnalysis.R b/R/didAnalysis.R index e69de29..baa9fe5 100644 --- a/R/didAnalysis.R +++ b/R/didAnalysis.R @@ -0,0 +1,40 @@ +# this is the file with the lmer multi-level rddAnalysis +# 0 loading the readme data in +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +# 1 preprocessing +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") +readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") +readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") +readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +# 2 some expansion needs to happens for each project +expand_timeseries <- function(project_row) { + longer <- project_row |> + pivot_longer(cols = starts_with("ct"), + names_to = "window", + values_to = "count") |> + unnest(count) + longer$observation_type <- gsub("^.*_", "", longer$window) + longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) + longer$count <- as.numeric(longer$count) + #longer <- longer[which(longer$observation_type == "all"),] + return(longer) +} +expanded_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ + expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,])) +} +#filter out the timewindows +window_num <- 8 +expanded_data <- expanded_data |> + filter(week >= (26 - window_num) & week <= (26 + window_num)) |> + mutate(D = ifelse(week >= 26, 1, 0)) +# 3 rdd in lmer analysis +library(lme4) +draft_model <- lmer(count ~ D * I(week - 26) + (1|upstream_vcs_link), REML=FALSE, data=expanded_data[which(longer$observation_type == "all"),]) +summary(draft_model) diff --git a/R/didCleaning.R b/R/didCleaning.R index 22b4810..aac1c99 100644 --- a/R/didCleaning.R +++ b/R/didCleaning.R @@ -52,19 +52,24 @@ get_optimal_window <- function(project_row) { optimal_bandwidth <- IKbandwidth(longer$week, longer$count, cutpoint = 26, verbose = FALSE, kernel = "triangular") return(optimal_bandwidth) }, error = function(e){ + #have tested it with multiple different error-values and all medians/means still hover around 8 return(8) }) } +#this just gets the optimal bandwith window for each project and then appends to lists bandwidths <- c() for (i in 1:nrow(readme_df)){ bandwidth <- get_optimal_window(readme_df[i,]) bandwidths <- c(bandwidths, bandwidth) } -mean(bandwidths) #8.574233 +mean(bandwidths) +#8.574233 median(bandwidths) #8.363088 table(bandwidths) -#window_num <- 13 +#from this, I think setting the bandwidth to 8 weeks, two months, the floor +# of both the median and mean calculations + #longer <- longer %>% # filter(week >= (26 - window_num) & week <= (26 + window_num))