diff --git a/R/.Rhistory b/R/.Rhistory index a8a84cc..b1eb01c 100644 --- a/R/.Rhistory +++ b/R/.Rhistory @@ -1,68 +1,3 @@ -geom_vline(xintercept = 26) -window <- 26 -longer <- longer %>% -filter(week >= (26 - window) & week <= (26 + window)) -window_num <- 26 -longer <- longer %>% -filter(week >= (26 - window_num) & week <= (26 + window_num)) -#testing out analysis below -longer[which(longer$observation_type == "all"),] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -longer[which(longer$observation_type == "all"),] |> -mutate(D = ifelse(week >= 26, 1, 0)) |> -lm(formula = count ~ D * I(week - 26)) |> -summary() -longer[which(longer$observation_type == "all"),] |> -select(count, week) |> -mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> -ggplot(aes(x = week, y = count, color = D)) + -geom_point() + -geom_smooth(se = FALSE) + -geom_vline(xintercept = 26) -window_num <- 27 -longer <- longer %>% -filter(week >= (26 - window_num) & week <= (26 + window_num)) -# test_two <- c() -# iterator <- 0 -# for (entry in test) { -# readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry)) -# print(as.numeric(unlist(entry))) -# iterator <- iterator + 1 -# } -# test_two -#Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step -# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design -new_test <- readme_df[450,] -longer <- new_test |> -pivot_longer(cols = starts_with("ct"), -names_to = "window", -values_to = "count") |> -unnest(count) -longer$observation_type <- gsub("^.*_", "", longer$window) -longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) -longer$count <- as.numeric(longer$count) -window_num <- 27 -longer <- longer %>% -filter(week >= (26 - window_num) & week <= (26 + window_num)) -#testing out analysis below -longer[which(longer$observation_type == "all"),] |> -ggplot(aes(x = week, y = count)) + -geom_point() + -geom_vline(xintercept = 26) -longer[which(longer$observation_type == "all"),] |> -mutate(D = ifelse(week >= 26, 1, 0)) |> -lm(formula = count ~ D * I(week - 26)) |> -summary() -longer[which(longer$observation_type == "all"),] |> -select(count, week) |> -mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |> -ggplot(aes(x = week, y = count, color = D)) + -geom_point() + -geom_smooth(se = FALSE) + -geom_vline(xintercept = 26) -window_num <- 20 longer <- longer %>% filter(week >= (26 - window_num) & week <= (26 + window_num)) #testing out analysis below @@ -510,3 +445,68 @@ draft_model <- lmer(count ~ D * I(week - 26) + upstream_vcs_link, REML=FALSE, da draft_model <- lmer(count ~ D * I(week - 26) + upstream_vcs_link, REML=FALSE, data=expanded_data[which(longer$observation_type == "all"),]) draft_model <- lmer(count ~ D * I(week - 26) + (1|upstream_vcs_link), REML=FALSE, data=expanded_data[which(longer$observation_type == "all"),]) summary(draft_model) +# this is the file with the lmer multi-level rddAnalysis +# 0 loading the readme data in +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +# this is the file with the lmer multi-level rddAnalysis +library(tidyverse) +# 0 loading the readme data in +try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) +readme_df <- read_csv("../final_data/deb_readme_did.csv") +# 1 preprocessing +colnames(readme_df) <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct", "before_auth_new", "after_commit_new", "after_auth_new", "before_commit_new") +col_order <- c("upstream_vcs_link", "event_date", "event_hash", "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct", "before_auth_new", "after_auth_new", "before_commit_new", "after_commit_new") +readme_df <- readme_df[,col_order] +readme_df$ct_before_all <- str_split(gsub("[][]","", readme_df$before_all_ct), ", ") +readme_df$ct_after_all <- str_split(gsub("[][]","", readme_df$after_all_ct), ", ") +readme_df$ct_before_mrg <- str_split(gsub("[][]","", readme_df$before_mrg_ct), ", ") +readme_df$ct_after_mrg <- str_split(gsub("[][]","", readme_df$after_mrg_ct), ", ") +drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct") +readme_df = readme_df[,!(names(readme_df) %in% drop)] +# 2 some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer[which(longer$observation_type == "all"),] +return(longer) +} +expanded_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,])) +} +library(plyr) +# 2 some expansion needs to happens for each project +expand_timeseries <- function(project_row) { +longer <- project_row |> +pivot_longer(cols = starts_with("ct"), +names_to = "window", +values_to = "count") |> +unnest(count) +longer$observation_type <- gsub("^.*_", "", longer$window) +longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type))) +longer$count <- as.numeric(longer$count) +#longer <- longer[which(longer$observation_type == "all"),] +return(longer) +} +expanded_data <- expand_timeseries(readme_df[1,]) +for (i in 2:nrow(readme_df)){ +expanded_data <- rbind(expanded_data, expand_timeseries(readme_df[i,])) +} +#filter out the timewindows +window_num <- 8 +expanded_data <- expanded_data |> +filter(week >= (26 - window_num) & week <= (26 + window_num)) |> +mutate(D = ifelse(week >= 26, 1, 0)) +# 3 rdd in lmer analysis +library(lme4) +draft_model <- lmer(count ~ D * I(week - 26) + (1|as.factor(upstream_vcs_link)), REML=FALSE, data=expanded_data[which(longer$observation_type == "all"),]) +draft_model <- lmer(count ~ D * I(week - 26) + (1|as.factor(upstream_vcs_link)), REML=FALSE, data=expanded_data[which(expanded_data$observation_type == "all"),]) +draft_model <- lmer(count ~ D * I(week - 26) + (1|upstream_vcs_link), REML=FALSE, data=expanded_data[which(expanded_data$observation_type == "all"),]) +summary(draft_model) diff --git a/R/didAnalysis.R b/R/didAnalysis.R index baa9fe5..d3890d3 100644 --- a/R/didAnalysis.R +++ b/R/didAnalysis.R @@ -1,4 +1,6 @@ # this is the file with the lmer multi-level rddAnalysis +library(tidyverse) +library(plyr) # 0 loading the readme data in try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path))) readme_df <- read_csv("../final_data/deb_readme_did.csv") @@ -36,5 +38,5 @@ expanded_data <- expanded_data |> mutate(D = ifelse(week >= 26, 1, 0)) # 3 rdd in lmer analysis library(lme4) -draft_model <- lmer(count ~ D * I(week - 26) + (1|upstream_vcs_link), REML=FALSE, data=expanded_data[which(longer$observation_type == "all"),]) +draft_model <- lmer(count ~ D * I(week - 26) + (1|upstream_vcs_link), REML=FALSE, data=expanded_data[which(expanded_data$observation_type == "all"),]) summary(draft_model)