Amended patch for R/didCleaning.R, with line breaks restored from a collapsed copy.
Indentation and the positions of blank context lines are reconstructed, and the
post-image blob id on the index line predates these amendments; apply with
`git apply --ignore-whitespace` or regenerate the patch from a working tree.

diff --git a/R/didCleaning.R b/R/didCleaning.R
index 2c15fc2..22b4810 100644
--- a/R/didCleaning.R
+++ b/R/didCleaning.R
@@ -1,5 +1,6 @@
 library(plyr)
 library(tidyverse)
+library(rdd)
 
 #set wd, read in data
 
@@ -32,17 +33,77 @@ readme_df = readme_df[,!(names(readme_df) %in% drop)]
 # }
 # test_two
 #Yes, need to expand the dataframe, but again, for the sake of clarity, do not want to until analysis step
-new_test <- readme_df[231,]
-longer <- new_test |>
-  pivot_longer(cols = starts_with("ct"),
-               names_to = "window",
-               values_to = "count") |>
-  unnest(count)
-longer$observation_type <- gsub("^.*_", "", longer$window)
-longer <- ddply(longer, "observation_type", transform, week=seq(from=0, by=1, length.out=length(observation_type)))
-longer$count <- as.numeric(longer$count)
+# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
+## https://www.rdocumentation.org/packages/lme4/versions/1.1-35.2/topics/lmer
+
+#' Reshape one project row into long format.
+#'
+#' Pivots every column whose name starts with "ct" into (window, count) rows,
+#' unnests the counts, and numbers the rows within each observation type (the
+#' part of the column name after the last underscore) from week 0 upward.
+#'
+#' @param project_row A one-row slice of readme_df.
+#' @return The long data frame with window, count (numeric), observation_type
+#'   and week columns added.
+expand_project_row <- function(project_row) {
+  longer <- project_row |>
+    pivot_longer(cols = starts_with("ct"),
+                 names_to = "window",
+                 values_to = "count") |>
+    unnest(count)
+  longer$observation_type <- gsub("^.*_", "", longer$window)
+  longer <- ddply(longer, "observation_type", transform,
+                  week = seq(from = 0, by = 1, length.out = length(observation_type)))
+  longer$count <- as.numeric(longer$count)
+  longer
+}
+
+#' Imbens-Kalyanaraman optimal bandwidth for one project.
+#'
+#' @param project_row A one-row slice of readme_df.
+#' @param cutpoint Week at which the discontinuity is placed.
+#' @param observation Observation type to keep; "all" is the all-commits data.
+#' @param fallback Bandwidth returned when IKbandwidth() signals an error.
+#' @return The bandwidth from IKbandwidth(), or fallback (with a warning, so
+#'   defaulted projects are visible when the values are summarised).
+get_optimal_window <- function(project_row, cutpoint = 26,
+                               observation = "all", fallback = 8) {
+  longer <- expand_project_row(project_row)
+  longer <- longer[which(longer$observation_type == observation), ]
+  tryCatch(
+    IKbandwidth(longer$week, longer$count, cutpoint = cutpoint,
+                verbose = FALSE, kernel = "triangular"),
+    error = function(e) {
+      warning("IKbandwidth() failed, using fallback bandwidth ", fallback,
+              ": ", conditionMessage(e), call. = FALSE)
+      fallback
+    }
+  )
+}
+
+# One bandwidth per project; seq_len() iterates zero times on an empty readme_df.
+bandwidths <- vapply(
+  seq_len(nrow(readme_df)),
+  function(i) get_optimal_window(readme_df[i, ]),
+  numeric(1)
+)
+mean(bandwidths) #8.574233
+median(bandwidths) #8.363088
+table(bandwidths)
+
+# The plotting pipeline below reads a top-level longer, so rebuild it here.
+new_test <- readme_df[9,]
+longer <- expand_project_row(new_test)
+#window_num <- 13
+
+#longer <- longer %>%
+#  filter(week >= (26 - window_num) & week <= (26 + window_num))
 
 #sapply(longer, class)
+#longer$biweekly <- ceiling(longer$week / 2)
+#longer <- longer %>%
+#  group_by(window, biweekly, observation_type) %>%
+#  summarise(biweekly_count = sum(count, na.rm = TRUE))
 
 #testing out analysis below
 longer[which(longer$observation_type == "all"),] |>
@@ -60,4 +121,6 @@ longer[which(longer$observation_type == "all"),] |>
   mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |>
   ggplot(aes(x = week, y = count, color = D)) +
   geom_point() +
-  geom_smooth(se = FALSE)
+  geom_smooth(se = TRUE) +
+  geom_vline(xintercept = 26)
+