# plyr is attached before the tidyverse so that dplyr's verbs mask plyr's.
library(plyr)
library(tidyverse)
library(rdd)

# Set wd, read in data ----
# The setwd() call only works inside RStudio; try() lets the script continue
# elsewhere, in which case the relative paths below resolve against the
# current working directory.
try(setwd(dirname(rstudioapi::getActiveDocumentContext()$path)))
readme_df <- read_csv("../final_data/deb_readme_did.csv")
contributing_df <- read_csv("../final_data/deb_contrib_did.csv")
full_df <- read_csv("../final_data/deb_full_data.csv")

# Preprocessing for contributing_df ----
# NOTE(review): this step was originally labelled "preprocessing for
# readme_df", but the names are assigned to contributing_df. readme_df is
# assumed to already carry the names listed in col_order -- confirm.
colnames(contributing_df) <- c(
  "upstream_vcs_link", "event_date", "event_hash",
  "before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct",
  "before_auth_new", "after_commit_new", "after_auth_new",
  "before_commit_new"
)
col_order <- c(
  "upstream_vcs_link", "event_date", "event_hash",
  "before_all_ct", "after_all_ct", "before_mrg_ct", "after_mrg_ct",
  "before_auth_new", "after_auth_new",
  "before_commit_new", "after_commit_new"
)

# Look up each project's age and name in full_df by upstream_vcs_link.
# Every link must resolve to exactly one row of full_df; otherwise the ages
# would be misaligned with contributing_df, so fail loudly instead.
link_matches <- match(
  contributing_df$upstream_vcs_link,
  full_df$upstream_vcs_link
)
duplicated_links <-
  full_df$upstream_vcs_link[duplicated(full_df$upstream_vcs_link)]
unresolved <- is.na(contributing_df$upstream_vcs_link) |
  is.na(link_matches) |
  contributing_df$upstream_vcs_link %in% duplicated_links
if (any(unresolved)) {
  stop(
    "upstream_vcs_link does not match exactly one row of full_df for ",
    sum(unresolved), " row(s) of contributing_df, starting at row ",
    which(unresolved)[1],
    call. = FALSE
  )
}
ages <- full_df$age_of_project[link_matches]
projects <- full_df$project_name[link_matches]
contributing_df$age_of_project <- ages
write.csv(
  contributing_df,
  "deb_contributing_data_4_19.csv",
  row.names = FALSE
)

# Preprocessing for readme_df ----
# Reorder the columns, then turn each bracketed, comma-separated count string
# (e.g. "[1, 2, 3]") into a list-column of character vectors.
readme_df <- readme_df[, col_order]
readme_df$ct_before_all <-
  str_split(gsub("[][]", "", readme_df$before_all_ct), ", ")
readme_df$ct_after_all <-
  str_split(gsub("[][]", "", readme_df$after_all_ct), ", ")
readme_df$ct_before_mrg <-
  str_split(gsub("[][]", "", readme_df$before_mrg_ct), ", ")
readme_df$ct_after_mrg <-
  str_split(gsub("[][]", "", readme_df$after_mrg_ct), ", ")
# The raw string columns are no longer needed once the list-columns exist.
drop <- c("before_all_ct", "before_mrg_ct", "after_all_ct", "after_mrg_ct")
readme_df <- readme_df[, !(names(readme_df) %in% drop)]

# Scratch code kept for reference:
# test <- readme_df$cnt_before_all
# as.numeric(unlist(test[1]))
# test_two <- c()
# iterator <- 0
# for (entry in test) {
#   readme_df$cnt_before_all[iterator] <- as.numeric(unlist(entry))
#   print(as.numeric(unlist(entry)))
#   iterator <- iterator + 1
# }
# test_two
# Yes, the dataframe needs to be expanded, but for the sake of clarity that is
# deferred until the analysis step.

# https://rpubs.com/phle/r_tutorial_regression_discontinuity_design
# https://www.rdocumentation.org/packages/lme4/versions/1.1-35.2/topics/lmer
new_test <- readme_df[9, ]

# Reshape one project row into long format: one row per (window, observation).
#
# project_row: a one-row slice of readme_df holding the ct_* list-columns.
# Returns a data frame with the columns of project_row plus:
#   window           - name of the source ct_* column
#   count            - the observation, converted to numeric
#   observation_type - suffix of the window name after the last "_"
#   week             - 0-based running index within each observation_type
expand_project_counts <- function(project_row) {
  longer <- project_row |>
    pivot_longer(
      cols = starts_with("ct"),
      names_to = "window",
      values_to = "count"
    ) |>
    unnest(count)
  longer$observation_type <- gsub("^.*_", "", longer$window)
  # Number the observations consecutively inside each observation type, so the
  # "before" entries are followed directly by the "after" entries.
  longer <- ddply(
    longer, "observation_type", transform,
    week = seq(from = 0, by = 1, length.out = length(observation_type))
  )
  longer$count <- as.numeric(longer$count)
  longer
}

# Imbens-Kalyanaraman optimal bandwidth for one project's all-commits series.
#
# project_row:        a one-row slice of readme_df.
# cutpoint:           week index of the discontinuity (default 26).
# fallback_bandwidth: value returned when IKbandwidth() raises an error
#                     (default 8).
# Returns the bandwidth computed by rdd::IKbandwidth() with a triangular
# kernel, or fallback_bandwidth if that computation fails.
get_optimal_window <- function(project_row, cutpoint = 26,
                               fallback_bandwidth = 8) {
  longer <- expand_project_counts(project_row)
  # This line makes the code specific to the all-commits data.
  longer <- longer[which(longer$observation_type == "all"), ]
  tryCatch(
    IKbandwidth(
      longer$week, longer$count,
      cutpoint = cutpoint, verbose = FALSE, kernel = "triangular"
    ),
    # Deliberate best-effort: tested with multiple different error values and
    # all medians/means still hover around 8.
    error = function(e) fallback_bandwidth
  )
}

# Get the optimal bandwidth window for every project.
bandwidths <- vapply(
  seq_len(nrow(readme_df)),
  function(i) get_optimal_window(readme_df[i, ]),
  numeric(1)
)
mean(bandwidths) # 8.574233
median(bandwidths) # 8.363088
table(bandwidths)
# From this, I think setting the bandwidth to 8 weeks (two months), the floor
# of both the median and mean calculations.

# longer <- longer %>%
#   filter(week >= (26 - window_num) & week <= (26 + window_num))
# sapply(longer, class)
# longer$biweekly <- ceiling(longer$week / 2)
# longer <- longer %>%
#   group_by(window, biweekly, observation_type) %>%
#   summarise(biweekly_count = sum(count, na.rm = TRUE))

# Testing out analysis below ----
# `longer` only exists inside get_optimal_window(), so build it here for the
# sample project. NOTE(review): presumably new_test is the row these
# exploratory plots were meant for -- confirm.
longer <- expand_project_counts(new_test)

longer[which(longer$observation_type == "all"), ] |>
  ggplot(aes(x = week, y = count)) +
  geom_point() +
  geom_vline(xintercept = 26)

# `formula` is named, so the piped data frame lands in lm()'s `data` argument.
longer[which(longer$observation_type == "all"), ] |>
  mutate(D = ifelse(week >= 26, 1, 0)) |>
  lm(formula = count ~ D * I(week - 26)) |>
  summary()

longer[which(longer$observation_type == "all"), ] |>
  select(count, week) |>
  mutate(D = as.factor(ifelse(week >= 26, 1, 0))) |>
  ggplot(aes(x = week, y = count, color = D)) +
  geom_point() +
  geom_smooth(se = TRUE) +
  geom_vline(xintercept = 26)