# test data directory: /gscratch/comdata/users/mjilg/program_testing/
# load in the partitioned directories
library(dplyr)
library(lubridate)

# Input locations ----
# for a given file we want to get the count data and produce a csv
readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/README_publication_commits.csv"
contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/CONTRIBUTING_publication_commits.csv"
readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/readme/"
contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/contributing/"

test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv"
test_df <- read.csv(test_file, header = TRUE)

# Per-file transformation ----

# Compute the gap between the first two commits of one project whose
# `diff_info` mentions "README".
#
# Args:
#   filepath: path to a commit CSV; the code reads its `commit_date` and
#     `diff_info` columns. The project id is derived from the file name by
#     stripping a trailing "_commits.csv" and a leading "_".
#   ref_df: publication metadata data frame. Currently unused; kept so the
#     signature stays stable for existing callers.
#
# Returns:
#   A numeric scalar: hours between the calendar dates of the first and
#   second matching commits, or NA_real_ when fewer than two commits match.
transform_commit_data <- function(filepath, ref_df) {
  # basic, loading in the file
  df <- read.csv(filepath, header = TRUE)

  # isolate project id
  file_name <- basename(filepath)
  project_id <- sub("_commits\\.csv$", "", file_name)
  project_id <- sub("^_", "", project_id)

  # make sure the dates are formatted correctly and state the project_id
  df <- df |>
    mutate(commit_date = ymd_hms(commit_date, tz = "UTC")) |>
    mutate(project_id = project_id)

  # flag the commits whose diff mentions README, then drop the bulky diff text
  # NOTE(review): "README" is matched even when the file comes from the
  # contributing directory -- confirm this is intended.
  df <- df |>
    mutate(has_readme = ifelse(grepl("README", diff_info), 1, 0)) |>
    select(-diff_info)

  df_has_readme <- df |>
    filter(has_readme == 1) |>
    arrange(commit_date)
  print(head(df_has_readme))

  # With fewer than two matching commits there is no interval to measure.
  if (nrow(df_has_readme) < 2) {
    return(NA_real_)
  }

  # NOTE(review): as.Date() drops the time of day, so the difference is
  # between calendar days even though it is reported in hours -- confirm
  # that day-level resolution is what is wanted.
  first_date <- as.Date(df_has_readme$commit_date[1])
  second_date <- as.Date(df_has_readme$commit_date[2])
  as.numeric(difftime(second_date, first_date, units = "hours"))
}

test_delta <- transform_commit_data(test_file, read.csv(readme_pub_info))

# Directory-level driver ----

# Apply transform_commit_data() to every CSV in the README or CONTRIBUTING
# commit directory.
#
# Args:
#   is_readme: TRUE to process `readme_dir` with the README publication
#     metadata; FALSE to process `contributing_dir` with the CONTRIBUTING
#     publication metadata.
#
# Returns:
#   An unnamed list with one numeric delta (possibly NA) per CSV file, in
#   the order returned by list.files().
transform_directory_of_commit_data <- function(is_readme) {
  # Only read the metadata file that is actually needed.
  if (is_readme) {
    ref_df <- read.csv(readme_pub_info)
    dir_path <- readme_dir
  } else {
    ref_df <- read.csv(contributing_pub_info)
    dir_path <- contributing_dir
  }

  # `pattern` is a regular expression, not a glob: anchor on the extension.
  file_list <- list.files(
    path = dir_path,
    pattern = "\\.csv$",
    full.names = TRUE
  )

  lapply(file_list, transform_commit_data, ref_df = ref_df)
}

# Summaries ----

readme_dl <- transform_directory_of_commit_data(TRUE)
readme_vec <- unlist(readme_dl)
quantile(readme_vec, na.rm = TRUE)
quantile(readme_vec, probs = seq(0, 1, by = 0.1), na.rm = TRUE)

contributing_dl <- transform_directory_of_commit_data(FALSE)
contributinng_vec <- unlist(contributing_dl)
quantile(contributinng_vec, na.rm = TRUE)
quantile(contributinng_vec, probs = seq(0, 1, by = 0.1), na.rm = TRUE)