84 lines
2.9 KiB
R
84 lines
2.9 KiB
R
# test data directory: /gscratch/comdata/users/mjilg/program_testing/
|
|
# load in the partitioned directories
|
|
library(dplyr)
|
|
library(lubridate)
|
|
|
|
#for a given file we want to get the count data and produce a csv
|
|
readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/README_publication_commits.csv"
|
|
contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/CONTRIBUTING_publication_commits.csv"
|
|
readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/readme/"
|
|
contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/contributing/"
|
|
|
|
test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv"
|
|
test_df <- read.csv(test_file, header = TRUE)
|
|
|
|
|
|
transform_commit_data <- function(filepath, ref_df){
|
|
#basic, loading in the file
|
|
df = read.csv(filepath, header = TRUE)
|
|
#temp_df <- df
|
|
dir_path = dirname(filepath)
|
|
file_name = basename(filepath)
|
|
|
|
# isolate project id
|
|
project_id <- sub("_commits\\.csv$", "", file_name)
|
|
project_id <- sub("^_", "", project_id)
|
|
|
|
#make sure the dates are formatted correctly and state the project_id
|
|
df <- df |>
|
|
mutate(commit_date = ymd_hms(commit_date, tz = "UTC")) |>
|
|
mutate(project_id = project_id)
|
|
head(df)
|
|
#find the publication entry, in the specified df
|
|
|
|
df <- df |>
|
|
mutate(has_readme = ifelse(grepl("README", diff_info), 1, 0))|>
|
|
select(-diff_info)
|
|
|
|
df_has_readme <- df |>
|
|
filter(has_readme == 1) |>
|
|
arrange(commit_date)
|
|
|
|
print(head(df_has_readme))
|
|
|
|
if(nrow(df_has_readme) < 2) {
|
|
date_delta <- "NA"
|
|
}
|
|
|
|
first_date <- as.Date(df_has_readme$commit_date[1])
|
|
second_date <- as.Date(df_has_readme$commit_date[2])
|
|
|
|
date_delta <- as.numeric(difftime(second_date, first_date, units = "hours"))
|
|
|
|
return(date_delta)
|
|
}
|
|
|
|
test_delta <- transform_commit_data(test_file, read.csv(readme_pub_info))
|
|
|
|
transform_directory_of_commit_data <- function(is_readme) {
|
|
ref_df <- read.csv(contributing_pub_info)
|
|
dir_path <- contributing_dir
|
|
if (is_readme){
|
|
ref_df <- read.csv(readme_pub_info)
|
|
dir_path <- readme_dir
|
|
}
|
|
delta_list <- list()
|
|
file_list <- list.files(path = dir_path, pattern = "*.csv", full.names = TRUE)
|
|
for (filepath in file_list) {
|
|
delta <- transform_commit_data(filepath, ref_df)
|
|
delta_list <- append(delta_list, delta)
|
|
}
|
|
|
|
return(delta_list)
|
|
}
|
|
|
|
readme_dl <- transform_directory_of_commit_data(TRUE)
|
|
readme_vec <- unlist(readme_dl)
|
|
quantile(readme_vec, na.rm=TRUE)
|
|
quantile(readme_vec, probs = seq(0, 1, by = 0.1), na.rm=TRUE)
|
|
|
|
contributing_dl <- transform_directory_of_commit_data(FALSE)
|
|
contributinng_vec <- unlist(contributing_dl.value)
|
|
quantile(contributinng_vec, na.rm=TRUE)
|
|
quantile(contributinng_vec, probs = seq(0, 1, by = 0.1), na.rm=TRUE)
|