1
0
govdoc-cr-analysis/cleaning_scripts/verifying_commit_activity.R

84 lines
2.9 KiB
R
Raw Normal View History

2025-02-06 21:51:46 +00:00
# test data directory: /gscratch/comdata/users/mjilg/program_testing/
# load in the partitioned directories
library(dplyr)
library(lubridate)
# Input locations. Every path below is absolute and hard-coded, so the script
# can only run on a machine where these locations exist.
#
# Publication-metadata CSVs, one for the README set and one for the
# CONTRIBUTING set.
readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/README_publication_commits.csv"
contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/CONTRIBUTING_publication_commits.csv"
# Directories that are scanned for commit CSV files (note the trailing slash).
readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/readme/"
contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/contributing/"
# A single commit CSV used to try the per-file transform on its own.
test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv"
# Loaded when the script is sourced, for inspection; nothing in the code
# visible here reads test_df afterwards.
test_df <- read.csv(test_file, header = TRUE)
#' Gap between the first two README-touching commits in one commit file.
#'
#' Reads a commit CSV, flags every commit whose `diff_info` text contains
#' "README", sorts the flagged commits by `commit_date`, and measures the gap
#' between the two earliest ones.
#'
#' @param filepath Path to a commit CSV. The file must have `commit_date` and
#'   `diff_info` columns; the name is expected to look like
#'   `_<project>_commits.csv`.
#' @param ref_df Publication-metadata data frame. It is not used by the
#'   current implementation and is kept only so existing callers keep working.
#' @return A single numeric value: the gap in hours. Both timestamps are
#'   reduced to calendar dates first, so the result is a whole number of days
#'   expressed in hours. `NA_real_` when fewer than two commits are flagged.
transform_commit_data <- function(filepath, ref_df) {
  df <- read.csv(filepath, header = TRUE)

  # Project id = file name without the "_commits.csv" suffix and without the
  # leading underscore.
  project_id <- sub("_commits\\.csv$", "", basename(filepath))
  project_id <- sub("^_", "", project_id)

  # Parse timestamps as UTC, tag rows with the project id, and replace the
  # bulky diff text by a 0/1 README flag.
  # NOTE(review): the pattern is always "README", including when this function
  # is run on files from the contributing directory -- confirm that is
  # intended.
  df <- df |>
    mutate(commit_date = ymd_hms(commit_date, tz = "UTC")) |>
    mutate(project_id = project_id) |>
    mutate(has_readme = ifelse(grepl("README", diff_info), 1, 0)) |>
    select(-diff_info)

  df_has_readme <- df |>
    filter(has_readme == 1) |>
    arrange(commit_date)
  # Debug output: shows the earliest flagged commits for every processed file.
  print(head(df_has_readme))

  # A gap needs two flagged commits; report a numeric NA otherwise so that
  # downstream unlist()/quantile() calls keep working on a numeric vector.
  if (nrow(df_has_readme) < 2) {
    return(NA_real_)
  }

  # NOTE(review): as.Date() drops the time of day before the difference is
  # taken, so the "hours" value is always a multiple of 24 -- confirm that
  # day-level resolution is what the analysis wants.
  first_date <- as.Date(df_has_readme$commit_date[1])
  second_date <- as.Date(df_has_readme$commit_date[2])
  as.numeric(difftime(second_date, first_date, units = "hours"))
}
# Try the per-file transform on the single test file.
# NOTE(review): test_file sits under a "contributing" directory while the
# metadata passed here is the README one -- confirm the pairing.
test_delta <- transform_commit_data(
  filepath = test_file,
  ref_df = read.csv(file = readme_pub_info)
)
#' Run `transform_commit_data()` on every commit CSV of one document set.
#'
#' @param is_readme `TRUE` to process `readme_dir` with the metadata from
#'   `readme_pub_info`; `FALSE` to process `contributing_dir` with the
#'   metadata from `contributing_pub_info`.
#' @return An unnamed list with one element per CSV file, in the order
#'   returned by `list.files()`, each holding the value returned by
#'   `transform_commit_data()` for that file. An empty list when the
#'   directory contains no CSV files.
transform_directory_of_commit_data <- function(is_readme) {
  # Read only the metadata file that belongs to the requested set.
  if (is_readme) {
    ref_df <- read.csv(readme_pub_info)
    dir_path <- readme_dir
  } else {
    ref_df <- read.csv(contributing_pub_info)
    dir_path <- contributing_dir
  }

  # `pattern` is a regular expression, not a glob: anchor on the literal
  # ".csv" extension so names such as "x.csv.bak" are not picked up.
  file_list <- list.files(
    path = dir_path,
    pattern = "\\.csv$",
    full.names = TRUE
  )

  lapply(file_list, transform_commit_data, ref_df = ref_df)
}
# README set: per-file gaps flattened to a numeric vector, then summarised as
# quartiles and as deciles. NA entries (files with fewer than two flagged
# commits) are dropped by na.rm.
readme_dl <- transform_directory_of_commit_data(TRUE)
readme_vec <- unlist(readme_dl)
quantile(readme_vec, na.rm = TRUE)
quantile(readme_vec, probs = seq(0, 1, by = 0.1), na.rm = TRUE)

# CONTRIBUTING set: same summaries.
contributing_dl <- transform_directory_of_commit_data(FALSE)
# Flatten the list itself; no object named `contributing_dl.value` is defined
# in this script. The vector keeps its existing (misspelled) name in case
# other code refers to it.
contributinng_vec <- unlist(contributing_dl)
quantile(contributinng_vec, na.rm = TRUE)
quantile(contributinng_vec, probs = seq(0, 1, by = 0.1), na.rm = TRUE)