library(tidyverse)
# test data directory: /gscratch/comdata/users/mjilg/program_testing/
# load in the partitioned directories
library(dplyr)
library(lubridate)

# for a given file we want to get the count data and produce a csv
readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/README_publication_commits.csv"
contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/metadata/CONTRIBUTING_publication_commits.csv"
readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/readme/"
contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/main_commit_data/contributing/"
# test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv"

#' Build the weekly commit panel for one project's commit CSV
#'
#' Reads a `<project>_commits.csv` file, finds the project's publication
#' commit in `ref_df`, and returns one row per week in the window of
#' `window_months` months before and after that publication date.
#'
#' @param filepath Path to a commit CSV. The code reads the columns
#'   `commit_date`, `author_email` and `committer_email`.
#' @param ref_df Publication metadata; the code reads the columns `repo_id`
#'   and `commit_date`.
#' @param snapshot_date Date used as "the present" when computing `age`.
#' @param window_months Number of months kept on each side of the
#'   publication date.
#' @return A tibble with columns `project_id`, `week`, `age`,
#'   `age_at_commit`, `commit_count`, `before_after`, `week_index`,
#'   `new_author_emails` and `new_committer_emails`, ordered by week.
transform_commit_data <- function(filepath,
                                  ref_df,
                                  snapshot_date = as.Date("2024-06-24"),
                                  window_months = 6) {
  # basic, loading in the file
  df <- read.csv(filepath, header = TRUE)

  # isolate project id from the file name: "_owner_repo_commits.csv"
  project_id <- sub("_commits\\.csv$", "", basename(filepath))
  project_id <- sub("^_", "", project_id)

  # parse the dates and derive the week as a Date, so that it has the same
  # type as the filler grid it is joined against further down
  df <- df |>
    mutate(
      commit_date = ymd_hms(commit_date),
      week = as.Date(floor_date(commit_date, "week"))
    )

  # rows whose timestamp failed to parse cannot be placed in any week
  n_unparsed <- sum(is.na(df$commit_date))
  if (n_unparsed > 0) {
    warning(
      "Dropping ", n_unparsed, " commit(s) with unparseable dates in ",
      filepath,
      call. = FALSE
    )
    df <- df |> filter(!is.na(commit_date))
  }

  # find the publication entry in the specified reference df
  matched_entry <- ref_df |>
    filter(repo_id == .env$project_id)
  if (nrow(matched_entry) == 0) {
    stop(
      "No publication entry found for project '", project_id, "' (",
      filepath, ")",
      call. = FALSE
    )
  }
  publication_date <- min(as.Date(matched_entry$commit_date))

  # project age in days, either at the snapshot date or at publication
  oldest_commit_date <- min(as.Date(df$commit_date))
  project_age <- as.numeric(snapshot_date - oldest_commit_date)
  age_at_commit <- as.numeric(publication_date - oldest_commit_date)

  # we are looking at weekly data, window_months before and after
  start_date <- publication_date %m-% months(window_months)
  end_date <- publication_date %m+% months(window_months)
  introduction_week <- floor_date(publication_date, "week")

  # filler grid so that weeks without commits still get a row
  all_weeks <- seq.Date(
    floor_date(start_date, "week"),
    floor_date(end_date, "week"),
    by = "week"
  )
  weekly_grid <- tibble(
    project_id = project_id,
    week = all_weeks,
    age = project_age,
    age_at_commit = age_at_commit
  )

  # first week each author / committer ever appears, over the FULL history.
  # A contributor is "new" in a week only if they were never seen before it;
  # comparing against the preceding row alone would miscount after any gap.
  df <- df |>
    group_by(author_email) |>
    mutate(author_first_week = min(week)) |>
    group_by(committer_email) |>
    mutate(committer_first_week = min(week)) |>
    ungroup()

  # now cut out the commit data that we don't care about and summarise
  weekly_counts <- df |>
    filter(
      as.Date(commit_date) >= start_date,
      as.Date(commit_date) <= end_date
    ) |>
    group_by(week) |>
    summarise(
      commit_count = n(),
      new_author_emails =
        n_distinct(author_email[author_first_week == week]),
      new_committer_emails =
        n_distinct(committer_email[committer_first_week == week]),
      .groups = "drop"
    )

  # start from the grid so rows stay in chronological order, fill the empty
  # weeks with zeros, then add the before/after flag and the weekly index
  weekly_grid |>
    left_join(weekly_counts, by = "week") |>
    replace_na(list(
      commit_count = 0L,
      new_author_emails = 0L,
      new_committer_emails = 0L
    )) |>
    mutate(
      before_after = if_else(week < introduction_week, 0, 1),
      week_index = as.integer(round(as.numeric(
        difftime(week, introduction_week, units = "weeks")
      )))
    ) |>
    select(
      project_id, week, age, age_at_commit, commit_count,
      before_after, week_index, new_author_emails, new_committer_emails
    )
}

#' Transform every commit CSV in the README or CONTRIBUTING directory
#'
#' @param is_readme If `TRUE`, use `readme_pub_info` / `readme_dir`;
#'   otherwise use `contributing_pub_info` / `contributing_dir`.
#' @return The per-project weekly panels stacked with `bind_rows()`.
transform_directory_of_commit_data <- function(is_readme) {
  if (is_readme) {
    ref_df <- read.csv(readme_pub_info)
    dir_path <- readme_dir
  } else {
    ref_df <- read.csv(contributing_pub_info)
    dir_path <- contributing_dir
  }

  # `pattern` is a regular expression, not a glob
  file_list <- list.files(
    path = dir_path,
    pattern = "\\.csv$",
    full.names = TRUE
  )

  counted_list <- lapply(file_list, transform_commit_data, ref_df = ref_df)
  bind_rows(counted_list)
}

# below is for contributing file
# test_big_df <- transform_directory_of_commit_data(is_readme=FALSE)
# output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/CONTRIBUTING_weekly_count_data.csv"

# below is for readme
big_df <- transform_directory_of_commit_data(is_readme = TRUE)
output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/final_data/README_weekly_count_data.csv"

# validation testing
length(unique(big_df$project_id))
# filtered_df <- test_big_df %>%
#   filter(commit_count != 0, new_author_emails == 0, new_committer_emails == 0)

# write the combined panel out
write.csv(big_df, output_filepath, row.names = FALSE)