diff --git a/cleaning_scripts/get_weekly_commit_counts.R b/cleaning_scripts/get_weekly_commit_counts.R
index 3872fa7..dae4052 100644
--- a/cleaning_scripts/get_weekly_commit_counts.R
+++ b/cleaning_scripts/get_weekly_commit_counts.R
@@ -5,41 +5,145 @@ library(dplyr)
 library(lubridate)
+library(tidyr) #for replace_na() below
 
 #for a given file we want to get the count data and produce a csv
-test_file <- "/gscratch/comdata/users/mjilg/program_testing/core_2012-01-01_to_2014-12-31.csv"
-test_dir <- "/gscratch/comdata/users/mjilg/program_testing/"
+readme_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/13125_test_README_publication_commits.csv"
+contributing_pub_info <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/13125_test_CONTRIBUTING_publication_commits.csv"
+readme_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/readme/"
+contributing_dir <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/"
 
-transform_commit_data <- function(filepath){
+test_file <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/main_commit_data/contributing/_voxpupuli_beaker_commits.csv"
+
+transform_commit_data <- function(filepath, ref_df){
+  #load in the file
   df = read.csv(filepath, header = TRUE)
+  temp_df <- df
   dir_path = dirname(filepath)
   file_name = basename(filepath)
 
-  # transform the rows of commit data to weekly count data
-  project_name <- sub("_[0-9]{4}-[0-9]{2}-[0-9]{2}_to_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv$", "", file_name)
+  # isolate the project id from the file name
+  project_id <- sub("_commits\\.csv$", "", file_name)
+  project_id <- sub("^_", "", project_id)
 
+  #make sure the dates are formatted correctly and add the project_id
   df <- df |>
     mutate(commit_date = ymd_hms(commit_date)) |>
-    mutate(project_name = project_name)
+    mutate(project_id = project_id)
 
+  #find this project's publication entry in the reference df
+  matched_entry <- ref_df |>
+    filter(repo_id == project_id)
+  commit_date <- as.Date(matched_entry$commit_date)
+
+  #get the project's age, both in the "present" (as of 2024-06-24)
+  #and at the time of the publication commit
+  oldest_commit_date <- min(as.Date(df$commit_date))
+  project_age <- as.numeric(as.Date("2024-06-24") - oldest_commit_date)
+  age_at_commit <- as.numeric(commit_date - oldest_commit_date)
+
+  #add that to the data
+  df <- df |>
+    mutate(age = project_age,
+           age_at_commit = age_at_commit)
+
+  #we are looking at weekly data, 6 months before and 6 months after the publication commit
+  start_date <- commit_date %m-% months(6)
+  end_date <- commit_date %m+% months(6)
+  introduction_week <- floor_date(commit_date, "week")
+
+  #scaffold rows for weeks in which there were no commits
+  all_weeks <- seq.Date(floor_date(start_date, "week"), floor_date(end_date, "week"), by = "week")
+  complete_weeks_df <- expand.grid(week = all_weeks,
+                                   project_id = project_id,
+                                   age = project_age,
+                                   age_at_commit = age_at_commit)
+
+  #add a column with the floored week
+  df <- df |>
+    mutate(week = floor_date(commit_date, "week"))
+
+  #for each week, get the list of unique authors that committed
+  cumulative_authors <- df %>%
+    arrange(week) %>%
+    group_by(week) %>%
+    summarize(cumulative_author_emails = list(unique(author_email)), .groups = 'drop')
+  #and the same for committers
+  cumulative_committers <- df %>%
+    arrange(week) %>%
+    group_by(week) %>%
+    summarize(cumulative_committer_emails = list(unique(committer_email)), .groups = 'drop')
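+
+  # a minimal sketch of what the two summaries above produce, guarded with
+  # if (FALSE) so it never runs; the toy values are made up and only the
+  # column names mirror the real input
+  if (FALSE) {
+    toy <- data.frame(commit_date = ymd_hms(c("2024-01-01 10:00:00",
+                                              "2024-01-02 11:00:00",
+                                              "2024-01-09 12:00:00")),
+                      author_email = c("a@example.org", "b@example.org", "a@example.org"))
+    toy |>
+      mutate(week = floor_date(commit_date, "week")) |>
+      group_by(week) |>
+      summarize(author_emails = list(unique(author_email)), .groups = 'drop')
+    # -> one row per week: the week of 2023-12-31 holds both emails,
+    #    the week of 2024-01-07 holds just "a@example.org"
+  }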
+
+  #now cut out the commit data outside the 6-month window
+  df <- df |>
+    filter(as.Date(commit_date) >= start_date & as.Date(commit_date) <= end_date)
+
+  #in order:
+  # - we group by project, week, and the two age vars
+  # - we summarize commit counts and authorship details
+  # - we fill in rows for weeks with no commits
+  # - we add a before/after indicator
+  # - and a weekly index
   weekly_commits <- df |>
-    mutate(week = floor_date(commit_date, "week")) |>
-    group_by(week, project_name) |>
-    summarise(commit_count = n(), .groups = 'drop')
+    group_by(project_id, week, age, age_at_commit) |>
+    summarise(commit_count = n(),
+              author_emails = list(unique(author_email)),
+              committer_emails = list(unique(committer_email)),
+              .groups = 'drop') |>
+    right_join(complete_weeks_df, by=c("week", "project_id", "age", "age_at_commit")) |>
+    replace_na(list(commit_count = 0)) |>
+    mutate(before_after = if_else(week < introduction_week, 0, 1)) |>
+    mutate(week_index = as.integer(difftime(week,
+                                            introduction_week,
+                                            units = "weeks")))
+  # then, to fold the authorship details in:
+  # - if the email data is missing for a week, fill in an empty list
+  # - join in the per-week author/committer lists built above
+  # - compare each week's list against the previous week's (or empty, for the first week);
+  #   the length of that set difference is the 'new' count
+  # - then drop the list columns
+  weekly_with_authorship <- weekly_commits |>
+    mutate(
+      author_emails = ifelse(is.na(author_emails), list(character()), author_emails),
+      committer_emails = ifelse(is.na(committer_emails), list(character()), committer_emails)
+    ) |>
+    left_join(cumulative_authors, by = "week") |>
+    left_join(cumulative_committers, by = "week") |>
+    mutate(new_author_emails = mapply(function(x, y) length(setdiff(x, y)), author_emails, lag(cumulative_author_emails, default = list(character(0)))),
+           new_committer_emails = mapply(function(x, y) length(setdiff(x, y)), committer_emails, lag(cumulative_committer_emails, default = list(character(0))))) |>
+    select(-author_emails, -committer_emails, -cumulative_author_emails, -cumulative_committer_emails)
 
-  #prepare to save the new, transformed file
-  count_path <- file.path(dir_path, "weekly_counts")
-  count_file_name <- paste0("weeklycount_", file_name)
-  output_file_path <- file.path(count_path, count_file_name)
-  #save and gracefully exit
-  write.csv(weekly_commits, output_file_path, row.names = FALSE)
-  return(weekly_commits)
+  #gracefully exit
+  return(weekly_with_authorship)
 }
 
 #then for all files in a directory
-transform_directory_of_commit_data <- function(dir_path) {
+transform_directory_of_commit_data <- function(is_readme) {
+  ref_df <- read.csv(contributing_pub_info)
+  dir_path <- contributing_dir
+  if (is_readme){
+    ref_df <- read.csv(readme_pub_info)
+    dir_path <- readme_dir
+  }
+  counted_list <- list()
   file_list <- list.files(path = dir_path, pattern = "*.csv", full.names = TRUE)
   for (filepath in file_list) {
-    transform_commit_data(filepath)
+    transformed_data <- transform_commit_data(filepath, ref_df)
+    counted_list <- append(counted_list, list(transformed_data))
   }
+  counted_df <- bind_rows(counted_list)
+
+  return(counted_df)
 }
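+
+# a minimal sketch of the new-contributor arithmetic used above, guarded with
+# if (FALSE) so it never runs; the emails are made up
+if (FALSE) {
+  prev_week <- c("a@example.org")
+  this_week <- c("a@example.org", "b@example.org")
+  length(setdiff(this_week, prev_week)) # 1: one email not seen the week before
+}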
 
-transform_directory_of_commit_data(test_dir)
+#below is for the CONTRIBUTING files
+test_big_df <- transform_directory_of_commit_data(is_readme=FALSE)
+output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/013125_weekly_count_CONTRIBUTING.csv"
+#below is for the README files
+#test_big_df <- transform_directory_of_commit_data(is_readme=TRUE)
+#output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/govdoc-cr-data/13125_hyak_test/013125_weekly_count_README.csv"
+
+#validation testing
+#length(unique(test_big_df$project_id))
+#filtered_df <- test_big_df %>%
+#  filter(commit_count != 0, new_author_emails == 0, new_committer_emails == 0)
+
+#save and gracefully exit
+#write.csv(test_big_df, output_filepath, row.names = FALSE)
\ No newline at end of file
diff --git a/rstudio-server.job b/rstudio-server.job
index 3d9f565..3a9c9ac 100644
--- a/rstudio-server.job
+++ b/rstudio-server.job
@@ -3,7 +3,7 @@
 #SBATCH --job-name=mg-govdoc-cr
 #SBATCH --partition=cpu-g2-mem2x
 #update this line - use hyakalloc to find partitions you can use
-#SBATCH --time=03:00:00
+#SBATCH --time=04:00:00
 #SBATCH --nodes=1
 #SBATCH --ntasks=4
 #SBATCH --mem=64G
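+# (illustrative note, not part of the original job file: after a run completes,
+# requested vs. actual wall time can be compared to check whether the 4-hour
+# limit is enough; the job id below is hypothetical)
+# sacct -j 12345678 --format=JobName,Elapsed,Timelimit,State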