library(tidyverse)
# ymd_hms(), months() and %m-% / %m+% come from lubridate. tidyverse only
# attaches lubridate from version 2.0.0 onwards, so load it explicitly.
library(lubridate)

gerrit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0214_ve_gerrit_test.csv"
# NOTE(review): input_df is not used anywhere below in this script;
# transform_gerrit_data() re-reads the file from its path argument.
input_df <- read.csv(gerrit_fp, header = TRUE)

#' Aggregate Gerrit changes into weekly counts relative to an event date
#'
#' Reads a CSV of Gerrit changes, keeps the requested projects, assigns each
#' change to a week relative to `event_date` (based on its `created`
#' timestamp) and summarises per project / week / status. Every
#' project x week x status combination inside the window gets a row, with
#' missing combinations filled in as zero-count rows. Changes whose week falls
#' outside the window are dropped by the join against that grid.
#'
#' @param data_fp Path to a CSV file with at least the columns `project`,
#'   `status`, `created`, `updated`, `submitted`, `insertions`, `deletions`.
#' @param projects Character vector of project names to keep.
#' @param event_date Reference date for week 0 (a `Date`, or anything
#'   `as.Date()` accepts).
#' @param window_months Number of months before and after `event_date` to
#'   cover. The start of the window is clipped to the earliest `created` date
#'   in the filtered data.
#' @return A data frame with columns `project`, `relative_week`, `status`,
#'   `task_count`, `avg_resolution_time` (mean days between `created` and
#'   `updated`), `avg_insertions` and `avg_deletions`. For filler rows
#'   `task_count` and `avg_resolution_time` are 0; `avg_insertions` and
#'   `avg_deletions` stay `NA`.
transform_gerrit_data <- function(data_fp, projects,
                                  event_date = as.Date("2013-07-01"),
                                  window_months = 24) {
  event_date <- as.Date(event_date)

  # loading in
  df <- read.csv(data_fp, header = TRUE)

  # some initial pre-processing
  df <- df |>
    filter(project %in% projects) |>
    mutate(
      created = ymd_hms(created),
      updated = ymd_hms(updated),
      submitted = ymd_hms(submitted)
    )

  # Without at least one usable `created` value the window start cannot be
  # computed; fail with a clear message instead of an opaque seq() error.
  if (nrow(df) == 0L || all(is.na(df$created))) {
    stop(
      "No rows with a parseable `created` timestamp for the requested projects.",
      call. = FALSE
    )
  }

  # na.rm = TRUE: a single unparseable timestamp must not turn the whole
  # window start into NA.
  oldest_created_date <- min(as.Date(df$created), na.rm = TRUE)
  calculated_start_date <- event_date %m-% months(window_months)
  start_date <- max(calculated_start_date, oldest_created_date)
  end_date <- event_date %m+% months(window_months)

  # Whole weeks between `date` and `ref_date`, floored (negative before the
  # reference date).
  weeks_from_event <- function(date, ref_date) {
    as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
  }

  # going off of the created date
  df <- df |>
    mutate(
      relative_week = weeks_from_event(created, event_date),
      create_update_delta = as.numeric(
        difftime(updated, created, units = "days")
      )
    )

  # creating filler zeros
  all_weeks <- seq(
    weeks_from_event(start_date, event_date),
    weeks_from_event(end_date, event_date)
  )
  # stringsAsFactors = FALSE keeps the join keys the same type as in `df`
  # (expand.grid() converts character vectors to factors by default).
  complete_weeks_df <- expand.grid(
    project = unique(df$project),
    relative_week = all_weeks,
    status = unique(df$status),
    stringsAsFactors = FALSE
  )

  df |>
    group_by(project, relative_week, status) |>
    summarise(
      task_count = n(),
      avg_resolution_time = mean(create_update_delta, na.rm = TRUE),
      avg_insertions = mean(insertions, na.rm = TRUE),
      avg_deletions = mean(deletions, na.rm = TRUE),
      .groups = "drop"
    ) |>
    # right_join keeps every grid row and drops weeks outside the window
    right_join(
      complete_weeks_df,
      by = c("project", "relative_week", "status")
    ) |>
    replace_na(list(task_count = 0, avg_resolution_time = 0))
}

transformed_data <- transform_gerrit_data(
  gerrit_fp,
  c(
    "VisualEditor/VisualEditor",
    "mediawiki/extensions",
    "mediawiki/core",
    "mediawiki/extensions/VisualEditor"
  )
)

output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"
write.csv(transformed_data, output_filepath, row.names = FALSE)