1
0
mw-lifecycle-analysis/gerrit_analysis/make_gerrit_count_data.R
2025-02-16 14:08:16 -08:00

65 lines
2.4 KiB
R

library(tidyverse)
# Path to the Gerrit CSV export that this script reads.
gerrit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0214_ve_gerrit_test.csv"

# Raw, unfiltered copy of the export. transform_gerrit_data() re-reads the
# file itself, so this object is independent of the transformed output.
input_df <- read.csv(file = gerrit_fp, header = TRUE)
#' Aggregate Gerrit changes into weekly counts relative to an event date
#'
#' Reads a Gerrit CSV export, keeps the requested projects, assigns each
#' change to a week relative to `event_date` (based on its `created`
#' timestamp) and summarises per project / week / status. Every
#' project-week-status combination inside the observation window is present
#' in the result; combinations without any change are filled with zeros.
#'
#' The observation window runs from `window_months` before `event_date` (or
#' the oldest `created` date in the data, whichever is later) to
#' `window_months` after `event_date`. Changes created outside that window
#' are dropped by the join against the complete week grid.
#'
#' @param data_fp Path to a CSV file with (at least) the columns `project`,
#'   `created`, `updated`, `submitted`, `status`, `insertions` and
#'   `deletions`.
#' @param projects Character vector of project names to keep.
#' @param event_date Reference date for week 0; anything `as.Date()` accepts.
#' @param window_months Number of months to cover on each side of
#'   `event_date`.
#' @return A tibble with one row per project, relative week and status, and
#'   the columns `task_count`, `avg_resolution_time` (mean days between
#'   `created` and `updated`), `avg_insertions` and `avg_deletions`.
#'   `task_count` and `avg_resolution_time` are 0 for filler rows;
#'   `avg_insertions` and `avg_deletions` stay `NA` there.
transform_gerrit_data <- function(data_fp, projects,
                                  event_date = as.Date("2013-07-01"),
                                  window_months = 24) {
  event_date <- as.Date(event_date)

  # Whole 7-day steps from `ref_date`; floor division places dates before
  # the reference in negative weeks (-1 is the week right before it).
  relative_week <- function(date, ref_date) {
    as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
  }

  # loading in + initial pre-processing
  df <- read.csv(data_fp, header = TRUE) |>
    filter(project %in% projects) |>
    mutate(
      created = ymd_hms(created),
      updated = ymd_hms(updated),
      submitted = ymd_hms(submitted)
    )

  # Without this guard an empty selection (or one where no timestamp parses)
  # only fails later, inside seq(), with an unrelated-looking message.
  created_dates <- as.Date(df$created)
  if (all(is.na(created_dates))) {
    stop(
      "No rows with a parseable `created` timestamp for the requested ",
      "projects in ", data_fp,
      call. = FALSE
    )
  }

  # na.rm = TRUE: a single unparseable `created` value must not turn the
  # whole window into NA.
  oldest_created_date <- min(created_dates, na.rm = TRUE)
  calculated_start_date <- event_date %m-% months(window_months)
  start_date <- max(calculated_start_date, oldest_created_date)
  end_date <- event_date %m+% months(window_months)

  # going off of the created date
  df <- df |>
    mutate(
      relative_week = relative_week(created, event_date),
      create_update_delta = as.numeric(
        difftime(updated, created, units = "days")
      )
    )

  # creating filler zeros: full project x week x status grid for the window.
  # stringsAsFactors = FALSE keeps the join keys the same type as in `df`.
  complete_weeks_df <- expand.grid(
    project = unique(df$project),
    relative_week = seq(
      relative_week(start_date, event_date),
      relative_week(end_date, event_date)
    ),
    status = unique(df$status),
    stringsAsFactors = FALSE
  )

  df |>
    group_by(project, relative_week, status) |>
    summarise(
      task_count = n(),
      avg_resolution_time = mean(create_update_delta, na.rm = TRUE),
      avg_insertions = mean(insertions, na.rm = TRUE),
      avg_deletions = mean(deletions, na.rm = TRUE),
      .groups = "drop"
    ) |>
    # right_join keeps every grid row and drops weeks outside the window
    right_join(
      complete_weeks_df,
      by = c("project", "relative_week", "status")
    ) |>
    # NOTE(review): a 0 resolution time for empty weeks is indistinguishable
    # from a genuine zero; kept for compatibility with existing outputs.
    replace_na(list(task_count = 0, avg_resolution_time = 0))
}
# Weekly per-project, per-status counts for the selected repositories.
transformed_data <- transform_gerrit_data(
  data_fp = gerrit_fp,
  projects = c(
    "VisualEditor/VisualEditor",
    "mediawiki/extensions",
    "mediawiki/core",
    "mediawiki/extensions/VisualEditor"
  )
)

# Persist the result as CSV, without the row-name column.
output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"
write.csv(x = transformed_data, file = output_filepath, row.names = FALSE)