From 7f8b885ef4c2b32c02af283f4f658eb61f0d784f Mon Sep 17 00:00:00 2001
From: Matthew Gaughan
Date: Sun, 16 Feb 2025 14:08:16 -0800
Subject: [PATCH] initial gerrit analysis for ve

---
Review notes (free text between "---" and the first "diff --git" is not part
of the commit):

Fixed on added lines (hunk line counts are unchanged):
- bot_activity_exploration.R: p1_5 plotted `enwiki_summary_df`, which this
  hunk never defines; the summary is assigned to `wiki_summary_df`.
- bot_activity_exploration.R: dropped the leftover "# Points on the line"
  comment from the geom_line() line (geom_point() was removed).
- make_gerrit_count_data.R: min() over the created dates now uses
  na.rm = TRUE, so an NA `created` value no longer turns start_date into NA
  and breaks the seq() that builds the week grid.
- plotting_gerrit.R: axis label "AbandonedGerrit ..." -> "Abandoned Gerrit ...".

Left as-is (changing them would alter context lines or hunk sizes):
- p1_5 keeps the title "Median Bot Actions" although it now plots sum_count.
- commit_count_collation.R still carries the comment "6m before and 6m after"
  while the window is now 12 months on each side.
- make_gerrit_count_data.R reads gerrit_fp into a top-level `input_df` that
  nothing else in that file uses; transform_gerrit_data() reads the CSV again.
- The deleted .out file contains an RStudio Server password in plain text.
  Deleting the file does not remove it from history; treat it as exposed.

Formatting caveat: indentation and blank context lines were reconstructed
from a whitespace-collapsed copy of this patch, and the blob hashes on the
"index" lines were not recomputed after the fixes above.

 .../bot_activity_exploration.R           | 21 +++---
 commit_analysis/commit_count_collation.R |  4 +-
 gerrit_analysis/make_gerrit_count_data.R | 65 +++++++++++++++++++
 gerrit_analysis/plotting_gerrit.R        | 61 +++++++++++++++++
 mgaughan-rstudio-server_24263466.out     | 18 -----
 5 files changed, 139 insertions(+), 30 deletions(-)
 create mode 100644 gerrit_analysis/make_gerrit_count_data.R
 create mode 100644 gerrit_analysis/plotting_gerrit.R
 delete mode 100644 mgaughan-rstudio-server_24263466.out

diff --git a/bot_activity_analysis/bot_activity_exploration.R b/bot_activity_analysis/bot_activity_exploration.R
index 561f6d8..aff7b3b 100644
--- a/bot_activity_analysis/bot_activity_exploration.R
+++ b/bot_activity_analysis/bot_activity_exploration.R
@@ -59,24 +59,25 @@ count_mean <- mean(combined_df$count) #628.119
 count_var_to_mean <- count_var / count_mean #2262140.471
 
 # the mean count values for each day
-summary_df <- combined_df |>
+wiki_summary_df <- combined_df |>
+  filter(date >= as.Date("2013-01-01") & date <= as.Date("2013-12-31")) |>
+  filter(wiki_db == "enwiki") |>
   group_by(date) |>
   summarize(
-    mean_count = mean(count),
+    sum_count = sum(count),
     median_count = median(count)
   )
 #plotting it
-p1 <- ggplot(summary_df, aes(x = date, y = median_count)) +
-  geom_line(color = "blue") + # Line plot
-  geom_point(color = "red") + # Points on the line
+p1 <- ggplot(wiki_summary_df, aes(x = date, y = sum_count)) +
+  geom_line(color = "blue") + # Line plot
   geom_vline(xintercept = as.Date("2013-07-01"), linetype = "dashed", color = "black") +
-  labs(title = "Median Bot Actions",
-       x = "Date",
-       y = "Median Count") +
+  labs(title = "enwiki Total Bot Actions",
+       x = "Date (daily)",
+       y = "Action Count") +
   theme_minimal()
 p1
-p1_5 <- ggplot(summary_df, aes(x = date)) +
-  geom_smooth(aes(y = median_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) +
+p1_5 <- ggplot(wiki_summary_df, aes(x = date)) +
+  geom_smooth(aes(y = sum_count), method = "loess", color = "red", fill = "red", alpha = 0.2, se = TRUE) +
   #geom_point(color = "red") + # Points on the line
   labs(title = "Median Bot Actions",
        x = "Date",
diff --git a/commit_analysis/commit_count_collation.R b/commit_analysis/commit_count_collation.R
index b3f049c..177d226 100644
--- a/commit_analysis/commit_count_collation.R
+++ b/commit_analysis/commit_count_collation.R
@@ -36,9 +36,9 @@ transform_commit_data <- function(filepath){
 
   #we are looking at weekly data, 6m before and 6m after
   #start_date <- event_date %m-% months(6)
-  calculated_start_date <- event_date %m-% months(24)
+  calculated_start_date <- event_date %m-% months(12)
   start_date <- max(calculated_start_date, oldest_commit_date)
-  end_date <- event_date %m+% months(24)
+  end_date <- event_date %m+% months(12)
 
   #getting the relative weeks to the publication date
   relative_week <- function(date, ref_date) {
diff --git a/gerrit_analysis/make_gerrit_count_data.R b/gerrit_analysis/make_gerrit_count_data.R
new file mode 100644
index 0000000..19e1c46
--- /dev/null
+++ b/gerrit_analysis/make_gerrit_count_data.R
@@ -0,0 +1,65 @@
+library(tidyverse)
+
+gerrit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/0214_ve_gerrit_test.csv"
+input_df <- read.csv(gerrit_fp, header = TRUE)
+
+transform_gerrit_data <- function(data_fp, projects){
+  #loading in
+  df <- read.csv(data_fp, header = TRUE)
+  #some initial pre-processing
+  df <- df |>
+    filter(project %in% projects) |>
+    mutate(created = ymd_hms(created)) |>
+    mutate(updated = ymd_hms(updated)) |>
+    mutate(submitted = ymd_hms(submitted))
+
+  event_date <- as.Date("2013-07-01")
+
+  oldest_created_date <- min(as.Date(df$created), na.rm = TRUE)
+  calculated_start_date <- event_date %m-% months(24)
+  start_date <- max(calculated_start_date, oldest_created_date)
+  end_date <- event_date %m+% months(24)
+
+  relative_week <- function(date, ref_date) {
+    as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
+  }
+
+  #going off of the created date
+  df <- df |>
+    mutate(relative_week = relative_week(created, event_date)) |>
+    mutate(create_update_delta = as.numeric(difftime(updated, created, units="days")))
+
+
+  #creating filler zeros
+  unique_projects <- unique(df$project)
+  unique_statuses <- unique(df$status)
+  all_weeks <- seq(relative_week(start_date, event_date), relative_week(end_date, event_date))
+  complete_weeks_df <- expand.grid(
+    project = unique_projects,
+    relative_week = all_weeks,
+    status = unique_statuses
+  )
+
+  weekly_commits <- df |>
+    group_by(project, relative_week, status) |>
+    summarise(task_count = n(),
+              avg_resolution_time = mean(create_update_delta, na.rm = TRUE),
+              avg_insertions = mean(insertions, na.rm=TRUE),
+              avg_deletions = mean(deletions, na.rm=TRUE),
+              .groups = 'drop') |>
+    right_join(complete_weeks_df,
+               by=c("project", "relative_week", "status")) |>
+    replace_na(list(task_count = 0)) |>
+    replace_na(list(avg_resolution_time = 0 ))
+
+  return(weekly_commits)
+}
+
+transformed_data <- transform_gerrit_data(gerrit_fp, c("VisualEditor/VisualEditor",
+                                                       "mediawiki/extensions",
+                                                       "mediawiki/core",
+                                                       "mediawiki/extensions/VisualEditor"))
+
+output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"
+
+write.csv(transformed_data, output_filepath, row.names = FALSE)
\ No newline at end of file
diff --git a/gerrit_analysis/plotting_gerrit.R b/gerrit_analysis/plotting_gerrit.R
new file mode 100644
index 0000000..7322f5e
--- /dev/null
+++ b/gerrit_analysis/plotting_gerrit.R
@@ -0,0 +1,61 @@
+library(tidyverse)
+count_data_fp <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0216_ve_gerrit_count.csv"
+input_df <- read.csv(count_data_fp, header = TRUE)
+
+
+#window_num <- 19
+window_num <- 52
+input_df <- input_df |>
+  filter(relative_week >= (- window_num) & relative_week <= (window_num)) |>
+  mutate(parent_projects = if_else(project == "mediawiki/extensions/VisualEditor",
+                                   "mediawiki/extensions", project))
+
+library(scales)
+library(ggplot2)
+
+time_plot <- input_df |>
+  ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) +
+  labs(x="Weekly Offset", y="New Gerrit Tasks Created", color = "Project") +
+  geom_smooth() +
+  geom_vline(xintercept = 0)+
+  theme_bw() +
+  theme(legend.position = "top")
+time_plot
+
+
+abandoned_df <- input_df |>
+  filter(status == "ABANDONED")
+time_plot <- abandoned_df |>
+  ggplot(aes(x=relative_week, y=task_count, color=parent_projects)) +
+  labs(x="Weekly Offset", y="Abandoned Gerrit Tasks Created", color = "Project") +
+  geom_line() +
+  geom_vline(xintercept = 0)+
+  theme_bw() +
+  theme(legend.position = "top")
+time_plot
+
+
+delta_df <- input_df |>
+  filter(task_count != 0) |>
+  filter(relative_week >= (- 12))
+time_plot <- delta_df |>
+  ggplot(aes(x=relative_week, y=avg_resolution_time, color=parent_projects)) +
+  labs(x="Weekly Offset", y="Avg. (weekly) Time from task creation to last update (days)", color = "Project") +
+  geom_line() +
+  geom_vline(xintercept = 0)+
+  theme_bw() +
+  theme(legend.position = "top")
+time_plot
+
+
+loc_df <- input_df |>
+  filter(task_count != 0) |>
+  filter(status != "ABANDONED")
+time_plot <- loc_df |>
+  ggplot(aes(x=relative_week, y=avg_deletions, color=parent_projects)) +
+  labs(x="Weekly Offset", y="Avg. LOC Deleted per Accepted Gerrit Task", color = "Project") +
+  geom_line() +
+  geom_vline(xintercept = 0)+
+  theme_bw() +
+  theme(legend.position = "top")
+time_plot
diff --git a/mgaughan-rstudio-server_24263466.out b/mgaughan-rstudio-server_24263466.out
deleted file mode 100644
index d0cf93f..0000000
--- a/mgaughan-rstudio-server_24263466.out
+++ /dev/null
@@ -1,18 +0,0 @@
-1. SSH tunnel from your workstation using the following command:
-
-   ssh -N -L 8787:n3439:56403 mjilg@klone.hyak.uw.edu
-
-   and point your web browser to http://localhost:8787
-
-2. log in to RStudio Server using the following credentials:
-
-   user: mjilg
-   password: 8aREIl1dg0jceqBTNl9a
-
-When done using RStudio Server, terminate the job by:
-
-1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window)
-2. Issue the following command on the login node:
-
-   scancel -f 24263466
-slurmstepd: error: *** JOB 24263466 ON n3439 CANCELLED AT 2025-02-15T16:52:54 ***