From 7ce420ce20fbea712696854849a8a46c50c7d656 Mon Sep 17 00:00:00 2001 From: Matthew Gaughan Date: Fri, 14 Mar 2025 14:29:08 -0700 Subject: [PATCH] updating to count the commits to bot framework libraries --- .sh_history | 36 +++++++ .../0314-ve-core-testing-new-commits.png | Bin .../case1/0314-ve-ve-testing-new-commits.png | Bin commit_analysis/framework_commit_collation.R | 90 ++++++++++++++++++ mgaughan-rstudio-server_24842187.out | 18 ++++ 5 files changed, 144 insertions(+) rename 0314-ve-core-testing-new-commits.png => commit_analysis/case1/0314-ve-core-testing-new-commits.png (100%) rename 0314-ve-ve-testing-new-commits.png => commit_analysis/case1/0314-ve-ve-testing-new-commits.png (100%) create mode 100644 commit_analysis/framework_commit_collation.R create mode 100644 mgaughan-rstudio-server_24842187.out diff --git a/.sh_history b/.sh_history index 8d2b0f5..787e8e6 100644 --- a/.sh_history +++ b/.sh_history @@ -29,3 +29,39 @@ l ls rm en-testing_0312_mediawiki_core_weekly_commit_count_data.csv ls +rm wide-testing_0217_extensions_ve_weekly_commit_count_data.csv +rm wide-testing_0217_mediawiki_core_weekly_commit_count_data.csv +rm event_0217_mediawiki_core_weekly_commit_count_data.csv +rm event_0217_extensions_ve_weekly_commit_count_data.csv +rm en-testing_0217_extensions_ve_weekly_commit_count_data.csv +rm en-testing_0217_mediawiki_core_weekly_commit_count_data.csv +ls +ls .. +ls ../commit_data +ls ../commit_data/bot_frameworks +ls +cd .. +ls +rm -r commit_data/bot_frameworks +ls +cd commit_data +ls +cd .. +ls +rm -r -f commit_data +ls +cd commit_data +ls +cd bot_frameworks +ls +pwd +mv pywikibot_2010-01-01_to_2024-12-31.csv pywikibot_commits.csv +ls +cd .. +ls +cd .. +ls +cd case1 +ls +mv event_0314_bot_frameworks_weekly_commit_count_data.csv en-testing_0314_bot_frameworks_weekly_commit_count_data.csv +ls diff --git a/0314-ve-core-testing-new-commits.png b/commit_analysis/case1/0314-ve-core-testing-new-commits.png similarity index 100% rename from 0314-ve-core-testing-new-commits.png rename to commit_analysis/case1/0314-ve-core-testing-new-commits.png diff --git a/0314-ve-ve-testing-new-commits.png b/commit_analysis/case1/0314-ve-ve-testing-new-commits.png similarity index 100% rename from 0314-ve-ve-testing-new-commits.png rename to commit_analysis/case1/0314-ve-ve-testing-new-commits.png diff --git a/commit_analysis/framework_commit_collation.R b/commit_analysis/framework_commit_collation.R new file mode 100644 index 0000000..18382a7 --- /dev/null +++ b/commit_analysis/framework_commit_collation.R @@ -0,0 +1,90 @@ +library(tidyverse) +library(purrr) +library(readr) +library(stringr) +library(lubridate) +library(tidyr) + +data_dir = "/gscratch/comdata/users/mjilg/mw-repo-lifecycles/commit_data/bot_frameworks" + +csv_files <- list.files(data_dir, pattern = "*.csv", full.names = TRUE) + +read_and_label <- function(file) { + project_name <- basename(file) %>% + stringr::str_remove("_commits.csv") + read_csv(file) %>% + mutate(project = project_name) +} + +all_data <- csv_files %>% + map_df(read_and_label) + +# TODO: this is project/event specific +event_date <- as.Date("2013-07-01") +#event_date <- as.Date("2013-04-25") +#event_date <- as.Date("2012-12-11") + +df <- all_data |> + mutate(commit_date = ymd_hms(commit_date)) + +df <- df %>% + group_by(project) %>% + mutate(oldest_commit_date = min(as.Date(commit_date))) %>% + ungroup() %>% + mutate(age = as.numeric(as.Date("2025-02-10") - oldest_commit_date)) + +filtered_df <- df %>% + group_by(project) %>% + filter(min(as.Date(commit_date)) <= event_date) %>% + ungroup() + +calculated_start_date <- event_date %m-% months(12) +start_date <- max(calculated_start_date, df$oldest_commit_date) +end_date <- event_date %m+% months(12) + +#getting the relative weeks to the publication date +relative_week <- function(date, ref_date) { + as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7) +} + +filtered_df <- filtered_df |> + mutate(relative_week = relative_week(commit_date, event_date)) |> + arrange(relative_week) |> + group_by(author_email) |> + mutate(new_author = ifelse(row_number() <= 5, 1, 0), + new_author_wmf = if_else(grepl("@wikimedia", author_email), new_author, 0), + new_author_unaff = if_else(!grepl("@wikimedia", author_email), new_author, 0)) |> + ungroup() + + +weekly_commits <- filtered_df |> + group_by(project, relative_week, age) |> + summarise(commit_count = n(), + author_emails = list(unique(author_email)), + committer_emails = list(unique(committer_email)), + mediawiki_dev_commit_count = sum(grepl("@users.mediawiki.org", author_email)), + wikimedia_commit_count = sum(grepl("@wikimedia", author_email)), + wikia_commit_count = sum(grepl("@wikia-inc.com", author_email)), + bot_commit_count = sum(grepl("l10n-bot@translatewiki.net|tools.libraryupgrader@tools.wmflabs.org", author_email)), + wmf_ft_commit_count = sum(new_author_wmf), + unaff_ft_commit_count = sum(new_author_unaff), + .groups = 'drop') |> + replace_na(list(commit_count = 0)) |> + replace_na(list(wikimedia_commit_count = 0)) |> + replace_na(list(l10n_commit_count = 0)) |> + replace_na(list(jenkins_commit_count = 0)) |> + replace_na(list(mediawiki_dev_commit_count = 0)) |> + replace_na(list(wikia_commit_count = 0)) |> + replace_na(list(wmf_ft_commit_count = 0)) |> + replace_na(list(unaff_ft_commit_count = 0)) |> + mutate(before_after = if_else(relative_week < 0, 0, 1)) |> + select(-author_emails, -committer_emails) + + +weekly_commits <- weekly_commits |> + filter(relative_week >= (-52) & relative_week <= 52 ) + +weekly_commits + +output_filepath <-"/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/case1/event_0314_bot_frameworks_weekly_commit_count_data.csv" +write.csv(weekly_commits, output_filepath, row.names = FALSE) \ No newline at end of file diff --git a/mgaughan-rstudio-server_24842187.out b/mgaughan-rstudio-server_24842187.out new file mode 100644 index 0000000..7396df6 --- /dev/null +++ b/mgaughan-rstudio-server_24842187.out @@ -0,0 +1,18 @@ +1. SSH tunnel from your workstation using the following command: + + ssh -N -L 8787:n3439:32903 mjilg@klone.hyak.uw.edu + + and point your web browser to http://localhost:8787 + +2. log in to RStudio Server using the following credentials: + + user: mjilg + password: bkiSrTlWE0y9QQnCxd2p + +When done using RStudio Server, terminate the job by: + +1. Exit the RStudio Session ("power" button in the top right corner of the RStudio window) +2. Issue the following command on the login node: + + scancel -f 24842187 +slurmstepd: error: *** JOB 24842187 ON n3439 CANCELLED AT 2025-03-14T14:28:50 ***