# Reconstructed from a git diff whose line breaks had been collapsed.
# Non-R parts of that diff, kept here for reference only:
#   - .RData: binary file changed (index 8a3c7be..0ccd43c)
#   - .sh_history: new file (index 0000000..4a6ea5d) containing
#       cd /mmfs1/gscratch/comdata/users/mjilg
#       ls
#       cd mw-repo-lifecycles
#       ls
#       cd 0205_convo_data
#       ls
#       cd ..
#       ls
#       ls commit_data
#       ls commit_data/visualeditor

# ---- commit_analysis/commit_count_collation.R --------------------------------

library(tidyverse)
library(dplyr)
library(lubridate)

ve_commit_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/commit_data/visualeditor/VisualEditor_2012-01-01_to_2014-12-31.csv"

# Whole weeks between `date` and `ref_date` (floor division), so week 0 starts
# at `ref_date` and every earlier week is negative.
weeks_from_event <- function(date, ref_date) {
  as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
}

# Number of elements of `x` containing the literal string `pattern`.
# `fixed = TRUE` keeps the dots in the email domains from acting as regex
# wildcards.
count_matching <- function(x, pattern) {
  sum(grepl(pattern, x, fixed = TRUE))
}

# For every distinct value of `email_col`, the earliest relative week in which
# it appears in `commits`; the result column is named `out_col`.
first_week_seen <- function(commits, email_col, out_col) {
  commits |>
    group_by(.data[[email_col]]) |>
    summarise("{out_col}" := min(relative_week), .groups = "drop")
}

#' Collapse a commit log into weekly counts around an event date
#'
#' Reads a commit CSV (columns used: `commit_date`, `author_email`,
#' `committer_email`) and returns one row per week in the window
#' `event_date` +/- `window_months`, clipped at the oldest commit. Weeks
#' without commits are present with zero counts.
#'
#' A contributor counts as "new" in the first week they appear anywhere in the
#' file, including weeks before the window starts.
#'
#' @param filepath Path to the commit CSV. The project id is the part of the
#'   file name before the first underscore.
#' @param event_date Date that defines week 0.
#' @param reference_date Date treated as "the present" when computing `age`.
#' @param window_months Months kept on each side of `event_date`.
#' @return A tibble with `project_id`, `relative_week`, `age` (days between
#'   the oldest commit and `reference_date`), commit counts, `before_after`
#'   (0 before the event week, 1 from it onwards) and author/committer counts.
transform_commit_data <- function(filepath,
                                  event_date = as.Date("2013-07-01"),
                                  reference_date = as.Date("2025-02-10"),
                                  window_months = 24) {
  wmf_domain <- "@wikimedia.org"
  repo_id <- sub("_.*$", "", basename(filepath))

  commits <- read.csv(filepath, header = TRUE) |>
    mutate(commit_date = ymd_hms(commit_date))

  # An unparseable date would turn min() below into NA and break the window.
  unparsed <- is.na(commits$commit_date)
  if (any(unparsed)) {
    warning(
      sum(unparsed), " commit(s) with an unparseable commit_date were dropped",
      call. = FALSE
    )
    commits <- commits[!unparsed, , drop = FALSE]
  }
  if (nrow(commits) == 0) {
    stop("no commits with a valid commit_date in ", filepath, call. = FALSE)
  }

  oldest_commit_date <- min(as.Date(commits$commit_date))
  repo_age_days <- as.numeric(reference_date - oldest_commit_date)

  # Weekly data, `window_months` before and after the event, never starting
  # before the project's first commit.
  start_date <- max(event_date %m-% months(window_months), oldest_commit_date)
  end_date <- event_date %m+% months(window_months)

  commits <- commits |>
    mutate(relative_week = weeks_from_event(commit_date, event_date))

  # First appearance is computed on the FULL history, before the window is
  # cut, so people active before the window are not reported as new inside it.
  author_first <- first_week_seen(commits, "author_email", "author_first_week")
  committer_first <- first_week_seen(
    commits, "committer_email", "committer_first_week"
  )

  # Filter on the commit's own date; the previous comparison used the constant
  # event_date and therefore never removed anything.
  windowed <- commits |>
    filter(
      as.Date(commit_date) >= start_date,
      as.Date(commit_date) <= end_date
    ) |>
    left_join(author_first, by = "author_email") |>
    left_join(committer_first, by = "committer_email") |>
    mutate(
      author_is_new = relative_week == author_first_week,
      committer_is_new = relative_week == committer_first_week,
      author_is_wmf = grepl(wmf_domain, author_email, fixed = TRUE),
      committer_is_wmf = grepl(wmf_domain, committer_email, fixed = TRUE)
    )

  weekly_counts <- windowed |>
    group_by(relative_week) |>
    summarise(
      commit_count = n(),
      mediawiki_dev_commit_count =
        count_matching(author_email, "@users.mediawiki.org"),
      wikimedia_commit_count = count_matching(author_email, wmf_domain),
      l10n_commit_count =
        count_matching(author_email, "l10n-bot@translatewiki.net"),
      jenkins_commit_count =
        count_matching(author_email, "@gerrit.wikimedia.org"),
      new_author_emails = n_distinct(author_email[author_is_new]),
      new_committer_emails = n_distinct(committer_email[committer_is_new]),
      wikimedia_author_emails = n_distinct(author_email[author_is_wmf]),
      non_wikimedia_author_emails = n_distinct(author_email[!author_is_wmf]),
      wikimedia_committer_emails =
        n_distinct(committer_email[committer_is_wmf]),
      non_wikimedia_committer_emails =
        n_distinct(committer_email[!committer_is_wmf]),
      new_wikimedia_authors =
        n_distinct(author_email[author_is_new & author_is_wmf]),
      new_non_wikimedia_authors =
        n_distinct(author_email[author_is_new & !author_is_wmf]),
      new_wikimedia_committers =
        n_distinct(committer_email[committer_is_new & committer_is_wmf]),
      new_non_wikimedia_committers =
        n_distinct(committer_email[committer_is_new & !committer_is_wmf]),
      .groups = "drop"
    )

  # Every week of the window, in order, so weeks without commits still get a
  # row. A tibble avoids expand.grid() turning the project id into a factor.
  all_weeks <- tibble(
    relative_week = seq(
      weeks_from_event(start_date, event_date),
      weeks_from_event(end_date, event_date)
    )
  )

  all_weeks |>
    left_join(weekly_counts, by = "relative_week") |>
    # every joined column is an integer count; a missing week means zero
    mutate(across(-relative_week, \(x) replace_na(x, 0L))) |>
    mutate(
      project_id = repo_id,
      age = repo_age_days,
      before_after = if_else(relative_week < 0, 0, 1)
    ) |>
    select(
      project_id, relative_week, age, commit_count,
      mediawiki_dev_commit_count, wikimedia_commit_count,
      l10n_commit_count, jenkins_commit_count, before_after,
      new_author_emails, new_committer_emails,
      wikimedia_author_emails, non_wikimedia_author_emails,
      wikimedia_committer_emails, non_wikimedia_committer_emails,
      new_wikimedia_authors, new_non_wikimedia_authors,
      new_wikimedia_committers, new_non_wikimedia_committers
    )
}

transformed <- transform_commit_data(ve_commit_fp)
output_filepath <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/commit_data/visualeditor/0210_ve_weekly_count_data.csv"

write.csv(transformed, output_filepath, row.names = FALSE)

# ---- commit_analysis/mlm.R ---------------------------------------------------

# readr and dplyr are loaded here because this script calls read_csv() and
# filter() and is a separate file in the diff.
library(readr)
library(dplyr)
library(scales)
library(ggplot2)

count_data_fp <- "/mmfs1/gscratch/comdata/users/mjilg/mw-repo-lifecycles/commit_data/visualeditor/0210_ve_weekly_count_data.csv"
input_df <- read_csv(count_data_fp)

# Commits that did not come from the l10n bot or from jenkins.
input_df$nonbot_commit_count <- input_df$commit_count -
  input_df$l10n_commit_count -
  input_df$jenkins_commit_count

input_df <- input_df |>
  filter(relative_week < 79)

# Smoothed weekly WMF commit count, with a vertical line at the event week.
time_plot <- input_df |>
  ggplot(aes(x = relative_week, y = wikimedia_commit_count)) +
  labs(x = "Weekly Offset", y = "WMF Commit Count") +
  geom_smooth() +
  geom_vline(xintercept = 0) +
  theme_bw() +
  theme(legend.position = "top")
print(time_plot)