# mw-lifecycle-analysis/121325_work/ve-commit-comparison.R
# 2026-01-19 15:04:38 -08:00 | 119 lines | 4.4 KiB | R

library(tidyverse)
library(dplyr)
library(lubridate)
#' Whole-week offset of `date` relative to `ref_date`.
#'
#' The elapsed time is measured in days and floor-divided by 7, so values in
#' the seven days starting at `ref_date` map to 0 and earlier values map to
#' negative indices.
#'
#' @param date Date or date-time vector to index.
#' @param ref_date Reference date marking the start of week 0.
#' @return Integer vector of week indices.
relative_week <- function(date, ref_date) {
  elapsed_days <- as.numeric(difftime(date, ref_date, units = "days"))
  week_offset <- elapsed_days %/% 7
  as.integer(week_offset)
}
# ---- Load commit exports ----
# Each CSV is read with its header row and tagged with a short `repo` key so
# the two sources stay distinguishable once they are stacked.
core_csv <- "~/121325_work/121225_vd_data/core_2010-01-01_to_2024-12-31.csv"
core_df <- mutate(read.csv(core_csv, header = TRUE), repo = "core")

ve_csv <- "~/121325_work/121225_vd_data/extension_VisualEditor_2000-01-01_to_2016-12-31.csv"
ve_df <- mutate(read.csv(ve_csv, header = TRUE), repo = "ve")

# Row-wise stack of both repositories; rbind() needs matching columns.
joint_df <- rbind(core_df, ve_df)
# Most frequent VisualEditor commit author e-mails between 2012-11-11 and
# 2013-09-29 (both bounds inclusive), ordered from most to fewest commits.
# NOTE(review): the name says "four" but seven rows are kept -- confirm which
# one is intended.
# NOTE(review): ve_df$commit_date is still whatever read.csv() produced here
# (presumably character timestamps), so the comparisons depend on implicit
# coercion against the Date bounds -- confirm this is what is wanted.
top_four_emails <- ve_df |>
  filter(
    commit_date >= as.Date("2012-11-11"),
    commit_date <= as.Date("2013-09-29")
  ) |>
  group_by(author_email) |>
  summarise(email_count = n(), .groups = "drop") |>
  arrange(desc(email_count)) |>
  slice_head(n = 7)
# Addresses outside the wikimedia domains that the isAuthorWMF classification
# below nevertheless labels "TRUE".
known_affil_emails <- c(
  "krinkle@fastmail.com",
  "roan.kattouw@gmail.com",
  "trevorparscal@gmail.com",
  "krinklemail@gmail.com",
  "moriel@gmail.com"
)

# Author display names; currently only referenced by the disabled
# "ActiveEmails" rule kept in the comments below.
active_names <- c(
  "Timo Tijhof",
  "Krinkle",
  "Roan Kattouw",
  "Catrope",
  "Trevor Parscal",
  "Ed Sanders",
  "Moriel Schottlender",
  "Gabriel Wicke",
  "C. Scott Ananian"
)
# jforrester@wikimedia.org
# (author_name %in% active_names) ~ "ActiveEmails",
# ---- Parse timestamps and classify commit authors ----
# isAuthorWMF is a character label, not a logical. case_when() takes the first
# matching rule, so the order below is the precedence:
#   1. hand-listed affiliate addresses        -> "TRUE"
#   2. the translatewiki localisation bot     -> "localization"
#   3. @wikimedia.org / @wikimedia.de         -> "TRUE"
#   4. @gerrit.wikimedia.org                  -> "Gerrit"
#   5. everything else                        -> "FALSE"
# The pattern rules are case-insensitive; the %in% lookup is an exact match.
joint_df <- joint_df |>
  mutate(
    commit_date = ymd_hms(commit_date),
    isAuthorWMF = case_when(
      author_email %in% known_affil_emails ~ "TRUE",
      grepl(
        "l10n-bot@translatewiki\\.net", author_email,
        ignore.case = TRUE
      ) ~ "localization",
      grepl("@wikimedia\\.org", author_email, ignore.case = TRUE) ~ "TRUE",
      grepl("@wikimedia\\.de", author_email, ignore.case = TRUE) ~ "TRUE",
      grepl(
        "@gerrit\\.wikimedia\\.org", author_email,
        ignore.case = TRUE
      ) ~ "Gerrit",
      TRUE ~ "FALSE"
    )
  )
library(tidyr)
# ---- Authors who committed to both repositories ----
# Tally commits per (author_email, repo), spread the repos into `core` and
# `ve` columns (0 where an author has no commits in a repo), and keep the
# e-mails with at least one commit in each.
authors_in_both <- joint_df |>
  count(author_email, repo, name = "commit_count") |>
  pivot_wider(
    names_from = repo,
    values_from = commit_count,
    values_fill = 0
  ) |>
  filter(core > 0, ve > 0) |>
  pull(author_email)

# Commits restricted to those cross-repository authors.
joint_df_filtered <- joint_df |>
  filter(author_email %in% authors_in_both)
# ---- Weekly commit counts around the c1 event ----
# Week 0 begins on c1_event_date. Only weeks -33 through 13 (inclusive) are
# kept, then commits are counted per week, author label and repository.
c1_event_date <- as.Date("2013-07-01")
c1_weekly <- joint_df |>
  mutate(week_index = relative_week(commit_date, c1_event_date)) |>
  filter(week_index >= -33, week_index <= 13) |>
  group_by(week_index, isAuthorWMF, repo) |>
  summarise(count = n(), .groups = "drop") |>
  mutate(source = "c1")
# Plot input: weekly counts without the "Gerrit" and "localization" author
# labels, limited to the c1 source.
counts <- c1_weekly |>
  filter(
    isAuthorWMF != "Gerrit",
    isAuthorWMF != "localization",
    source == "c1"
  )
# Disabled alternative selection:
# counts <- c1_core_weekly |>
#   filter(isAuthorWMF == "ActiveEmails") |>
#   filter(source == 'c1')
# ---- Figure: weekly new commits by author affiliation, one row per repo ----

# Milestone captions. Each caption is its own row, so it is drawn exactly once
# and does not depend on `counts` containing a row for that week. The `repo`
# column pins the captions to the mediawiki/core facet.
# NOTE(review): captions sit slightly to the right of the lines at -29, -9 and
# 0, presumably so the text does not overlap the markers -- confirm. The line
# at -4 has no caption.
event_labels <- data.frame(
  repo = "core",
  week_index = c(-27, -8, 4),
  label_y = 75,
  label = c("Opt-in Testing", "Deployment Announcement", "Opt-out deployment")
)

commits_created <-
  ggplot(counts, aes(x = week_index, y = count, fill = isAuthorWMF)) +
  facet_grid(
    repo ~ .,
    labeller = labeller(
      repo = c(
        "ve" = "extensions/visualeditor",
        "core" = "mediawiki/core"
      )
    )
  ) +
  geom_col(position = position_dodge(width = 0.9), width = 0.8) +
  # Vertical markers use constant `xintercept` parameters rather than
  # aes() + data, so each line is drawn once per facet instead of once per
  # row of `counts`.
  geom_vline(
    xintercept = c(-29, -9),
    linetype = "dotted", color = "black", linewidth = 0.5
  ) +
  # "3313" is a custom dash pattern, distinct from the dotted/dashed markers.
  geom_vline(
    xintercept = -4,
    linetype = "3313", color = "black", linewidth = 0.5
  ) +
  # Week 0: the week starting on c1_event_date.
  geom_vline(
    xintercept = 0,
    linetype = "dashed", color = "black", linewidth = 0.5
  ) +
  geom_text(
    data = event_labels,
    aes(x = week_index, y = label_y, label = label),
    inherit.aes = FALSE,
    size = 3
  ) +
  theme_minimal() +
  # isAuthorWMF holds the character labels "FALSE" / "TRUE" at this point.
  scale_fill_viridis_d(
    breaks = c("FALSE", "TRUE"),
    labels = c("External Contributor", "WMF-affiliate")
  ) +
  labs(
    x = "Weeks from VisualEditor Deployment on Wikipedia",
    y = "Count of New Commits",
    fill = "Commit Author"
  ) +
  theme(legend.position = "top")
# Show the figure (auto-printed when run at top level), then write it to disk
# as an 8 x 4 inch PNG at 800 dpi.
commits_created
ggsave(
  "011925_ve_commits_created.png",
  commits_created,
  dpi = 800,
  width = 8,
  height = 4
)