# 1
# 0
# mw-lifecycle-analysis/121325_work/count_aggregation.R
#
# 145 lines
# 6.2 KiB
# R

library(tidyverse)
library(dplyr)
library(lubridate)
# Reference dates for the three events ("c1", "c2", "c3"). Week indices in the
# aggregations further down are measured relative to these dates.
c1_event_date <- as.Date("2013-07-01", format = "%Y-%m-%d")
c2_event_date <- as.Date("2013-08-28", format = "%Y-%m-%d")
c3_event_date <- as.Date("2015-07-02", format = "%Y-%m-%d")
#' Whole-week offset of `date` relative to `ref_date`.
#'
#' @param date Date or date-time vector to index.
#' @param ref_date Reference date that marks the start of week 0.
#' @return Integer vector: 0 for the seven days starting at `ref_date`,
#'   1 for the following seven days, -1 for the seven days before, and so on.
relative_week <- function(date, ref_date) {
  elapsed_days <- as.numeric(difftime(date, ref_date, units = "days"))
  # %/% rounds towards -Inf, so dates before ref_date get negative indices
  # instead of collapsing into week 0.
  week_offset <- elapsed_days %/% 7
  as.integer(week_offset)
}
# Addresses that the author classification below labels "TRUE" through an
# exact match, in addition to the wikimedia domain patterns it checks.
known_affil_emails <- c(
  "krinkle@fastmail.com",
  "roan.kattouw@gmail.com",
  "trevorparscal@gmail.com",
  "krinklemail@gmail.com",
  "moriel@gmail.com"
)
# Get weekly commit-count data for the repositories ----

# Aggregation window for each event: the reference date plus the first and
# last week index (both inclusive) to keep. The list names are the labels
# written to the `source` column of the output.
event_windows <- list(
  c1 = list(event_date = c1_event_date, first_week = -33, last_week = 13),
  c2 = list(event_date = c2_event_date, first_week = -104, last_week = 13),
  c3 = list(event_date = c3_event_date, first_week = -83, last_week = 13)
)

#' Label commit authors by e-mail address.
#'
#' Rules are checked in order and the first match wins.
#'
#' @param author_email Vector of author e-mail addresses.
#' @return Character vector with one of "TRUE", "localization", "Gerrit" or
#'   "FALSE" per address.
classify_author <- function(author_email) {
  case_when(
    author_email %in% known_affil_emails ~ "TRUE",
    grepl("l10n-bot@translatewiki\\.net", author_email, ignore.case = TRUE) ~
      "localization",
    grepl("@wikimedia\\.org", author_email, ignore.case = TRUE) ~ "TRUE",
    grepl("@wikimedia\\.de", author_email, ignore.case = TRUE) ~ "TRUE",
    grepl("@gerrit\\.wikimedia\\.org", author_email, ignore.case = TRUE) ~
      "Gerrit",
    TRUE ~ "FALSE"
  )
}

#' Read one repository's commit CSV and add the derived columns.
#'
#' @param csv_path Path to a CSV that has `commit_date` and `author_email`
#'   columns.
#' @return The CSV contents with `commit_date` parsed by `ymd_hms()` and an
#'   added `isAuthorWMF` label column.
read_commits <- function(csv_path) {
  read.csv(csv_path, header = TRUE) |>
    mutate(
      commit_date = ymd_hms(commit_date),
      isAuthorWMF = classify_author(author_email)
    )
}

#' Count commits per week and author label around one event.
#'
#' @param commits Data frame as returned by `read_commits()`.
#' @param event_label Name of an entry in `event_windows` ("c1", "c2", "c3").
#' @return Tibble with columns `week_index`, `isAuthorWMF`, `count` and
#'   `source` (set to `event_label`), limited to the event's week window.
weekly_counts <- function(commits, event_label) {
  event_window <- event_windows[[event_label]]
  if (is.null(event_window)) {
    stop("Unknown event label: ", event_label, call. = FALSE)
  }
  ref_date <- event_window$event_date
  first_week <- event_window$first_week
  last_week <- event_window$last_week

  commits |>
    mutate(week_index = relative_week(commit_date, ref_date)) |>
    # Restrict to the window before grouping so out-of-window weeks are never
    # aggregated; rows whose date failed to parse (NA index) drop out here too.
    filter(week_index >= first_week & week_index <= last_week) |>
    group_by(week_index, isAuthorWMF) |>
    summarise(count = n(), .groups = "drop") |>
    mutate(source = event_label)
}

# core
core_csv <- "~/121325_work/121225_vd_data/core_2010-01-01_to_2024-12-31.csv"
core_df <- read_commits(core_csv)
c1_core_weekly <- weekly_counts(core_df, "c1")
c2_core_weekly <- weekly_counts(core_df, "c2")
c3_core_weekly <- weekly_counts(core_df, "c3")
# collate and save
core_weekly <- bind_rows(c1_core_weekly, c2_core_weekly, c3_core_weekly)
write.csv(
  core_weekly,
  "~/121325_work/aggregate_dfs/121325_core_weekly.csv",
  row.names = FALSE
)

# operations (mediawiki-config)
wmfconfig_csv <- "~/121325_work/121225_vd_data/mediawiki-config_2010-01-01_to_2024-12-31.csv"
wmfconfig_df <- read_commits(wmfconfig_csv)
c1_config_weekly <- weekly_counts(wmfconfig_df, "c1")
c2_config_weekly <- weekly_counts(wmfconfig_df, "c2")
c3_config_weekly <- weekly_counts(wmfconfig_df, "c3")
# collate and save
config_weekly <- bind_rows(c1_config_weekly, c2_config_weekly, c3_config_weekly)
write.csv(
  config_weekly,
  "~/121325_work/aggregate_dfs/121325_config_weekly.csv",
  row.names = FALSE
)

# puppet
puppet_csv <- "~/121325_work/121225_vd_data/puppet_2000-01-01_to_2016-12-31.csv"
puppet_df <- read_commits(puppet_csv)
c1_puppet_weekly <- weekly_counts(puppet_df, "c1")
c2_puppet_weekly <- weekly_counts(puppet_df, "c2")
c3_puppet_weekly <- weekly_counts(puppet_df, "c3")
# collate and save
puppet_weekly <- bind_rows(c1_puppet_weekly, c2_puppet_weekly, c3_puppet_weekly)
write.csv(
  puppet_weekly,
  "~/121325_work/aggregate_dfs/121325_puppet_weekly.csv",
  row.names = FALSE
)

# extension_ve (only the c1 window is aggregated and saved for this repository)
ve_csv <- "~/121325_work/121225_vd_data/extension_VisualEditor_2000-01-01_to_2016-12-31.csv"
ve_df <- read_commits(ve_csv)
c1_ve_weekly <- weekly_counts(ve_df, "c1")
write.csv(
  c1_ve_weekly,
  "~/121325_work/aggregate_dfs/121325_ve_weekly.csv",
  row.names = FALSE
)