# 1
# 0
# mw-lifecycle-analysis/121325_work/location_count_aggregation.R
#
# 127 lines
# 4.8 KiB
# R

library(tidyverse)
library(dplyr)
library(lubridate)
library(stringr)
# Reference dates for the three series. Weekly indices for the "c1", "c2" and
# "c3" series (faceted below as VisualEditor, HTTPS-login and
# HTTP-deprecation) are measured relative to these dates.
c1_event_date <- as.Date("2013-07-01", format = "%Y-%m-%d")
c2_event_date <- as.Date("2013-08-28", format = "%Y-%m-%d")
c3_event_date <- as.Date("2015-07-02", format = "%Y-%m-%d")
# Number of whole weeks between `ref_date` and `date`, as an integer.
#
# The day difference is floor-divided by 7, so week 0 covers the seven days
# starting at `ref_date`, and dates before `ref_date` get negative indices
# (the day immediately before `ref_date` is week -1). Vectorised over `date`.
relative_week <- function(date, ref_date) {
  days_elapsed <- as.numeric(difftime(date, ref_date, units = "days"))
  whole_weeks <- days_elapsed %/% 7
  as.integer(whole_weeks)
}
# Flag text that mentions HTTP/TLS/login-related terms.
#
# Returns TRUE for an element of `text` when any of the following holds:
#   * it contains "http" but not "://",
#   * it contains "login", "ssl" or "tls",
#   * it contains "cert" but not "certain".
# NA elements yield FALSE. Matching is case-sensitive.
#
# The parentheses spell out how R already groups the expression (`&` binds
# tighter than `|`): the "://" exclusion applies only to the "http" term.
contains_http_but_not_url <- function(text) {
  has <- function(pattern) str_detect(text, pattern)
  ifelse(
    is.na(text),
    FALSE,
    (has("http") & !has("://")) |
      has("login") |
      has("ssl") |
      has("tls") |
      (has("cert") & !has("certain"))
  )
}
# Load the commit records for the core repository and label each commit ----
#
# Columns added or rewritten:
#   commit_date   - parsed with ymd_hms()
#   code_location - first matching keyword in diff_info (case-insensitive):
#                   "login", then "auth", then "security"; otherwise "Other"
#   isAuthorWMF   - "TRUE" for the listed author addresses/domains, "Gerrit"
#                   for @gerrit.wikimedia.org addresses, otherwise "FALSE"
#                   (stored as strings, not logicals)
core_csv <- "~/121325_work/121225_vd_data/core_2010-01-01_to_2024-12-31.csv"
core_df <- read.csv(core_csv, header = TRUE) |>
  mutate(
    commit_date = ymd_hms(commit_date),
    code_location = case_when(
      grepl("login", diff_info, ignore.case = TRUE) ~ "login_in_location",
      grepl("auth", diff_info, ignore.case = TRUE) ~ "auth_in_location",
      grepl("security", diff_info, ignore.case = TRUE) ~ "security",
      TRUE ~ "Other"
    ),
    isAuthorWMF = case_when(
      grepl("krinkle@fastmail\\.com", author_email, ignore.case = TRUE) ~ "TRUE",
      grepl("@wikimedia\\.org", author_email, ignore.case = TRUE) ~ "TRUE",
      grepl("@wikimedia\\.de", author_email, ignore.case = TRUE) ~ "TRUE",
      grepl("@gerrit\\.wikimedia\\.org", author_email, ignore.case = TRUE) ~ "Gerrit",
      TRUE ~ "FALSE"
    )
  )
# Weekly commit counts around one event.
#
# Arguments:
#   commits      data frame with commit_date, isAuthorWMF and code_location
#   anchor_date  date that defines week 0 (see relative_week())
#   first_week,
#   last_week    inclusive bounds of the week window to keep
#   series       value written to the `source` column of the result
#
# Returns one row per (week_index, isAuthorWMF, code_location) combination
# inside the window, with the number of commits in `count`.
count_commits_by_week <- function(commits, anchor_date, first_week, last_week,
                                  series) {
  # Computed outside the dplyr data mask so that a column of `commits` can
  # never shadow the `anchor_date` argument.
  commits$week_index <- relative_week(commits$commit_date, anchor_date)
  commits |>
    group_by(week_index, isAuthorWMF, code_location) |>
    summarise(count = n(), .groups = "drop") |>
    filter(week_index >= first_week & week_index <= last_week) |>
    mutate(source = series)
}

# Same aggregation for each event; only the anchor date, the number of weeks
# kept before the event, and the series label differ.
c1_core_weekly <- count_commits_by_week(core_df, c1_event_date, -33, 13, "c1")
c2_core_weekly <- count_commits_by_week(core_df, c2_event_date, -104, 13, "c2")
c3_core_weekly <- count_commits_by_week(core_df, c3_event_date, -83, 13, "c3")
# Collate the three weekly series into one table, keeping only commits that
# were not authored through Gerrit and that matched one of the tracked code
# locations. (Nothing is written to disk here.)
core_counts <- bind_rows(c1_core_weekly, c2_core_weekly, c3_core_weekly) |>
  filter(
    isAuthorWMF != "Gerrit",
    code_location != "Other"
  )
# Weekly commit counts per event, one facet row per series ----
#
# Reference lines and text labels are described by small annotation tables
# with exactly one row per item. Passing a filtered copy of `core_counts` to
# geom_vline()/geom_text() instead would draw each item once per data row
# (overplotting it) and would silently drop it whenever the filter matched no
# rows.

# Per-series vertical reference lines: week offset and line style.
# NOTE(review): the c3 line at week -97 lies outside the c3 data window
# (weeks -83..13) - confirm the intended offset.
core_reference_lines <- data.frame(
  source = c("c1", "c1", "c1", "c2", "c2", "c3", "c3"),
  week_index = c(-29, -9, -4, -99, -4, -97, -3),
  linetype = c("dotted", "dotted", "3313", "dotted", "3313", "dotted", "3313")
)

# Per-series text labels and the (week, count) position they are drawn at.
core_reference_labels <- data.frame(
  source = c("c1", "c1", "c2"),
  week_index = c(6, -33, -12),
  count = c(120, 120, 20),
  label = c("Opt-out deployment", "Opt-in Testing", "Deployment Announcement")
)

core_commits_created <- ggplot(
  core_counts,
  aes(x = week_index, y = count)
) +
  facet_grid(
    source ~ .,
    scales = "free_y",
    labeller = labeller(source = c(
      "c1" = "VisualEditor",
      "c2" = "HTTPS-login",
      "c3" = "HTTP-deprecation"
    ))
  ) +
  # The fill mapping is what makes the dodge position, the viridis fill scale
  # and the legend title below take effect; without it all bars form a single
  # unfilled group.
  # NOTE(review): rows that differ only in code_location still share a dodge
  # slot and overlap - confirm whether they should be summed or stacked.
  geom_col(
    aes(fill = isAuthorWMF),
    position = position_dodge(width = 0.9),
    width = 0.8
  ) +
  # inherit.aes = FALSE: the annotation tables do not carry the plot-level
  # aesthetics' columns beyond what is mapped explicitly here.
  geom_vline(
    data = core_reference_lines,
    aes(xintercept = week_index, linetype = linetype),
    inherit.aes = FALSE,
    color = "black",
    linewidth = 0.5
  ) +
  # Use the linetype strings in the table as-is (no legend).
  scale_linetype_identity() +
  # Week 0 in every facet.
  geom_vline(xintercept = 0, linetype = "dashed", color = "black", linewidth = 0.5) +
  geom_text(
    data = core_reference_labels,
    aes(x = week_index, y = count, label = label),
    inherit.aes = FALSE,
    size = 2.5
  ) +
  theme_minimal() +
  scale_fill_viridis_d() +
  labs(
    x = "Weeks from Feature Deployment",
    y = "Count of mediawiki/core commits Created",
    fill = "Commit Author Affiliated with WMF?"
  ) +
  theme(legend.position = "top")
core_commits_created