# Patch summary (text before the first "diff --git" header is not part of any hunk).
#
# 121325_work/misc.R
#   * core_csv now points at the extension_VisualEditor_2000-01-01_to_2016-12-31
#     export instead of the core_2010-01-01_to_2024-12-31 export.
#   * Adds the vectors known_affil_emails and active_names.
#   * isAuthorWMF: the krinkle@fastmail.com e-mail rule is replaced by an
#     author_name %in% active_names rule, and a "localization" label is added
#     for l10n-bot@translatewiki.net.
#   * c1_core_weekly is grouped by isAuthorWMF instead of isVE and its window
#     becomes -9 <= week_index < -4 (previously -33 <= week_index <= 13).
#   * Adds c2_core_weekly / c3_core_weekly, the combined core_weekly, and the
#     per-label totals c1summary / c2summary / c3summary; removes true_.
#
# analysis_data/scratch.R
#   * Adds bz_summary (row counts per source x comment_type) plus 17 blank lines.
#
# Review notes (flagged only; the added lines are reproduced unchanged):
#   * known_affil_emails is not referenced by any line visible in this patch.
#   * The active_names rule yields the label "FIVE", while the neighbouring
#     wikimedia.org / wikimedia.de rules yield "TRUE" -- confirm this separate
#     label is intended.
#   * c1summary is assigned twice with identical code; the first assignment is
#     redundant.
#   * bz_summary computes isBz, but the group_by()/summarise() that follow do
#     not use it, so the column is discarded -- confirm whether isBz was meant
#     to be a grouping variable.
#   * c2_event_date and c3_event_date are not defined in the visible lines;
#     presumably defined elsewhere in misc.R -- verify.
#
# NOTE(review): this patch was reconstructed from a copy whose line breaks had
# been collapsed. Indentation and the placement of blank context lines were
# inferred from the hunk line counts; check against the repository before
# relying on byte-exact application.
diff --git a/121325_work/misc.R b/121325_work/misc.R
index 5ce8ded..94a294b 100644
--- a/121325_work/misc.R
+++ b/121325_work/misc.R
@@ -8,14 +8,19 @@ relative_week <- function(date, ref_date) {
   as.integer(as.numeric(difftime(date, ref_date, units = "days")) %/% 7)
 }
 
-core_csv <-"~/121325_work/121225_vd_data/core_2010-01-01_to_2024-12-31.csv"
+core_csv <-"~/121325_work/121225_vd_data/extension_VisualEditor_2000-01-01_to_2016-12-31.csv"
 core_df <- read.csv(core_csv, header = TRUE)
+known_affil_emails <- c("krinkle@fastmail.com", "roan.kattouw@gmail.com",
+                        "trevorparscal@gmail.com", "krinklemail@gmail.com", "moriel@gmail.com")
+active_names<- c("Timo Tijhof", "Krinkle", "Roan Kattouw", "Catrope",
+                 "Trevor Parscal", "Ed Sanders")
 core_df <- core_df |>
   mutate(commit_date = ymd_hms(commit_date)) |>
   mutate(isAuthorWMF = case_when(
-    grepl("krinkle@fastmail\\.com", author_email, ignore.case = TRUE) ~ "TRUE",
+    author_name %in% active_names ~ "FIVE",
     grepl("@wikimedia\\.org", author_email, ignore.case = TRUE) ~ "TRUE",
     grepl("@wikimedia\\.de", author_email, ignore.case = TRUE) ~ "TRUE",
+    grepl("l10n-bot@translatewiki\\.net", author_email, ignore.case = TRUE) ~ "localization",
     grepl("@gerrit\\.wikimedia\\.org", author_email, ignore.case = TRUE) ~ "Gerrit",
     TRUE ~ "FALSE"
   )) |>
@@ -27,10 +32,41 @@ core_df <- core_df |>
 
 c1_core_weekly <- core_df |>
   mutate(week_index = relative_week(commit_date, c1_event_date)) |>
-  group_by(week_index, isVE)|>
+  group_by(week_index, isAuthorWMF)|>
   summarise(count = n(), .groups = 'drop')|>
-  filter(week_index >= -33 & week_index <= 13) |>
+  filter(week_index >= -9 & week_index < -4) |>
   mutate(source = 'c1')
+c1summary <- c1_core_weekly |>
+  group_by(isAuthorWMF)|>
+  summarize(total = sum(count))
+
+
+c2_core_weekly <- core_df |>
+  mutate(week_index = relative_week(commit_date, c2_event_date)) |>
+  group_by(week_index, isAuthorWMF)|>
+  summarise(count = n(), .groups = 'drop')|>
+  filter(week_index >= -104 & week_index <= 13) |>
+  mutate(source = 'c2')
+
+c3_core_weekly <- core_df |>
+  mutate(week_index = relative_week(commit_date, c3_event_date)) |>
+  group_by(week_index, isAuthorWMF)|>
+  summarise(count = n(), .groups = 'drop')|>
+  filter(week_index >= -83 & week_index <= 13) |>
+  mutate(source = 'c3')
+#collate and save
+core_weekly <- rbind(c1_core_weekly, c2_core_weekly, c3_core_weekly)
+
+c1summary <- c1_core_weekly |>
+  group_by(isAuthorWMF)|>
+  summarize(total = sum(count))
+
+c2summary <- c2_core_weekly |>
+  group_by(isAuthorWMF)|>
+  summarize(total = sum(count))
+
+c3summary <- c3_core_weekly |>
+  group_by(isAuthorWMF)|>
+  summarize(total = sum(count))
+
 
-true_ <- c1_core_weekly |>
-  filter(isVE == TRUE)
diff --git a/analysis_data/scratch.R b/analysis_data/scratch.R
index 218ba54..493d6c0 100644
--- a/analysis_data/scratch.R
+++ b/analysis_data/scratch.R
@@ -2,6 +2,29 @@ library(tidyverse)
 main_csv <-"~/analysis_data/120725_unified.csv"
 main_df <- read.csv(main_csv, header = TRUE)
 
+bz_summary <- main_df |>
+  mutate(isBz = if_else(
+    AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v", TRUE, FALSE
+  )) |>
+  group_by(source, comment_type)|>
+  summarise(count = n(), .groups = 'drop')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 dsl_csv <-"~/dsl/120725_DSL_frame.csv"
 dsl_df <- read.csv(dsl_csv, header = TRUE)
 