# File: mw-lifecycle-analysis/analysis_data/roster_confirmation.R
# Timestamp: 2025-10-06 13:55:03 -07:00
# Listing metadata: 108 lines, 3.8 KiB, R

library(tidyverse)
library(jsonlite)
library(lubridate)
## Within-author seniority ----
## Loads the unified Phabricator export and adds `comment_index`: a per-author
## running counter, kept separately for source "c1" and for the pooled
## "c2"/"c3" sources.
## TODO: get the within-case seniority
main_csv <- "~/analysis_data/100325_unified_phab.csv"
main_df <- read.csv(main_csv, header = TRUE) %>%
  # Any `source` value other than c1/c2/c3 becomes NA in this factor.
  mutate(source = factor(source, levels = c("c1", "c2", "c3")))

main_w_seniority <- main_df %>%
  # Sorting on the factor puts each author's c1 rows first, then c2, then c3
  # (NA sources last), so the c2/c3 counter is row_number() minus the number
  # of that author's c1 rows.
  arrange(AuthorPHID, source, date_created) %>%
  group_by(AuthorPHID) %>%
  mutate(
    c2_c3_index = ifelse(
      source %in% c("c2", "c3"),
      # na.rm = TRUE: one NA `source` must not turn the c1 count into NA and
      # thereby blank the index of every c2/c3 row of the same author.
      row_number() - sum(source == "c1", na.rm = TRUE),
      NA_integer_
    ),
    comment_index = case_when(
      source == "c1" ~ row_number(),
      source %in% c("c2", "c3") ~ c2_c3_index
    )
  ) %>%
  ungroup() %>%
  select(-c2_c3_index)
# Existing 'WMF in name' tags: read the earlier roster (JSON) and keep its
# `phid` entries for the membership test further down.
WMF_in_name <- "~/analysis_data/022825_wmf_master_phab_roster.json"
old_roster <- WMF_in_name |> fromJSON()
old_phids <- old_roster$phid

## TODO: reload roster and map onto the existing data
# Current roster (CSV, first line is the header).
roster_csv <- "~/analysis_data/100625_wmf_roster_final.csv"
roster_df <- roster_csv |> read.csv(header = TRUE)
# need to figure out the change to longitudinally situated bool
library(lubridate)
#' Convert raw roster end dates into epoch seconds.
#'
#' Each element of `date_col` is handled according to its textual form:
#' * `"MM/DD/YYYY"`: midnight UTC of that day;
#' * `"YYYY"` (year only): midnight UTC on 06/06 of that year;
#' * `NA`, `""`, or any other text: `now`, i.e. treated as open-ended.
#'
#' Dates are parsed in UTC so that end bounds use the same clock as the start
#' bounds produced by `clean_start_date()`.
#'
#' @param date_col Vector of raw end-date values (character, or anything
#'   `as.character()` can convert). May be empty.
#' @param now Timestamp substituted for open-ended or unrecognised entries.
#'   It is evaluated once per call, so all such rows share the same value.
#' @return Unnamed numeric vector, the same length as `date_col`, holding
#'   seconds since 1970-01-01 UTC. Text that has the `MM/DD/YYYY` shape but is
#'   not a real date (e.g. month 13) yields `NA`.
clean_end_date <- function(date_col, now = Sys.time()) {
  parse_mdy_utc <- function(x) {
    as.numeric(as.POSIXct(x, format = "%m/%d/%Y", tz = "UTC"))
  }

  raw <- as.character(date_col)
  # grepl() is FALSE for NA and "", so those fall through to the fallback.
  year_only <- grepl("^\\d{4}$", raw)
  full_date <- grepl("^\\d{2}/\\d{2}/\\d{4}$", raw)

  # Expand bare years to 06/06/<year> so a single parse covers both shapes.
  stamps <- raw
  stamps[year_only] <- paste0("06/06/", raw[year_only])

  out <- parse_mdy_utc(stamps)
  out[!(year_only | full_date)] <- as.numeric(now)
  out
}
#' Convert raw roster start dates into epoch seconds (UTC).
#'
#' Each element of `date_col` is handled according to its textual form:
#' * `"MM/DD/YYYY"`: midnight UTC of that day;
#' * `"YYYY"` (year only): midnight UTC on 06/06 of that year;
#' * `NA`, `""`, or any other text: the default start, 06/20/2003 UTC.
#'
#' @param date_col Vector of raw start-date values (character, or anything
#'   `as.character()` can convert). May be empty.
#' @return Unnamed numeric vector, the same length as `date_col`, holding
#'   seconds since 1970-01-01 UTC. Text that has the `MM/DD/YYYY` shape but is
#'   not a real date (e.g. month 13) yields `NA`.
clean_start_date <- function(date_col) {
  parse_mdy_utc <- function(x) {
    as.numeric(as.POSIXct(x, format = "%m/%d/%Y", tz = "UTC"))
  }
  default_date <- parse_mdy_utc("06/20/2003")

  raw <- as.character(date_col)
  # grepl() is FALSE for NA and "", so those fall through to the default.
  year_only <- grepl("^\\d{4}$", raw)
  full_date <- grepl("^\\d{2}/\\d{2}/\\d{4}$", raw)

  # Expand bare years to 06/06/<year> so a single parse covers both shapes.
  stamps <- raw
  stamps[year_only] <- paste0("06/06/", raw[year_only])

  out <- parse_mdy_utc(stamps)
  out[!(year_only | full_date)] <- default_date
  out
}
# Add cleaned start/end bounds to every roster row and copy the Phabricator
# PHID into `AuthorPHID`, the key used when joining onto the main data.
roster_df <- mutate(
  roster_df,
  cleaned_start_date = clean_start_date(start.date),
  cleaned_end_date = clean_end_date(end.date),
  AuthorPHID = PhabricatorPHID
)
# Gerrit bot account: PHID-USER-idceizaw6elwiwm5xshb
# Join the roster onto the comment data and derive two flags:
#   isAuthorWMF_prelim - author is in the old PHID list, or the row's
#                        date_created lies inside the roster start/end bounds
#                        (rows with no roster match end up FALSE);
#   isGerritBot        - author is the Gerrit bot account.
# NOTE(review): the interval test compares date_created directly against the
# cleaned roster bounds - confirm both are on the same scale.
main_w_seniority_and_affil <- main_w_seniority |>
  left_join(roster_df, by = "AuthorPHID") |>
  mutate(
    isAuthorWMF_prelim = coalesce(
      AuthorPHID %in% old_phids |
        (date_created >= cleaned_start_date & date_created <= cleaned_end_date),
      FALSE
    ),
    isGerritBot = AuthorPHID == "PHID-USER-idceizaw6elwiwm5xshb"
  ) |>
  # Drop the roster bookkeeping columns once the flags are computed.
  select(-c(
    name, start.date, end.date, team,
    humanID, phabricatorNick, PhabricatorPHID, X,
    notes, sources, X.1, X.2, cleaned_start_date, cleaned_end_date
  ))
# TODO: bugzilla, if from bugzilla and contains specific string, isAuthorWMF is true
# bzimport PHID: PHID-USER-ynivjflmc2dcl6w5ut5v
# For rows authored by the bzimport account, the final flag is TRUE only when
# the row has a usable bzName and the comment text begins with one of the two
# attribution headers built from it; every other row keeps the preliminary flag.
# NOTE(review): bzName is taken from the same row. If it comes from the roster
# join on AuthorPHID, bzimport rows would only ever see the bzName of a roster
# entry keyed by the bzimport PHID - confirm this is intended.
main_w_seniority_and_affil <- main_w_seniority_and_affil |>
  mutate(
    bugzilla_wmf =
      AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v" &
        !is.na(bzName) & bzName != "" & bzName != "NA" &
        !is.na(comment_text) & (
          str_starts(comment_text, fixed(paste0("**Author:** `", bzName, "`"))) |
            str_starts(comment_text, fixed(paste0("**", bzName, "** wrote:")))
        ),
    isAuthorWMF = if_else(
      AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v",
      bugzilla_wmf,
      isAuthorWMF_prelim
    )
  ) |>
  select(-c(bugzilla_wmf, isAuthorWMF_prelim, bzName))
# NO roster affiliates found through the bugzilla matching
# Tabulate the final flag, then write the annotated data to the working directory.
table(main_w_seniority_and_affil[["isAuthorWMF"]])
write.csv(
  main_w_seniority_and_affil,
  file = "100625_unified_w_affil.csv",
  row.names = FALSE
)