library(tidyverse)
library(jsonlite)
library(lubridate)

# Annotates the unified Phabricator comment table with a per-author comment
# index and a WMF-affiliation flag, then writes the result to CSV.

## TODO: get the within-case seniority
main_csv <- "~/analysis_data/100325_unified_phab.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Fix the level order so that arrange() below sorts c1 rows ahead of c2/c3.
main_df <- main_df |>
  mutate(source = factor(source, levels = c("c1", "c2", "c3")))

# Per-author running index. After sorting, an author's c1 rows come first and
# are numbered 1..k; their c2/c3 rows are numbered from 1 again by subtracting
# the author's c1 count (k) from the within-author row number. Rows whose
# source is none of the three levels get NA.
main_w_seniority <- main_df |>
  arrange(AuthorPHID, source, date_created) |>
  group_by(AuthorPHID) |>
  mutate(
    comment_index = case_when(
      source == "c1" ~ row_number(),
      source %in% c("c2", "c3") ~ row_number() - sum(source == "c1")
    )
  ) |>
  ungroup()

# load in existing 'WMF in name' tags
WMF_in_name <- "~/analysis_data/022825_wmf_master_phab_roster.json"
old_roster <- fromJSON(WMF_in_name)
old_phids <- old_roster$phid

## TODO: reload roster and map onto the existing data
roster_csv <- "~/analysis_data/100625_wmf_roster_final.csv"
roster_df <- read.csv(roster_csv, header = TRUE)

# need to figure out the change to longitudinally situated bool

#' Parse a roster date column into POSIXct (UTC).
#'
#' Vectorised on purpose: building the result with sapply() would pass the
#' per-element POSIXct values through unlist(), which drops their class and
#' leaves bare epoch-second numerics.
#'
#' @param date_col Vector of raw roster dates. Recognised shapes are "YYYY"
#'   (mapped to 06/06 of that year) and "MM/DD/YYYY".
#' @param fallback Length-one POSIXct used for NA, empty, or unrecognised
#'   entries.
#' @return POSIXct vector the same length as `date_col`. An entry that matches
#'   the MM/DD/YYYY shape but is not a real calendar date yields NA.
parse_roster_date <- function(date_col, fallback) {
  raw <- as.character(date_col)
  is_year <- !is.na(raw) & grepl("^\\d{4}$", raw)
  is_mdy <- !is.na(raw) & grepl("^\\d{2}/\\d{2}/\\d{4}$", raw)

  parsed <- rep(fallback, length(raw))
  parsed[is_year] <- as.POSIXct(
    paste0("06/06/", raw[is_year]),
    format = "%m/%d/%Y",
    tz = "UTC"
  )
  parsed[is_mdy] <- as.POSIXct(raw[is_mdy], format = "%m/%d/%Y", tz = "UTC")
  parsed
}

#' Clean roster end dates; missing or unrecognised values mean "still active"
#' and fall back to the current time (taken once, in UTC, so every fallback
#' row shares the same instant and results do not depend on the local zone).
clean_end_date <- function(date_col) {
  parse_roster_date(date_col, fallback = now(tzone = "UTC"))
}

#' Clean roster start dates; missing or unrecognised values fall back to
#' 06/20/2003 (UTC).
clean_start_date <- function(date_col) {
  default_date <- as.POSIXct("06/20/2003", format = "%m/%d/%Y", tz = "UTC")
  parse_roster_date(date_col, fallback = default_date)
}

roster_df <- roster_df |>
  mutate(
    cleaned_start_date = clean_start_date(start.date),
    cleaned_end_date = clean_end_date(end.date),
    AuthorPHID = PhabricatorPHID
  )

# gerrit bot: PHID-USER-idceizaw6elwiwm5xshb
# An author counts as WMF (preliminary) when the comment falls inside their
# roster employment window, or when their PHID is in the older roster. Authors
# with no roster match have NA bounds; replace_na() turns the resulting NA
# flags into FALSE.
# NOTE(review): the type of date_created is not visible here. Epoch-second
# numerics compare directly against the POSIXct bounds; character timestamps
# are parsed by as.POSIXct() during the comparison -- confirm the format.
# NOTE(review): if a PHID appears more than once in the roster (or is blank on
# both sides), this join duplicates comment rows -- confirm roster uniqueness.
main_w_seniority_and_affil <- main_w_seniority |>
  left_join(roster_df, by = "AuthorPHID") |>
  mutate(
    isAuthorWMF_prelim =
      (date_created >= cleaned_start_date &
        date_created <= cleaned_end_date) |
        AuthorPHID %in% old_phids,
    isGerritBot = AuthorPHID == "PHID-USER-idceizaw6elwiwm5xshb"
  ) |>
  replace_na(list(isAuthorWMF_prelim = FALSE)) |>
  select(
    -name, -start.date, -end.date, -team, -humanID, -phabricatorNick,
    -PhabricatorPHID, -X, -notes, -sources, -X.1, -X.2,
    -cleaned_start_date, -cleaned_end_date
  )

# TODO: bugzilla, if from bugzilla and contains specific string, isAuthorWMF is true
# bzimport PHID: PHID-USER-ynivjflmc2dcl6w5ut5v
# Comments posted by the bzimport account are attributed through bzName: the
# row is flagged only when bzName is present and the comment text begins with
# one of the two attribution headers built from it. For every other author the
# preliminary roster-based flag is used unchanged.
main_w_seniority_and_affil <- main_w_seniority_and_affil |>
  mutate(
    bugzilla_wmf =
      AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v" &
        !is.na(bzName) &
        bzName != "" &
        bzName != "NA" &
        !is.na(comment_text) &
        (
          str_starts(
            comment_text,
            fixed(paste0("**Author:** `", bzName, "`"))
          ) |
            str_starts(comment_text, fixed(paste0("**", bzName, "** wrote:")))
        ),
    isAuthorWMF = ifelse(
      AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v",
      bugzilla_wmf,
      isAuthorWMF_prelim
    )
  ) |>
  select(-bugzilla_wmf, -isAuthorWMF_prelim, -bzName)

# NO roster affiliates found through the bugzilla matching
table(main_w_seniority_and_affil$isAuthorWMF)

write.csv(
  main_w_seniority_and_affil,
  "100625_unified_w_affil.csv",
  row.names = FALSE
)