# mw-lifecycle-analysis/analysis_data/roster_confirmation.R

library(tidyverse)
library(jsonlite)
library(lubridate)
## TODO: get the within-case seniority
# Read the unified Phabricator export.
main_csv <- "~/analysis_data/100325_unified_phab.csv"
main_df <- read.csv(main_csv, header = TRUE)

# Pin the level order of `source` so that sorting on it places "c1" rows
# ahead of "c2" rows, and "c2" rows ahead of "c3" rows.
main_df <- main_df |>
  mutate(source = factor(source, levels = c("c1", "c2", "c3")))

# Per-author running index of each row:
#   * "c1" rows are numbered 1..k in date_created order (they sort first
#     within the author, so the plain row number is already their index);
#   * "c2"/"c3" rows restart at 1, in (source, date_created) order, by
#     subtracting the author's number of "c1" rows from the row number;
#   * rows whose `source` is none of the three levels get NA.
main_w_seniority <- main_df |>
  arrange(AuthorPHID, source, date_created) |>
  group_by(AuthorPHID) |>
  mutate(
    comment_index = case_when(
      source == "c1" ~ row_number(),
      source %in% c("c2", "c3") ~ row_number() - sum(source == "c1")
    )
  ) |>
  ungroup()

# Load the existing 'WMF in name' tags: an earlier roster stored as JSON.
# Only its `phid` field is kept; `old_phids` is used further down as a
# membership list when computing the preliminary WMF-author flag.
WMF_in_name <- "~/analysis_data/022825_wmf_master_phab_roster.json"
old_roster <- fromJSON(WMF_in_name)
# NOTE(review): `$` partial-matches names on lists/data frames; presumably the
# field is named exactly `phid` -- confirm against the JSON file.
old_phids <- old_roster$phid
## TODO: reload roster and map onto the existing data
# Newer roster, read from CSV. Columns of it referenced later in this script
# include `start.date`, `end.date` and `PhabricatorPHID`.
roster_csv <-"~/analysis_data/100625_wmf_roster_final.csv"
roster_df <- read.csv(roster_csv, header = TRUE)
# need to figure out the change to longitudinally situated bool
library(lubridate)

# Normalise raw roster end dates to epoch seconds.
#
# Each element of `date_col` is interpreted as follows:
#   * NA or ""       -> the current time;
#   * "YYYY"         -> 6 June of that year;
#   * "MM/DD/YYYY"   -> that calendar date;
#   * anything else  -> the current time (fallback).
# Dates are parsed in UTC, the same convention clean_start_date() uses, so
# the result does not depend on the machine's local time zone.
#
# Args:
#   date_col: vector of raw end-date values (character, or anything that
#     grepl()/paste0() can coerce to character).
#
# Returns:
#   A double vector of seconds since 1970-01-01 UTC, one per input element
#   (named by the input values when `date_col` is character). The values are
#   plain numbers, not POSIXct. A zero-length input gives a zero-length
#   double vector.
clean_end_date <- function(date_col) {
  # One snapshot, so every open-ended row receives exactly the same "now".
  now_secs <- as.numeric(Sys.time())

  parse_utc <- function(txt) {
    as.numeric(as.POSIXct(txt, format = "%m/%d/%Y", tz = "UTC"))
  }

  # vapply() pins the result to a double vector; sapply() would hand back an
  # empty list for zero-length input.
  vapply(date_col, function(x) {
    if (is.na(x) || x == "") {
      now_secs
    } else if (grepl("^\\d{4}$", x)) {
      # Year only: use 06/06 of that year.
      parse_utc(paste0("06/06/", x))
    } else if (grepl("^\\d{2}/\\d{2}/\\d{4}$", x)) {
      parse_utc(x)
    } else {
      now_secs
    }
  }, numeric(1))
}

# Normalise raw roster start dates to epoch seconds (UTC).
#
# Each element of `date_col` is interpreted as follows:
#   * NA or ""       -> the default start date, 20 June 2003;
#   * "YYYY"         -> 6 June of that year;
#   * "MM/DD/YYYY"   -> that calendar date;
#   * anything else  -> the default start date.
# NOTE(review): 2003-06-20 is presumably the Wikimedia Foundation's founding
# date, i.e. "earliest possible start" -- confirm.
#
# Args:
#   date_col: vector of raw start-date values (character, or anything that
#     grepl()/paste0() can coerce to character).
#
# Returns:
#   A double vector of seconds since 1970-01-01 UTC, one per input element
#   (named by the input values when `date_col` is character). The values are
#   plain numbers, not POSIXct. A zero-length input gives a zero-length
#   double vector.
clean_start_date <- function(date_col) {
  parse_utc <- function(txt) {
    as.numeric(as.POSIXct(txt, format = "%m/%d/%Y", tz = "UTC"))
  }
  default_secs <- parse_utc("06/20/2003")

  # vapply() pins the result to a double vector; sapply() would hand back an
  # empty list for zero-length input.
  vapply(date_col, function(x) {
    if (is.na(x) || x == "") {
      default_secs
    } else if (grepl("^\\d{4}$", x)) {
      # Year only: use 06/06 of that year.
      parse_utc(paste0("06/06/", x))
    } else if (grepl("^\\d{2}/\\d{2}/\\d{4}$", x)) {
      parse_utc(x)
    } else {
      default_secs
    }
  }, numeric(1))
}

# Add the parsed tenure bounds to the roster and expose the Phabricator PHID
# under the name `AuthorPHID`, the key the comment data is joined on below.
# The original `PhabricatorPHID` column is kept alongside the copy.
roster_df <- mutate(
  roster_df,
  cleaned_start_date = clean_start_date(start.date),
  cleaned_end_date = clean_end_date(end.date),
  AuthorPHID = PhabricatorPHID
)

# Attach roster information to every comment row (matched on AuthorPHID) and
# derive two flags:
#   * isAuthorWMF_prelim - TRUE when date_created lies within the author's
#     cleaned roster start/end bounds, or when the author is listed in
#     old_phids; anything left NA (e.g. authors absent from both) becomes
#     FALSE;
#   * isGerritBot - the author is the gerrit bot account,
#     PHID-USER-idceizaw6elwiwm5xshb.
# The roster-only columns are dropped again once the flags exist.
# NOTE(review): if roster_df holds more than one row for an AuthorPHID, the
# join repeats that author's comment rows -- confirm the roster is unique
# per PHID.
main_w_seniority_and_affil <- main_w_seniority |>
  left_join(roster_df, by = "AuthorPHID") |>
  mutate(
    isAuthorWMF_prelim =
      (date_created >= cleaned_start_date & date_created <= cleaned_end_date) |
        AuthorPHID %in% old_phids,
    isGerritBot = AuthorPHID == "PHID-USER-idceizaw6elwiwm5xshb"
  ) |>
  replace_na(list(isAuthorWMF_prelim = FALSE)) |>
  select(
    -c(
      name, start.date, end.date, team,
      humanID, phabricatorNick, PhabricatorPHID, X,
      notes, sources, X.1, X.2,
      cleaned_start_date, cleaned_end_date
    )
  )

# TODO: bugzilla, if from bugzilla and contains specific string, isAuthorWMF is true
# bzimport PHID: PHID-USER-ynivjflmc2dcl6w5ut5v
#
# Final affiliation flag.
#   * Rows authored by the bzimport account are decided by the comment text
#     alone: isAuthorWMF is TRUE only when bzName is usable (not NA, "" or
#     "NA") and comment_text starts with either
#       **Author:** `<bzName>`    or    **<bzName>** wrote:
#   * Every other row keeps its preliminary flag.
# The helper columns and bzName are dropped afterwards.
main_w_seniority_and_affil <- main_w_seniority_and_affil |>
  mutate(
    # `%in%` never returns NA, so a row with a missing AuthorPHID falls
    # through to isAuthorWMF_prelim instead of making the final flag NA.
    is_bzimport = AuthorPHID %in% "PHID-USER-ynivjflmc2dcl6w5ut5v",
    bugzilla_wmf = is_bzimport &
      !is.na(bzName) & bzName != "" & bzName != "NA" &
      !is.na(comment_text) & (
        str_starts(comment_text, fixed(paste0("**Author:** `", bzName, "`"))) |
          str_starts(comment_text, fixed(paste0("**", bzName, "** wrote:")))
      ),
    isAuthorWMF = ifelse(is_bzimport, bugzilla_wmf, isAuthorWMF_prelim)
  ) |>
  select(-is_bzimport, -bugzilla_wmf, -isAuthorWMF_prelim, -bzName)
# Result note: NO roster affiliates were found through the bugzilla matching.
# Tabulate the final flag as a quick check.
table(main_w_seniority_and_affil[["isAuthorWMF"]])

# Save the annotated data; the path is relative, so the file is written to
# the current working directory.
write.csv(
  main_w_seniority_and_affil,
  file = "100625_unified_w_affil.csv",
  row.names = FALSE
)