# R script (108 lines, 3.8 KiB)
library(tidyverse)
library(jsonlite)
library(lubridate)

## TODO: get the within-case seniority

# Read the main table from disk (the file name suggests a unified
# Phabricator export -- confirm against the data dictionary).
main_csv <- "~/analysis_data/100325_unified_phab.csv"
main_df <- read.csv(main_csv)

# Turn `source` into a factor with a fixed level order (c1 < c2 < c3).
# Any value outside these three levels becomes NA.
main_df[["source"]] <- factor(
  main_df[["source"]],
  levels = c("c1", "c2", "c3")
)

# Per-author running index ("seniority") of each row.
#
# Rows are sorted by author, then by `source` (factor order c1, c2, c3),
# then by `date_created`. Within each author:
#   * c1 rows are numbered 1..n_c1;
#   * c2 and c3 rows share a single counter that restarts at 1 after the
#     author's c1 rows;
#   * rows whose `source` is NA get an NA index.
# NOTE(review): because the sort key is `source` before `date_created`,
# every c2 row of an author is numbered before any of their c3 rows,
# regardless of date -- confirm that this is the intended ordering.
main_w_seniority <- main_df |>
  arrange(AuthorPHID, source, date_created) |>
  group_by(AuthorPHID) |>
  mutate(
    comment_index = case_when(
      source == "c1" ~ row_number(),
      # na.rm = TRUE: a single NA `source` in the group must not turn the
      # c1 count (and therefore every c2/c3 index of that author) into NA.
      source %in% c("c2", "c3") ~
        row_number() - sum(source == "c1", na.rm = TRUE)
    )
  ) |>
  ungroup()

# Load the existing 'WMF in name' tags; only the `phid` field of the parsed
# JSON is used below.
WMF_in_name <- "~/analysis_data/022825_wmf_master_phab_roster.json"
old_roster <- jsonlite::fromJSON(txt = WMF_in_name)
old_phids <- old_roster$phid

## TODO: reload roster and map onto the existing data
roster_csv <- "~/analysis_data/100625_wmf_roster_final.csv"
roster_df <- utils::read.csv(file = roster_csv)

# need to figure out the change to longitudinally situated bool
library(lubridate)

# Normalise a column of end dates into a POSIXct vector (UTC).
#
# Args:
#   date_col: vector of end dates; values are coerced to character before
#     being matched.
#
# Returns:
#   An unnamed POSIXct vector of the same length as `date_col`:
#     * "YYYY"        -> 6 June of that year (00:00 UTC)
#     * "MM/DD/YYYY"  -> that date (00:00 UTC)
#     * NA, "", or any other text -> the current time
#   A value shaped like MM/DD/YYYY that is not a real calendar date gives NA.
#   Zero-length input gives a zero-length POSIXct.
clean_end_date <- function(date_col) {
  raw <- as.character(date_col)
  year_only <- !is.na(raw) & grepl("^\\d{4}$", raw)
  month_day_year <- !is.na(raw) & grepl("^\\d{2}/\\d{2}/\\d{4}$", raw)

  # Start from the fallback ("now") and overwrite the parseable entries.
  # The vector is built as POSIXct directly: collecting per-element results
  # with sapply() would simplify them to a bare numeric vector and drop the
  # date-time class.
  cleaned <- .POSIXct(rep(as.numeric(Sys.time()), length(raw)), tz = "UTC")

  if (any(year_only)) {
    cleaned[year_only] <- as.POSIXct(
      paste0("06/06/", raw[year_only]),
      format = "%m/%d/%Y", tz = "UTC"
    )
  }
  if (any(month_day_year)) {
    cleaned[month_day_year] <- as.POSIXct(
      raw[month_day_year],
      format = "%m/%d/%Y", tz = "UTC"
    )
  }

  cleaned
}

# Normalise a column of start dates into a POSIXct vector (UTC).
#
# Args:
#   date_col: vector of start dates; values are coerced to character before
#     being matched.
#
# Returns:
#   An unnamed POSIXct vector of the same length as `date_col`:
#     * "YYYY"        -> 6 June of that year (00:00 UTC)
#     * "MM/DD/YYYY"  -> that date (00:00 UTC)
#     * NA, "", or any other text -> the default start, 20 June 2003 UTC
#   A value shaped like MM/DD/YYYY that is not a real calendar date gives NA.
#   Zero-length input gives a zero-length POSIXct.
clean_start_date <- function(date_col) {
  default_date <- as.POSIXct("06/20/2003", format = "%m/%d/%Y", tz = "UTC")

  raw <- as.character(date_col)
  year_only <- !is.na(raw) & grepl("^\\d{4}$", raw)
  month_day_year <- !is.na(raw) & grepl("^\\d{2}/\\d{2}/\\d{4}$", raw)

  # Start from the default and overwrite the parseable entries. The vector
  # is built as POSIXct directly: collecting per-element results with
  # sapply() would simplify them to a bare numeric vector and drop the
  # date-time class.
  cleaned <- .POSIXct(rep(as.numeric(default_date), length(raw)), tz = "UTC")

  if (any(year_only)) {
    cleaned[year_only] <- as.POSIXct(
      paste0("06/06/", raw[year_only]),
      format = "%m/%d/%Y", tz = "UTC"
    )
  }
  if (any(month_day_year)) {
    cleaned[month_day_year] <- as.POSIXct(
      raw[month_day_year],
      format = "%m/%d/%Y", tz = "UTC"
    )
  }

  cleaned
}

# Add normalised tenure bounds to the roster, plus an `AuthorPHID` copy of
# `PhabricatorPHID` so the roster can be joined to the main table on
# `AuthorPHID`.
roster_df <- mutate(
  roster_df,
  cleaned_start_date = clean_start_date(start.date),
  cleaned_end_date = clean_end_date(end.date),
  AuthorPHID = PhabricatorPHID
)

# gerrit bot: PHID-USER-idceizaw6elwiwm5xshb
#
# Attach roster rows to every main-table row by AuthorPHID and derive:
#   * isAuthorWMF_prelim: TRUE when `date_created` lies within the roster
#     entry's [cleaned_start_date, cleaned_end_date], or when the author is
#     listed in `old_phids`; an NA result (e.g. no roster match and not in
#     `old_phids`) is replaced by FALSE.
#   * isGerritBot: TRUE when the author is the gerrit bot account.
# The roster columns are dropped again afterwards.
# NOTE(review): if the roster holds more than one row per PHID this join
# duplicates main-table rows -- confirm the roster is unique per PHID.
# NOTE(review): assumes `date_created` is directly comparable with the
# cleaned date columns -- confirm its type as read from the CSV.
main_w_seniority_and_affil <- main_w_seniority |>
  left_join(roster_df, by = "AuthorPHID") |>
  mutate(
    isAuthorWMF_prelim =
      (date_created >= cleaned_start_date &
         date_created <= cleaned_end_date) |
      AuthorPHID %in% old_phids,
    isGerritBot = AuthorPHID == "PHID-USER-idceizaw6elwiwm5xshb"
  ) |>
  replace_na(list(isAuthorWMF_prelim = FALSE)) |>
  select(
    -c(
      name, start.date, end.date, team,
      humanID, phabricatorNick, PhabricatorPHID, X,
      notes, sources, X.1, X.2,
      cleaned_start_date, cleaned_end_date
    )
  )

# TODO: bugzilla, if from bugzilla and contains specific string,
# isAuthorWMF is true
# bzimport PHID: PHID-USER-ynivjflmc2dcl6w5ut5v
#
# Final affiliation flag. For rows authored by the bzimport account,
# isAuthorWMF is TRUE only when `bzName` is usable (not NA, "" or "NA") and
# `comment_text` begins with either
#   **Author:** `<bzName>`     or     **<bzName>** wrote:
# For every other author the preliminary flag is carried over unchanged.
# The helper columns and `bzName` are removed at the end.
main_w_seniority_and_affil <- main_w_seniority_and_affil |>
  mutate(
    bugzilla_wmf =
      AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v" &
      !(is.na(bzName) | bzName == "" | bzName == "NA") &
      !is.na(comment_text) &
      (
        str_starts(
          comment_text,
          fixed(paste0("**Author:** `", bzName, "`"))
        ) |
          str_starts(
            comment_text,
            fixed(paste0("**", bzName, "** wrote:"))
          )
      ),
    isAuthorWMF = ifelse(
      AuthorPHID == "PHID-USER-ynivjflmc2dcl6w5ut5v",
      bugzilla_wmf,
      isAuthorWMF_prelim
    )
  ) |>
  select(-c(bugzilla_wmf, isAuthorWMF_prelim, bzName))

# NO roster affiliates found through the bugzilla matching
# Tabulate the final flag as a quick sanity check.
table(main_w_seniority_and_affil[["isAuthorWMF"]])

# Write the result to the current working directory, without row names.
write.csv(
  x = main_w_seniority_and_affil,
  file = "100625_unified_w_affil.csv",
  row.names = FALSE
)