18
0
Files
RCommunityData/R/wikia_admin.R

171 lines
6.2 KiB
R

# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu
# privileges of interest:
# a shared variable that gets used everywhere
generate.admin.addrm <- function (logevents, current.admins) {
# convert types of a few variables
logevents$ancient <- logevents$ancient == "true"
logevents$timestamp <- timestamp.to.POSIXct(logevents$timestamp)
logevents$rights.new[is.na(logevents$rights.new)] <- ""
logevents$rights.old[is.na(logevents$rights.old)] <- ""
# TODO do wikia wikis have these =?
# in WP, all of these are negated by one day
logevents <- logevents[!(logevents$ancient & logevents$comment == "="),]
##########################################
### Parsing logevents file
#########################################
# separate out moderns & ancients and the necessary columns
ancients <- logevents[logevents$ancient,c("title","comment","timestamp")]
moderns <- logevents[!logevents$ancient,
c("title","rights.new","rights.old","timestamp")]
# function that looks at rights.old, rights.new and returns a value of
# privilege, add/remove, and timestamp for each user
parse.moderns <- function (i, d) {
user <- sub('^User:', "", d[i,"title"])
change.time <- d[i,"timestamp"]
rights.new <- d[i,"rights.new"]
rights.old <- d[i,"rights.old"]
# create a vector of new and old rights:
destring <- function (x) { strsplit(as.character(x), ", ")[[1]] }
# create a list of privileges that are mentioned
privileges <- unique(c(destring(rights.new),
destring(rights.old)))
# create T/F vectors incidating which privileges were added/removed
added <- privileges[privileges %in% destring(rights.new) &
!(privileges %in% destring(rights.old))]
removed <- privileges[!(privileges %in% destring(rights.new)) &
privileges %in% destring(rights.old)]
# assemble the data frame of: role,action,user,timestamp
data.frame(user=rep(user, length(c(added,removed))),
role=c(added, removed),
action=c(rep("added",length(added)),
rep("removed", length(removed))),
timestamp=rep(change.time, length(c(added,removed))),
era=rep("modern", length(c(added,removed))),
stringsAsFactors=FALSE)
}
# if there are log events, and there are non-ancients, we parse them
if (dim(logevents)[1] & !all(logevents$ancient)) {
moderns.parsed <- do.call("rbind",
lapply(1:dim(moderns)[1], parse.moderns, moderns))
} else {
moderns.parsed = NULL
}
# another function to handle processing the ancients:
parse.ancient <- function (i, d) {
user <- sub('^.*?:', '', d[i,"title"])
comment <- d[i, "comment"]
change.time <- d[i, "timestamp"]
added <- unlist(strsplit(unlist(strsplit(comment, '(\\+|\\=)')), ', '))
# clean any leadin, trailing whitespace
added <- gsub("^\\s+|\\s+$", "", added)
data.frame(user=user,
role=added,
action="added",
timestamp=change.time,
era="ancient",
stringsAsFactors=FALSE)
}
# if there are any ancients, we parse them
if (any(logevents$ancient)) {
ancients.parsed <- do.call("rbind",
lapply(1:dim(ancients)[1], parse.ancient, ancients))
} else {
ancients.parsed = NULL
}
combined <- rbind(moderns.parsed, ancients.parsed)
##########################################
### Parsing current.admins file
#########################################
# turn each of the columns after the first two into logical
# function to process pre.ancients
parse.current.admins <- function (i, d) {
user <- d[i, "username"]
roles <- gsub("^\\s+|\\s+$", "", strsplit(d[i, "groups"], ",")[[1]])
o <- data.frame(user=user, role=roles, stringsAsFactors=FALSE)
colnames(o) <- c("user", "role")
return(o)
}
current.admins.parsed <- do.call("rbind",
lapply(1:dim(current.admins)[1],
parse.current.admins, current.admins))
# select pre-ancients as people who have a given right *today* but
# were never seen as having it added
is.pre.ancients <- function (i, d, combined) {
user <- d[i, "user"]
role <- d[i, "role"]
# look to see if we've see any events with this user and role added:
# if we see none, this is pre-ancient
!any(combined$user == user &
combined$role == role &
combined$action == "added")
}
# create the list of pre-ancients (people role combinations we have
# not seen in the logevents data
pre.ancients <- current.admins.parsed[sapply(1:dim(current.admins.parsed)[1],
is.pre.ancients,
current.admins.parsed,
combined),]
# make a list of people who have been removed
combined.removed <- combined[combined$action == "removed",]
if (!is.null(combined.removed)) {
if (dim(combined.removed)[1] > 0) {
combined.removed <- combined.removed[sapply(1:dim(combined.removed)[1],
function (i,d) {
user <- d[i,"user"]
role <- d[i,"role"]
timestamp <- d[i,"timestamp"]
# was the person added before they were removed? OR in the pre-ancients
any(combined$user == user &
combined$role == role &
combined$action == "added" &
combined$timestamp <= timestamp) | (user %in% pre.ancients$user)
}, combined.removed),c("user", "role")]
}
}
pre.ancients <- rbind(pre.ancients, combined.removed)
# give them the earliest ancient timestamp minus 1 day
pre.ancients$action <- "added"
pre.ancients$timestamp <- as.POSIXct("2000-01-01 00:00:00") # min(combined$timestamp) - 60 * 1440
pre.ancients$era <- "pre.ancient"
# and then add the pre.ancients to the
combined <- rbind(combined, pre.ancients)
# remove redunandt actions
combined <- combined[!duplicated(combined),]
return(combined)
}