From 7d05f4ae7042d36ad5a7ca2828d96be25d32d1ac Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill
Date: Fri, 24 Feb 2017 17:03:37 -0800
Subject: [PATCH] added r data module

---
 .Rbuildignore        |   2 +
 .gitignore           |   3 +
 DESCRIPTION          |   9 ++
 NAMESPACE            |   2 +
 R/hhi.R              |  23 +++++
 R/load_if_missing.R  |  35 ++++++++
 R/namespaces.R       |  82 ++++++++++++++++++
 R/wikia_admin.R      | 194 +++++++++++++++++++++++++++++++++++++++++++
 R/wikiq.R            | 119 ++++++++++++++++++++++++++
 RCommunityData.Rproj |  16 ++++
 10 files changed, 485 insertions(+)
 create mode 100644 .Rbuildignore
 create mode 100644 .gitignore
 create mode 100644 DESCRIPTION
 create mode 100644 NAMESPACE
 create mode 100644 R/hhi.R
 create mode 100644 R/load_if_missing.R
 create mode 100644 R/namespaces.R
 create mode 100644 R/wikia_admin.R
 create mode 100644 R/wikiq.R
 create mode 100644 RCommunityData.Rproj

diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000..91114bf
--- /dev/null
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..807ea25
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..2a53d7b
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,9 @@
+Package: RCommunityData
+Title: library of functions used in communitydata packages
+Version: 0.1
+Authors@R: person("Benjamin Mako", "Hill", email = "mako@atdot.cc", role = c("aut", "cre"))
+Description: library of functions used in communitydata packages
+Depends: R (>= 3.0)
+License: GPLv3+
+Encoding: UTF-8
+LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..884a631
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,2 @@
+# Generated by roxygen2: fake comment so roxygen2 overwrites silently.
+exportPattern("^[^\\.]")
diff --git a/R/hhi.R b/R/hhi.R
new file mode 100644
index 0000000..d17cb3e
--- /dev/null
+++ b/R/hhi.R
@@ -0,0 +1,23 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+## functions to create raw and normalized Herfindahl-Hirschman indexes
+
+# Herfindahl-Hirschman index: the sum of squared shares of x.
+#   x: numeric vector; it is rescaled to shares (x / sum(x)) first
+# NA values are not removed and a zero sum(x) is not guarded against.
+hhi <- function (x) {
+  shares <- x / sum(x)
+  sum(shares^2)
+}
+
+# Normalized Herfindahl-Hirschman index: (h - 1/n) / (1 - 1/n), where h
+# is hhi(x) and n is length(x).  Yields NaN when n < 2.
+hhi.norm <- function (x) {
+  n <- length(x)
+  h <- hhi(x)
+  (h - 1/n) / (1 - 1/n)
+}
+
diff --git a/R/load_if_missing.R b/R/load_if_missing.R
new file mode 100644
index 0000000..4143886
--- /dev/null
+++ b/R/load_if_missing.R
@@ -0,0 +1,35 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+# Load a saved data file into the caller's environment if a variable is
+# missing there.
+#   var.name:  name (a string) of the variable that should exist
+#   file.name: file handed to load() when the variable does not exist
+# Called for its side effect; returns NULL invisibly.
+load.if.missing <- function (var.name, file.name) {
+  # look the variable up in the same environment that load() fills.
+  # Resolving it from this function's own frame would miss variables
+  # that are local to a calling function.
+  caller <- parent.frame()
+
+  if (!exists(var.name, envir = caller)) {
+    load(file.name, envir = caller)
+    loaded <- get(var.name, envir = caller)
+
+    # check to see if we're dealing with a data.table because, if we
+    # are, we need to do some nasty back and forth
+    if (inherits(loaded, "data.table")) {
+      # rebuild the table from a plain data.frame and restore its key
+      # to get around the bug in data.table
+      assign(var.name,
+             data.table::data.table(as.data.frame(loaded),
+                                    key = attr(loaded, "sorted")),
+             envir = caller)
+    }
+  }
+
+  invisible(NULL)
+}
+
diff --git a/R/namespaces.R b/R/namespaces.R
new file mode 100644
index 0000000..0f96399
--- /dev/null
+++ b/R/namespaces.R
@@ -0,0 +1,82 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+## functions to deal with namespace information
+#####################################################################
+
+# Read the table of Wikia namespaces from a headerless TSV file.
+#   file.name: path of the TSV file (defaults to the original location)
+# Returns a data.frame with the columns wiki, ns.num and ns.string;
+# strings are read as factors and ns.num is converted to a factor.
+load.wikia.namespaces <- function (
+    file.name = "~/data/wikia_namespaces.tsv") {
+  wikia.namespaces <- read.delim(file.name, stringsAsFactors = TRUE,
+                                 header = FALSE)
+
+  colnames(wikia.namespaces) <- c("wiki", "ns.num", "ns.string")
+  wikia.namespaces$ns.num <- as.factor(wikia.namespaces$ns.num)
+  wikia.namespaces
+}
+
+# enwiki - move to barnstars directory
+# TODO: TEST
+# Returns the English Wikipedia namespace numbers as a numeric vector
+# whose names are the namespace names (the main namespace is named "").
+load.enwiki.namespaces <- function () {
+  enwiki.ns.num <- c(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+                     14, 15, 100, 101, 108, 109)
+
+  names(enwiki.ns.num) <- c("Media", "Special", "", "Talk", "User",
+                            "User talk", "Wikipedia", "Wikipedia talk",
+                            "File", "File talk", "MediaWiki",
+                            "MediaWiki talk", "Template", "Template talk",
+                            "Help", "Help talk", "Category",
+                            "Category talk", "Portal", "Portal talk",
+                            "Book", "Book talk")
+
+  # return the named vector itself: ending on the names<- assignment
+  # would return only the character vector of names, invisibly
+  enwiki.ns.num
+}
+
+# function to take a list of article titles and a wiki name and return
+# a list of numbered namespaces
+#   page.titles: vector of page titles; underscores are treated as spaces
+#   wiki:        value looked up in the wiki column of the namespace table
+# Uses a visible variable named wikia.namespaces when there is one and
+# calls load.wikia.namespaces() otherwise.
+# Returns a factor with one element per title; titles without a known
+# "Namespace:" prefix get "0".
+titles.to.ns.num <- function (page.titles, wiki) {
+  # load wikia namespace data from disk if it does not exist
+  if (!exists("wikia.namespaces")) {
+    wikia.namespaces <- load.wikia.namespaces()
+  }
+
+  ns.df <- wikia.namespaces[wikia.namespaces$wiki == wiki,
+                            c("ns.num", "ns.string")]
+
+  namespaces <- as.character(ns.df$ns.num)
+  names(namespaces) <- ns.df$ns.string
+
+  # drop the zero, we'll deal with it later
+  namespaces <- namespaces[namespaces != 0]
+
+  # change underscores to spaces (necessary?)
+  page.titles <- gsub("_", " ", page.titles)
+  page.ns <- rep("0", length(page.titles))
+
+  for (ns in names(namespaces)) {
+    # compare the prefix literally: namespace names are data, not
+    # regular expressions, so "(" or "." in a name must match itself
+    prefix <- paste0(ns, ":")
+    has.prefix <- !is.na(page.titles) &
+      substr(page.titles, 1, nchar(prefix)) == prefix
+    page.ns[has.prefix] <- namespaces[ns]
+  }
+
+  # return the list of namespaces as a factor
+  as.factor(page.ns)
+}
diff --git a/R/wikia_admin.R b/R/wikia_admin.R
new file mode 100644
index 0000000..00e067c
--- /dev/null
+++ b/R/wikia_admin.R
@@ -0,0 +1,194 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+# privileges of interest:
+# a shared variable that gets used everywhere
+
+# Build the history of privilege grants and removals for one wiki.
+#   logevents:      data.frame of rights-log entries; the columns read
+#                   here are ancient, timestamp, rights.new, rights.old,
+#                   comment and title
+#   current.admins: data.frame with the columns username and groups (a
+#                   comma-separated string of roles)
+# Returns a data.frame with the columns user, role, action, timestamp
+# and era ("modern", "ancient" or "pre.ancient") without duplicate rows,
+# or NULL when both inputs yield nothing.
+generate.admin.addrm <- function (logevents, current.admins) {
+
+  # convert types of a few variables
+  logevents$ancient <- logevents$ancient == "true"
+  logevents$timestamp <- timestamp.to.POSIXct(logevents$timestamp)
+  logevents$rights.new[is.na(logevents$rights.new)] <- ""
+  logevents$rights.old[is.na(logevents$rights.old)] <- ""
+
+  # TODO do wikia wikis have these =?
+  # in WP, all of these are negated by one day
+  logevents <- logevents[!(logevents$ancient & logevents$comment == "="), ]
+
+  ##########################################
+  ### Parsing logevents file
+  #########################################
+
+  # separate out moderns & ancients and the necessary columns
+  ancients <- logevents[logevents$ancient,
+                        c("title", "comment", "timestamp")]
+  moderns <- logevents[!logevents$ancient,
+                       c("title", "rights.new", "rights.old", "timestamp")]
+
+  # function that looks at rights.old, rights.new and returns a value of
+  # privilege, add/remove, and timestamp for each user
+  parse.moderns <- function (i, d) {
+    user <- sub("^User:", "", d[i, "title"])
+    change.time <- d[i, "timestamp"]
+
+    # split a comma-separated rights string into a vector of rights
+    destring <- function (x) { strsplit(as.character(x), ", ")[[1]] }
+    rights.new <- destring(d[i, "rights.new"])
+    rights.old <- destring(d[i, "rights.old"])
+
+    # rights gained and rights lost in this log entry
+    added <- setdiff(rights.new, rights.old)
+    removed <- setdiff(rights.old, rights.new)
+    n.changes <- length(added) + length(removed)
+
+    # assemble the data frame of: role,action,user,timestamp
+    data.frame(user = rep(user, n.changes),
+               role = c(added, removed),
+               action = rep(c("added", "removed"),
+                            c(length(added), length(removed))),
+               timestamp = rep(change.time, n.changes),
+               era = rep("modern", n.changes),
+               stringsAsFactors = FALSE)
+  }
+
+  # parse the moderns whenever the log holds any.  Testing for "no
+  # ancients at all" here would drop every modern entry of a wiki whose
+  # log mixes both kinds.
+  if (any(!logevents$ancient)) {
+    moderns.parsed <- do.call("rbind",
+                              lapply(seq_len(nrow(moderns)),
+                                     parse.moderns, moderns))
+  } else {
+    moderns.parsed <- NULL
+  }
+
+  # another function to handle processing the ancients: every right
+  # named in the comment is recorded as added
+  parse.ancient <- function (i, d) {
+    user <- sub("^.*?:", "", d[i, "title"])
+    comment <- as.character(d[i, "comment"])
+    change.time <- d[i, "timestamp"]
+
+    added <- unlist(strsplit(unlist(strsplit(comment, "(\\+|\\=)")), ", "))
+
+    # clean any leading, trailing whitespace
+    added <- gsub("^\\s+|\\s+$", "", added)
+
+    data.frame(user = rep(user, length(added)),
+               role = added,
+               action = rep("added", length(added)),
+               timestamp = rep(change.time, length(added)),
+               era = rep("ancient", length(added)),
+               stringsAsFactors = FALSE)
+  }
+
+  # if there are any ancients, we parse them
+  if (any(logevents$ancient)) {
+    ancients.parsed <- do.call("rbind",
+                               lapply(seq_len(nrow(ancients)),
+                                      parse.ancient, ancients))
+  } else {
+    ancients.parsed <- NULL
+  }
+
+  combined <- rbind(moderns.parsed, ancients.parsed)
+
+  ##########################################
+  ### Parsing current.admins file
+  #########################################
+
+  # split one current.admins row into one (user, role) row per group
+  parse.current.admins <- function (i, d) {
+    roles <- strsplit(as.character(d[i, "groups"]), ",")[[1]]
+    roles <- gsub("^\\s+|\\s+$", "", roles)
+
+    data.frame(user = rep(d[i, "username"], length(roles)),
+               role = roles,
+               stringsAsFactors = FALSE)
+  }
+
+  current.admins.parsed <- do.call("rbind",
+                                   lapply(seq_len(nrow(current.admins)),
+                                          parse.current.admins,
+                                          current.admins))
+
+  # select pre-ancients as people who have a given right *today* but
+  # were never seen as having it added
+  is.pre.ancient <- function (i, d) {
+    # look to see if we've seen any events with this user and role
+    # added: if we see none, this is pre-ancient
+    !any(combined$user == d[i, "user"] &
+         combined$role == d[i, "role"] &
+         combined$action == "added")
+  }
+
+  # create the list of pre-ancients (people role combinations we have
+  # not seen in the logevents data)
+  if (is.null(current.admins.parsed)) {
+    pre.ancients <- NULL
+  } else {
+    is.pre <- vapply(seq_len(nrow(current.admins.parsed)),
+                     is.pre.ancient, logical(1), current.admins.parsed)
+    pre.ancients <- current.admins.parsed[is.pre, ]
+  }
+
+  # make a list of people who have been removed
+  if (is.null(combined)) {
+    combined.removed <- NULL
+  } else {
+    removed.rows <- combined[combined$action == "removed", ]
+
+    # NOTE(review): a removal is kept when an earlier "added" event
+    # exists for the same user and role, or when the user is among the
+    # pre-ancients; the kept rows are then recorded as pre-ancient
+    # additions below.  Confirm that this is the intended condition.
+    was.added <- function (i, d) {
+      any(combined$user == d[i, "user"] &
+          combined$role == d[i, "role"] &
+          combined$action == "added" &
+          combined$timestamp <= d[i, "timestamp"]) |
+        (d[i, "user"] %in% pre.ancients$user)
+    }
+
+    keep <- vapply(seq_len(nrow(removed.rows)), was.added, logical(1),
+                   removed.rows)
+    combined.removed <- removed.rows[keep, c("user", "role")]
+  }
+
+  pre.ancients <- rbind(pre.ancients, combined.removed)
+
+  # give them a fixed early timestamp (the original idea, the earliest
+  # logged timestamp minus one day, is not used).  rep() keeps this
+  # working when there are no pre-ancients at all.
+  # NOTE(review): no tz is given, so the value depends on the session's
+  # time zone while the log timestamps are parsed as UTC.
+  if (!is.null(pre.ancients)) {
+    n.pre <- nrow(pre.ancients)
+    pre.ancients$action <- rep("added", n.pre)
+    pre.ancients$timestamp <- rep(as.POSIXct("2000-01-01 00:00:00"), n.pre)
+    pre.ancients$era <- rep("pre.ancient", n.pre)
+  }
+
+  # and then add the pre.ancients to the combined table
+  combined <- rbind(combined, pre.ancients)
+
+  # remove redundant actions
+  if (!is.null(combined)) {
+    combined <- combined[!duplicated(combined), ]
+  }
+
+  combined
+}
+
diff --git a/R/wikiq.R b/R/wikiq.R
new file mode 100644
index 0000000..76c9dcf
--- /dev/null
+++ b/R/wikiq.R
@@ -0,0 +1,119 @@
+# Community Data Science Collective R Utilities
+#
+# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
+# mako@atdot.cc, aaronshaw@northwestern.edu
+
+# loads simple utility functions for use in the subsequent files
+
+# store this for re-use across various scripts
+wikiq.header <- c("title", "articleid", "revid", "timestamp", "anon",
+                  "editor", "editor_id", "minor", "text_size",
+                  "text_entropy", "text_md5", "reversion",
+                  "additions_size", "deletions_size", "edits",
+                  "articles", "users")
+
+# helper function to load the TSV files our perl scripts are generating
+#   filename: file handed to read.delim()
+# Quoting is disabled, empty fields are read as NA and strings are
+# returned as factors.
+load.extracted.df <- function (filename) {
+  read.delim(filename, header = TRUE, quote = "", na.strings = "",
+             stringsAsFactors = TRUE)
+}
+
+# helper function to grab the classes of all columns of a dataframe
+# keep this because it's being used but this can just be lapply(d, class)
+get.col.classes <- function (d) {
+  sapply(colnames(d), function (col) { class(d[, col]) })
+}
+
+# convert mediawiki timestamps such as "2017-02-24T17:03:37Z" into
+# POSIXct values in UTC; strings that do not parse become NA
+timestamp.to.POSIXct <- function (ts.string) {
+  ts.string <- gsub("T", " ", ts.string)
+  ts.string <- gsub("Z", "", ts.string)
+  as.POSIXct(ts.string, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
+}
+
+
+# Read wikiq output (TSV) into a data.frame.
+#   con:            file name or connection handed to read.delim()
+#   header:         handed to read.delim()
+#   detect.reverts: if TRUE, sort by title and timestamp and add the
+#                   columns reverted and reverted.by; rows whose reverted
+#                   value is NA (the last row always is) are dropped
+# In every case underscores in column names become dots, date.time is
+# renamed to timestamp, reversion becomes a logical and ipaddress marks
+# the rows whose editor is the empty string.
+read.wikiq <- function (con, header=TRUE, detect.reverts=FALSE) {
+  d <- read.delim(con, stringsAsFactors = FALSE, header = header,
+                  encoding = "UTF-8", quote = "")
+
+  # rename date.time to timestamp and turn every _ into a dot
+  colnames(d)[colnames(d) == "date.time"] <- "timestamp"
+  colnames(d) <- gsub("_", ".", colnames(d), fixed = TRUE)
+
+  # NOTE(review): this pattern looks mis-encoded; it is reproduced
+  # unchanged because the intended one cannot be told from this file
+  d$timestamp <- as.POSIXct(sub("^(.*)y(.*)\xc8zy$", "\\1\\2",
+                                d$timestamp), tz = "UTC")
+
+  # convert reversion to a logical
+  d$reversion <- !is.na(d$reversion)
+
+  if (detect.reverts) {
+    # shift a vector by one position, padding with NA.  Unlike indexing
+    # with 2:length(x), these also work for zero or one rows.
+    shift.up <- function (x) { c(x[-1], NA)[seq_along(x)] }
+    shift.down <- function (x) { c(NA, x)[seq_along(x)] }
+
+    # reorder so we can now find the order and timestamp
+    d <- d[order(d$title, d$timestamp), ]
+
+    # an edit is reverted when the next edit is a reversion and the md5
+    # of the next edit equals the md5 of the previous one
+    d$reverted <- shift.up(d$reversion) &
+      (shift.up(d$text.md5) == shift.down(d$text.md5))
+
+    # drop the edits whose status is unknown, including the last edit
+    d <- d[!is.na(d$reverted), ]
+
+    # create a reverted by variable by shifting up the editors and
+    # then NAing nonreverts
+    d$reverted.by <- shift.up(d$editor)
+    d$reverted.by[!d$reverted] <- NA
+  }
+
+  # mark the rows without an editor name and give them the editor.id
+  # as name; which() ignores rows where editor is NA
+  d$ipaddress <- d$editor == ""
+  nameless <- which(d$ipaddress)
+  d$editor[nameless] <- d$editor.id[nameless]
+
+  d
+}
+
+# TODO refactor this so that we clean the data BEFORE we read it into R
+# ATM, this keeps only the lines with n.fields (default 14) fields
+
+# see the vereins wiki for "Philcomputing" and 29 lines that seem to
+# have a newline in the editor name
+#
+# Read a bzip2-compressed wikiq file through bzcat and awk.
+#   filename: path of the compressed file; "~" is expanded and the path
+#             is shell-quoted, so spaces in it are safe
+#   header, detect.reverts: handed to read.wikiq()
+#   n.fields: number of tab-separated fields a line needs to be kept
+read.bz.wikiq <- function (filename, header=TRUE, detect.reverts=FALSE,
+                           n.fields=14) {
+  n.fields <- as.integer(n.fields)
+  stopifnot(length(n.fields) == 1, !is.na(n.fields))
+
+  cmd <- paste0("bzcat ", shQuote(path.expand(filename)),
+                " | awk -F'\t' '{if (NF == ", n.fields, ") print;}'")
+
+  # read.delim() opens an unopened connection itself and closes it when
+  # it is done, so no cleanup is needed here
+  read.wikiq(pipe(cmd), header = header, detect.reverts = detect.reverts)
+}
+
diff --git a/RCommunityData.Rproj b/RCommunityData.Rproj
new file mode 100644
index 0000000..d848a9f
--- /dev/null
+++ b/RCommunityData.Rproj
@@ -0,0 +1,16 @@
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+Encoding: UTF-8
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace