added r data module
This commit is contained in:
17
R/hhi.R
Normal file
17
R/hhi.R
Normal file
@@ -0,0 +1,17 @@
|
||||
# Community Data Science Collective R Utilities
|
||||
#
|
||||
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
|
||||
# mako@atdot.cc, aaronshaw@northwestern.edu
|
||||
|
||||
## functions to create normalized and non-normalized Herfindahl indexes
|
||||
## Herfindahl-Hirschman index: sum of squared shares of x.
## Returns a value in (0, 1]; 1 means full concentration in one group.
hhi <- function (x) {
    shares <- x / sum(x)
    sum(shares^2)
}
|
||||
|
||||
## Normalized HHI: rescales the raw index so that an even split over
## length(x) groups maps to 0 and full concentration maps to 1.
hhi.norm <- function (x) {
    num.groups <- length(x)
    raw.index <- hhi(x)
    (raw.index - 1 / num.groups) / (1 - 1 / num.groups)
}
|
||||
|
||||
24
R/load_if_missing.R
Normal file
24
R/load_if_missing.R
Normal file
@@ -0,0 +1,24 @@
|
||||
# Community Data Science Collective R Utilities
|
||||
#
|
||||
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
|
||||
# mako@atdot.cc, aaronshaw@northwestern.edu
|
||||
|
||||
# load a file if a variable is missing
|
||||
# Load an .RData file into the caller's frame, but only when no object
# named `var.name` is already visible.
#
#   var.name  -- character scalar: name of the object expected in the file
#   file.name -- path to the .RData/.rda file that defines it
#
# NOTE(review): exists(var.name) searches this function's environment and
# its enclosures (typically the global env), while load() targets
# parent.frame(); the two can disagree when the caller is not the global
# environment -- confirm intended usage.
load.if.missing <- function (var.name, file.name) {
    if (!exists(var.name)) {
        load(file.name, parent.frame())

        # check to see if we're dealing with a data.table because, if we
        # are, we need to do some nasty back and forth
        # NOTE(review): eval(as.name(var.name)) resolves the name through
        # the normal lexical search, not explicitly in parent.frame().
        if (class(eval(as.name(var.name)))[1] == "data.table") {

            # gnarly workaround: rebuild the data.table from a plain data
            # frame, restoring its sort key from the "sorted" attribute,
            # and re-assign it in the parent frame -- works around a bug
            # in data.table objects that have been load()ed from disk.
            assign(var.name,
                   data.table(as.data.frame(eval(as.name(var.name))),
                              key=attr(eval(as.name(var.name)), "sorted")),
                   parent.frame())
        }
    }
}
|
||||
|
||||
59
R/namespaces.R
Normal file
59
R/namespaces.R
Normal file
@@ -0,0 +1,59 @@
|
||||
# Community Data Science Collective R Utilities
|
||||
#
|
||||
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
|
||||
# mako@atdot.cc, aaronshaw@northwestern.edu
|
||||
|
||||
## functions to deal with namespace information
|
||||
#####################################################################
|
||||
## Read the wikia namespace table from its fixed location on disk.
## Returns a data frame with columns: wiki, ns.num (factor), ns.string.
load.wikia.namespaces <- function () {
    # headerless three-column TSV: wiki name, namespace number, prefix string
    ns.data <- read.delim("~/data/wikia_namespaces.tsv",
                          stringsAsFactors=TRUE, header=FALSE)

    colnames(ns.data) <- c("wiki", "ns.num", "ns.string")
    # namespace numbers are identifiers, not quantities: treat as a factor
    ns.data$ns.num <- as.factor(ns.data$ns.num)
    return(ns.data)
}
|
||||
|
||||
# enwiki - move to barnstars directory
|
||||
# TODO: TEST
|
||||
load.enwiki.namespaces <- function(){
    # Return the English Wikipedia namespace mapping as a named numeric
    # vector: names are namespace prefixes (e.g. "Talk", "User talk"),
    # values are the numeric namespace codes.
    enwiki.ns.num <- c(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                       14, 15, 100, 101, 108, 109)

    names(enwiki.ns.num) <- c( "Media", "Special", "", "Talk", "User", "User talk",
                               "Wikipedia", "Wikipedia talk","File", "File talk",
                               "MediaWiki", "MediaWiki talk", "Template", "Template talk",
                               "Help", "Help talk", "Category", "Category talk",
                               "Portal", "Portal talk", "Book","Book talk")

    # BUG FIX: previously the function's last expression was the names<-
    # assignment, which returns the character vector of names (invisibly),
    # so callers never received the namespace mapping. Return it explicitly.
    return(enwiki.ns.num)
}
|
||||
|
||||
# function to take a list of article titles and a wiki name and return
|
||||
# a list of numbered namespaces
|
||||
# Map a vector of page titles from one wiki to their numeric namespaces.
#
#   page.titles -- character vector of page titles (may use "_" or " ")
#   wiki        -- wiki identifier matching wikia.namespaces$wiki
#
# Returns a factor of namespace numbers (as strings); titles with no
# recognized "Prefix:" get the main namespace, "0".
titles.to.ns.num <- function (page.titles, wiki) {
    # load wikia namespace data from disk if it does not exist
    # NOTE(review): exists() finds a global `wikia.namespaces` if one was
    # loaded previously; otherwise the table is (re)read locally per call.
    if (!exists("wikia.namespaces")) {
        wikia.namespaces <- load.wikia.namespaces()
    }

    # page.titles <- d$title # DEBUG
    # restrict the namespace table to this wiki's rows
    ns.df <- wikia.namespaces[wikia.namespaces$wiki == wiki,
                              c("ns.num", "ns.string")]

    # build a lookup vector: names are prefix strings, values are ns numbers
    namespaces <- as.character(ns.df$ns.num)
    names(namespaces) <- ns.df$ns.string

    # drop the zero (main) namespace; unmatched titles default to "0" below
    namespaces <- namespaces [!namespaces == 0]

    # change underscores to spaces (necessary?)
    page.titles <- gsub('_', ' ', page.titles)
    page.ns <- rep("0", length(page.titles))

    # tag every title that starts with "<prefix>:" with that namespace number
    # NOTE(review): the prefix is interpolated into the regex unescaped;
    # prefixes containing regex metacharacters would misbehave -- confirm
    # namespace strings are plain words.
    for (ns in names(namespaces)) {
        page.ns[grepl(paste('^', ns, ':', sep=""), page.titles)] <- namespaces[ns]
    }

    # return the list of namespaces as a factor
    return(as.factor(page.ns))
}
|
||||
170
R/wikia_admin.R
Normal file
170
R/wikia_admin.R
Normal file
@@ -0,0 +1,170 @@
|
||||
# Community Data Science Collective R Utilities
|
||||
#
|
||||
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
|
||||
# mako@atdot.cc, aaronshaw@northwestern.edu
|
||||
|
||||
# privileges of interest:
|
||||
# a shared variable that gets used everywhere
|
||||
# Build a unified history of admin-privilege additions/removals.
#
#   logevents      -- data frame of rights log events with columns:
#                     title, comment, timestamp, ancient, rights.new,
#                     rights.old (timestamps as mediawiki strings)
#   current.admins -- data frame with columns username and groups
#                     (groups is a comma-separated string of roles)
#
# Returns a data frame with columns user, role, action ("added"/"removed"),
# timestamp, era ("modern"/"ancient"/"pre.ancient"), deduplicated.
generate.admin.addrm <- function (logevents, current.admins) {

    # convert types of a few variables
    logevents$ancient <- logevents$ancient == "true"
    logevents$timestamp <- timestamp.to.POSIXct(logevents$timestamp)
    logevents$rights.new[is.na(logevents$rights.new)] <- ""
    logevents$rights.old[is.na(logevents$rights.old)] <- ""

    # TODO do wikia wikis have these =?
    # in WP, all of these are negated by one day
    # drop ancient events whose comment is just "="
    logevents <- logevents[!(logevents$ancient & logevents$comment == "="),]

    ##########################################
    ### Parsing logevents file
    #########################################

    # separate out moderns & ancients and the necessary columns
    ancients <- logevents[logevents$ancient,c("title","comment","timestamp")]
    moderns <- logevents[!logevents$ancient,
                         c("title","rights.new","rights.old","timestamp")]

    # function that looks at rights.old, rights.new and returns a value of
    # privilege, add/remove, and timestamp for each user (one row per
    # privilege that changed in event i of data frame d)
    parse.moderns <- function (i, d) {
        user <- sub('^User:', "", d[i,"title"])
        change.time <- d[i,"timestamp"]
        rights.new <- d[i,"rights.new"]
        rights.old <- d[i,"rights.old"]

        # split a comma-separated rights string into a character vector
        destring <- function (x) { strsplit(as.character(x), ", ")[[1]] }

        # create a list of privileges that are mentioned in either state
        privileges <- unique(c(destring(rights.new),
                               destring(rights.old)))

        # privileges present only in the new state were added;
        # those present only in the old state were removed
        added <- privileges[privileges %in% destring(rights.new) &
                            !(privileges %in% destring(rights.old))]
        removed <- privileges[!(privileges %in% destring(rights.new)) &
                              privileges %in% destring(rights.old)]

        # assemble the data frame of: role,action,user,timestamp
        data.frame(user=rep(user, length(c(added,removed))),
                   role=c(added, removed),
                   action=c(rep("added",length(added)),
                            rep("removed", length(removed))),
                   timestamp=rep(change.time, length(c(added,removed))),
                   era=rep("modern", length(c(added,removed))),
                   stringsAsFactors=FALSE)
    }

    # if there are log events, and there are non-ancients, we parse them
    # NOTE(review): `!any(logevents$ancient)` means "NO ancient events at
    # all" rather than "some non-ancient events exist"; when a wiki has
    # both ancient and modern events, the moderns are silently skipped.
    # The comment suggests `any(!logevents$ancient)` was intended -- verify.
    # (Scalar `&` also works here only because both sides are length 1.)
    if (dim(logevents)[1] & !any(logevents$ancient)) {
        moderns.parsed <- do.call("rbind",
                                  lapply(1:dim(moderns)[1], parse.moderns, moderns))
    } else {
        moderns.parsed = NULL
    }

    # another function to handle processing the ancients: ancient events
    # encode the granted rights inside the free-text comment field
    parse.ancient <- function (i, d) {
        user <- sub('^.*?:', '', d[i,"title"])
        comment <- d[i, "comment"]
        change.time <- d[i, "timestamp"]

        # split on "+"/"=" separators, then on commas, to get role names
        added <- unlist(strsplit(unlist(strsplit(comment, '(\\+|\\=)')), ', '))

        # clean any leading/trailing whitespace
        added <- gsub("^\\s+|\\s+$", "", added)

        # ancient log entries only record additions, never removals
        data.frame(user=user,
                   role=added,
                   action="added",
                   timestamp=change.time,
                   era="ancient",
                   stringsAsFactors=FALSE)
    }

    # if there are any ancients, we parse them
    if (any(logevents$ancient)) {
        ancients.parsed <- do.call("rbind",
                                   lapply(1:dim(ancients)[1], parse.ancient, ancients))
    } else {
        ancients.parsed = NULL
    }

    # rbind() ignores NULL components, so this works when either side is empty
    combined <- rbind(moderns.parsed, ancients.parsed)

    ##########################################
    ### Parsing current.admins file
    #########################################
    # turn each of the columns after the first two into logical

    # expand one current.admins row into one (user, role) row per group
    parse.current.admins <- function (i, d) {
        user <- d[i, "username"]
        roles <- gsub("^\\s+|\\s+$", "", strsplit(d[i, "groups"], ",")[[1]])

        o <- data.frame(user=user, role=roles, stringsAsFactors=FALSE)
        colnames(o) <- c("user", "role")
        return(o)
    }

    current.admins.parsed <- do.call("rbind",
                                     lapply(1:dim(current.admins)[1],
                                            parse.current.admins, current.admins))

    # select pre-ancients as people who have a given right *today* but
    # were never seen as having it added
    is.pre.ancients <- function (i, d, combined) {
        user <- d[i, "user"]
        role <- d[i, "role"]

        # look to see if we've seen any events with this user and role added:
        # if we see none, this is pre-ancient
        !any(combined$user == user &
             combined$role == role &
             combined$action == "added")

    }

    # create the list of pre-ancients (people/role combinations we have
    # not seen in the logevents data)
    pre.ancients <- current.admins.parsed[sapply(1:dim(current.admins.parsed)[1],
                                                 is.pre.ancients,
                                                 current.admins.parsed,
                                                 combined),]

    # make a list of people who have been removed; keep only removals that
    # are consistent (added earlier, or user is already pre-ancient)
    combined.removed <- combined[combined$action == "removed",]
    if (!is.null(combined.removed)) {
        if (dim(combined.removed)[1] > 0) {
            combined.removed <- combined.removed[sapply(1:dim(combined.removed)[1],
                function (i,d) {
                    user <- d[i,"user"]
                    role <- d[i,"role"]
                    timestamp <- d[i,"timestamp"]

                    # was the person added before they were removed? OR in the pre-ancients
                    any(combined$user == user &
                        combined$role == role &
                        combined$action == "added" &
                        combined$timestamp <= timestamp) | (user %in% pre.ancients$user)
                }, combined.removed),c("user", "role")]
        }
    }

    pre.ancients <- rbind(pre.ancients, combined.removed)

    # give pre-ancients a synthetic timestamp well before any real event
    # (originally: earliest ancient timestamp minus 1 day)
    pre.ancients$action <- "added"
    pre.ancients$timestamp <- as.POSIXct("2000-01-01 00:00:00") # min(combined$timestamp) - 60 * 1440
    pre.ancients$era <- "pre.ancient"

    # and then add the pre.ancients to the combined event list
    combined <- rbind(combined, pre.ancients)

    # remove redundant actions
    combined <- combined[!duplicated(combined),]
    return(combined)
}
|
||||
|
||||
86
R/wikiq.R
Normal file
86
R/wikiq.R
Normal file
@@ -0,0 +1,86 @@
|
||||
# Community Data Science Collective R Utilities
|
||||
#
|
||||
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
|
||||
# mako@atdot.cc, aaronshaw@northwestern.edu
|
||||
|
||||
# loads simple utility functions for use in the subsequent files
|
||||
|
||||
# store this for re-use across various scripts
|
||||
# Column names produced by the wikiq TSV output, shared across scripts.
wikiq.header <- c("title", "articleid", "revid", "timestamp", "anon",
                  "editor", "editor_id", "minor", "text_size", "text_entropy",
                  "text_md5", "reversion", "additions_size", "deletions_size",
                  "edits", "articles", "users")
||||
|
||||
# helper function to load the TSV files our perl scripts are generating
|
||||
# Read one of the TSV files produced by our perl extraction scripts:
# tab-delimited, header row, no quoting, empty fields are NA, and
# string columns become factors.
load.extracted.df <- function (filename) {
    read.delim(filename,
               header=TRUE,
               quote="",
               na.strings="",
               stringsAsFactors=TRUE)
}
|
||||
|
||||
# helper function to grab the classes of all columns of a dataframe
|
||||
# keep this because it's being used but this can just be lapply(d, class)
|
||||
# Return a mapping from each column name of d to that column's class.
# (Kept because callers depend on it; for plain data frames this is
# essentially lapply(d, class).)
get.col.classes <- function (d) {
    sapply(colnames(d), function (column.name) { class(d[,column.name]) })
}
|
||||
|
||||
# convert mediawiki timestamps into POSIXct
|
||||
# Parse mediawiki timestamps ("2001-01-15T14:56:00Z") into POSIXct (UTC).
timestamp.to.POSIXct <- function (ts.string) {
    # drop the "T" separator and trailing "Z" before parsing
    cleaned <- gsub("Z", "", gsub("T", " ", ts.string))
    as.POSIXct(cleaned, format="%Y-%m-%d %H:%M:%S", tz="UTC")
}
|
||||
|
||||
|
||||
# Read a wikiq TSV from a path or connection into a cleaned data frame.
#
#   con            -- filename or connection readable by read.delim
#   header         -- does the input have a header row?
#   detect.reverts -- if TRUE, also compute reverted / reverted.by columns
#                     by matching each revision's md5 against its neighbors
#
# Returns a data frame with timestamp as POSIXct (UTC), reversion as a
# logical, and editor normalized so IP edits carry the address in `editor`
# (with `ipaddress` marking them).
read.wikiq <- function (con, header=TRUE, detect.reverts=FALSE) {
    d <- read.delim(con, stringsAsFactors=FALSE, header=header,
                    encoding="UTF-8", quote="")

    # rename date.time to timestamp and remove _
    colnames(d)[colnames(d) == "date.time"] <- "timestamp"
    colnames(d) <- sub("_", ".", colnames(d))

    # NOTE(review): this regex strips a literal "y...\xc8zy" wrapper from
    # the timestamp strings -- presumably an artifact of the upstream
    # export's encoding; confirm against actual wikiq output.
    d$timestamp <- as.POSIXct(sub("^(.*)y(.*)\xc8zy$", "\\1\\2",
                                  d$timestamp), tz="UTC")

    # convert reversion to a logical (non-NA means the edit was a revert)
    d$reversion <- !is.na(d$reversion)

    if (detect.reverts) {
        # reorder by article and time so neighboring rows are consecutive
        # revisions of the same page
        d <- d[order(d$title, d$timestamp),]

        # a row is "reverted" when the NEXT edit is a reversion and the
        # md5 after that next edit equals the md5 before this one
        # (i.e. the next edit restored the previous text)
        d$reverted <- c(d$reversion[2:length(d$reversion)],NA)
        d$md5.next <- c(d$text.md5[2:length(d$reversion)],NA)
        d$md5.prev <- c(NA,d$text.md5[1:(length(d$reversion)-1)])
        d$reverted <- d$reverted & (d$md5.next == d$md5.prev)

        # drop the helper columns and the last edit (which has no "next")
        # NOTE(review): rows at title boundaries compare md5s across
        # different articles -- confirm this is acceptable.
        d <- d[!is.na(d$reverted),]
        d <- d[,!colnames(d) %in% c("md5.next", "md5.prev")]

        # create a reverted.by variable by shifting up the editors and
        # then NA-ing non-reverts
        d$reverted.by <- c(d$editor[2:length(d$reversion)], NA)
        d$reverted.by[!d$reverted] <- NA
    }
    # anonymous edits have an empty editor name; flag them and copy the
    # IP address (stored in editor.id) into the editor column
    d$ipaddress <- d$editor == ""
    d$editor[d$editor == ""] <- d$editor.id[d$editor == ""]

    # return the cleaned data frame
    return(d)
}
|
||||
|
||||
# TODO refactor this so that we clean the data BEFORE we read it into R
|
||||
# ATM, this is set to only work on 14 item issues
|
||||
|
||||
# see the vereins wiki for "Philcomputing" and 29 lines that seem to
|
||||
# have a newline in the editor name
|
||||
# Read a bzip2-compressed wikiq TSV. The awk filter keeps only rows with
# exactly 14 tab-separated fields, dropping malformed lines (e.g. editor
# names containing embedded newlines -- see "Philcomputing" on the
# vereins wiki). TODO: refactor so the data is cleaned before reaching R.
read.bz.wikiq <- function (filename, header=TRUE, detect.reverts=FALSE) {
    command <- paste("bzcat", filename, "|awk -F'\t' '{if (NF == 14) print;}'")
    con <- pipe(command)
    result <- read.wikiq(con, header=header, detect.reverts=detect.reverts)
    rm(con)
    return(result)
}
|
||||
|
||||
Reference in New Issue
Block a user