1
0

Initial commit

new file:   runwikiq.sh
This commit is contained in:
2018-06-02 15:32:19 -07:00
commit 72633c193b
202 changed files with 21929 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$

3
RCommunityData/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData

View File

@@ -0,0 +1,9 @@
Package: RCommunityData
Title: Library of Functions Used in Communitydata Packages
Version: 0.1
Authors@R: person("Benjamin Mako", "Hill", email = "mako@atdot.cc", role = c("aut", "cre"))
Description: library of functions used in communitydata packages
Depends: R (>= 3.0)
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true

2
RCommunityData/NAMESPACE Normal file
View File

@@ -0,0 +1,2 @@
# Generated by roxygen2: fake comment so roxygen2 overwrites silently.
exportPattern("^[^\\.]")

17
RCommunityData/R/hhi.R Normal file
View File

@@ -0,0 +1,17 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu

## Functions to compute raw and normalized Herfindahl-Hirschman
## concentration indexes.

# Raw Herfindahl-Hirschman index: the sum of squared shares. `x` is a
# vector of counts or proportions; it is rescaled to proportions first.
hhi <- function (x) {
  shares <- x / sum(x)
  sum(shares^2)
}

# Normalized HHI, rescaled so that an even split across length(x)
# groups gives 0 and complete concentration gives 1.
hhi.norm <- function (x) {
  k <- length(x)
  (hhi(x) - 1/k) / (1 - 1/k)
}

View File

@@ -0,0 +1,24 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu

# Load an .RData file into the caller's frame, but only if the variable
# `var.name` is not already bound.
#
# var.name  -- character name of the variable the file is expected to
#              provide (also the name checked with exists()).
# file.name -- path handed straight to load().
#
# NOTE(review): exists(var.name) searches from this function's own
# environment outward (ending at the global env), not the caller's
# frame -- confirm this matches the intended semantics.
load.if.missing <- function (var.name, file.name) {
  if (!exists(var.name)) {
    # load() is given the caller's frame so the loaded objects appear
    # where the caller can see them
    load(file.name, parent.frame())

    # check to see if we're dealing with a data.table because, if we
    # are, we need to do some nasty back and forth
    if (class(eval(as.name(var.name)))[1] == "data.table") {
      # gnarly function that loads resorts things within the parent
      # frame to get around the bug in data.table: rebuild the
      # data.table from a plain data.frame, restoring the key recorded
      # in its "sorted" attribute, and assign it back into the caller's
      # frame
      assign(var.name,
             data.table(as.data.frame(eval(as.name(var.name))),
                        key=attr(eval(as.name(var.name)), "sorted")),
             parent.frame())
    }
  }
}

View File

@@ -0,0 +1,59 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu

## functions to deal with namespace information
#####################################################################

# Load the Wikia namespace lookup table.
#
# path -- TSV file with three header-less columns: wiki name, numeric
#         namespace, and namespace string. The default preserves the
#         previously hard-coded location, so existing callers are
#         unaffected.
#
# Returns a data.frame with columns wiki, ns.num (factor), ns.string.
load.wikia.namespaces <- function (path="~/data/wikia_namespaces.tsv") {
  # load namespace data
  wikia.namespaces <- read.delim(path,
                                 stringsAsFactors=TRUE, header=FALSE)
  colnames(wikia.namespaces) <- c("wiki", "ns.num", "ns.string")
  # namespace numbers are identifiers, not quantities, so keep them as
  # a factor
  wikia.namespaces$ns.num <- as.factor(wikia.namespaces$ns.num)
  return(wikia.namespaces)
}
# enwiki - move to barnstars directory
# TODO: TEST
#
# Build the named vector of English Wikipedia namespace numbers, keyed
# by namespace prefix string (e.g. "Talk" -> 1; "" is the article
# namespace).
#
# BUG FIX: the original ended with the names<- replacement, whose value
# is the character vector of names, so the function returned the names
# rather than the named numeric vector. It now returns the vector.
load.enwiki.namespaces <- function(){
  enwiki.ns.num <- c(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                     14, 15, 100, 101, 108, 109)
  names(enwiki.ns.num) <- c( "Media", "Special", "", "Talk", "User", "User talk",
                             "Wikipedia", "Wikipedia talk","File", "File talk",
                             "MediaWiki", "MediaWiki talk", "Template", "Template talk",
                             "Help", "Help talk", "Category", "Category talk",
                             "Portal", "Portal talk", "Book","Book talk")
  enwiki.ns.num
}
# function to take a list of article titles and a wiki name and return
# a list of numbered namespaces
#
# page.titles -- character vector of page titles
# wiki        -- single wiki name used to filter the lookup table
#
# Returns a factor of namespace numbers (as strings), with "0" for any
# title that carries no known namespace prefix.
titles.to.ns.num <- function (page.titles, wiki) {
  # load wikia namespace data from disk if it does not exist
  if (!exists("wikia.namespaces")) {
    wikia.namespaces <- load.wikia.namespaces()
  }

  # restrict the lookup table to the wiki we were asked about
  wiki.rows <- wikia.namespaces$wiki == wiki
  ns.df <- wikia.namespaces[wiki.rows, c("ns.num", "ns.string")]

  # build a namespace-string -> namespace-number lookup vector
  ns.lookup <- as.character(ns.df$ns.num)
  names(ns.lookup) <- ns.df$ns.string

  # drop the zero, we'll deal with it later
  ns.lookup <- ns.lookup[!ns.lookup == 0]

  # change underscores to spaces (necessary?)
  page.titles <- gsub('_', ' ', page.titles)

  # default every page to the main namespace, then overwrite titles
  # whose prefix matches a known namespace string
  page.ns <- rep("0", length(page.titles))
  for (ns.string in names(ns.lookup)) {
    prefix.re <- paste('^', ns.string, ':', sep="")
    page.ns[grepl(prefix.re, page.titles)] <- ns.lookup[ns.string]
  }

  # return the list of namespaces as a factor
  return(as.factor(page.ns))
}

View File

@@ -0,0 +1,184 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu

# privileges of interest:
# a shared variable that gets used everywhere

# Build a tidy history of admin-privilege additions and removals from a
# MediaWiki rights log plus a snapshot of current admins.
#
# logevents      -- data.frame of rights-log rows with (at least)
#                   columns title, comment, timestamp, ancient,
#                   rights.new, rights.old. "ancient" rows are the old
#                   log format where the change is embedded in the
#                   comment string.
# current.admins -- data.frame with columns username and groups (a
#                   comma-separated string of current roles).
#
# Returns a data.frame with columns user, role, action
# ("added"/"removed"), timestamp, and era ("modern", "ancient", or
# "pre.ancient" for rights that predate the visible log).
#
# NOTE(review): relies on timestamp.to.POSIXct() defined elsewhere in
# this package.
generate.admin.addrm <- function (logevents, current.admins) {

  # convert types of a few variables
  logevents$ancient <- logevents$ancient == "true"
  logevents$timestamp <- timestamp.to.POSIXct(logevents$timestamp)
  logevents$rights.new[is.na(logevents$rights.new)] <- ""
  logevents$rights.old[is.na(logevents$rights.old)] <- ""

  # TODO do wikia wikis have these =?
  # in WP, all of these are negated by one day
  logevents <- logevents[!(logevents$ancient & logevents$comment == "="),]

  ##########################################
  ### Parsing logevents file
  #########################################

  # separate out moderns & ancients and the necessary columns
  ancients <- logevents[logevents$ancient,c("title","comment","timestamp")]
  moderns <- logevents[!logevents$ancient,
                       c("title","rights.new","rights.old","timestamp")]

  # function that looks at rights.old, rights.new and returns a value of
  # privilege, add/remove, and timestamp for each user
  parse.moderns <- function (i, d) {
    user <- sub('^User:', "", d[i,"title"])
    change.time <- d[i,"timestamp"]
    rights.new <- d[i,"rights.new"]
    rights.old <- d[i,"rights.old"]

    # create a vector of new and old rights:
    destring <- function (x) { strsplit(as.character(x), ", ")[[1]] }

    # create a list of privileges that are mentioned
    privileges <- unique(c(destring(rights.new),
                           destring(rights.old)))

    # create T/F vectors indicating which privileges were added/removed
    added <- privileges[privileges %in% destring(rights.new) &
                        !(privileges %in% destring(rights.old))]
    removed <- privileges[!(privileges %in% destring(rights.new)) &
                          privileges %in% destring(rights.old)]

    # assemble the data frame of: role,action,user,timestamp
    data.frame(user=rep(user, length(c(added,removed))),
               role=c(added, removed),
               action=c(rep("added",length(added)),
                        rep("removed", length(removed))),
               timestamp=rep(change.time, length(c(added,removed))),
               era=rep("modern", length(c(added,removed))),
               stringsAsFactors=FALSE)
  }

  # if there are log events, and there are non-ancients (not all are
  # ancients), we parse them
  if (dim(logevents)[1] & !all(logevents$ancient)) {
    moderns.parsed <- do.call("rbind",
                              lapply(1:dim(moderns)[1], parse.moderns, moderns))
  } else {
    moderns.parsed = NULL
  }

  # another function to handle processing the ancients: in the old log
  # format the granted rights live in the comment string after a "+" or
  # "=" separator
  parse.ancient <- function (i, d) {
    user <- sub('^.*?:', '', d[i,"title"])
    comment <- d[i, "comment"]
    change.time <- d[i, "timestamp"]

    added <- unlist(strsplit(unlist(strsplit(comment, '(\\+|\\=)')), ', '))
    # clean any leadin, trailing whitespace
    added <- gsub("^\\s+|\\s+$", "", added)

    data.frame(user=user,
               role=added,
               action="added",
               timestamp=change.time,
               era="ancient",
               stringsAsFactors=FALSE)
  }

  # if there are any ancients, we parse them
  if (any(logevents$ancient)) {
    ancients.parsed <- do.call("rbind",
                               lapply(1:dim(ancients)[1], parse.ancient, ancients))
  } else {
    ancients.parsed = NULL
  }

  combined <- rbind(moderns.parsed, ancients.parsed)

  ##########################################
  ### Parsing current.admins file
  #########################################

  # turn each of the columns after the first two into logical
  # function to process pre.ancients
  parse.current.admins <- function (i, d) {
    user <- d[i, "username"]
    roles <- gsub("^\\s+|\\s+$", "", strsplit(d[i, "groups"], ",")[[1]])
    o <- data.frame(user=user, role=roles, stringsAsFactors=FALSE)
    colnames(o) <- c("user", "role")
    return(o)
  }

  ## handle the case where there are no admins. This can happen on Wikipedia
  if(dim(current.admins)[1] != 0){
    current.admins.parsed <- do.call("rbind",
                                     lapply(1:dim(current.admins)[1],
                                            parse.current.admins, current.admins))
  }
  else{
    current.admins.parsed <- NULL
  }

  # select pre-ancients as people who have a given right *today* but
  # were never seen as having it added
  is.pre.ancients <- function (i, d, combined) {
    user <- d[i, "user"]
    role <- d[i, "role"]

    # look to see if we've see any events with this user and role added:
    # if we see none, this is pre-ancient
    !any(combined$user == user &
         combined$role == role &
         combined$action == "added")
  }

  if(!is.null(current.admins.parsed)){
    # create the list of pre-ancients (people role combinations we have
    # not seen in the logevents data
    pre.ancients <- current.admins.parsed[sapply(1:dim(current.admins.parsed)[1],
                                                 is.pre.ancients,
                                                 current.admins.parsed,
                                                 combined),]
  }
  else{
    pre.ancients <- NULL
  }

  # make a list of people who have been removed
  combined.removed <- combined[combined$action == "removed",]

  # keep only removals that follow a visible addition or that belong to
  # a pre-ancient admin; anything else has no matching grant
  # NOTE(review): when combined.removed has zero rows it keeps all five
  # columns, while pre.ancients has only user/role -- confirm the rbind
  # below cannot be reached in that mixed state.
  if (!is.null(combined.removed)) {
    if (dim(combined.removed)[1] > 0) {
      combined.removed <- combined.removed[sapply(1:dim(combined.removed)[1],
        function (i,d) {
          user <- d[i,"user"]
          role <- d[i,"role"]
          timestamp <- d[i,"timestamp"]
          # was the person added before they were removed? OR in the pre-ancients
          any(combined$user == user &
              combined$role == role &
              combined$action == "added" &
              combined$timestamp <= timestamp) | (user %in% pre.ancients$user)
        }, combined.removed),c("user", "role")]
    }
  }

  pre.ancients <- rbind(pre.ancients, combined.removed)

  # give them the earliest ancient timestamp minus 1 day
  # and then add the pre.ancients to the
  if(!is.null(pre.ancients)){
    pre.ancients$action <- "added"
    pre.ancients$timestamp <- as.POSIXct("2000-01-01 00:00:00") # min(combined$timestamp) - 60 * 1440
    pre.ancients$era <- "pre.ancient"

    combined <- rbind(combined, pre.ancients)
  }

  # remove redundant actions
  combined <- combined[!duplicated(combined),]
  return(combined)
}

86
RCommunityData/R/wikiq.R Normal file
View File

@@ -0,0 +1,86 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu

# loads simple utility functions for use in the subsequent files

# Column names of the TSV output produced by wikiq; stored once here
# for re-use across various scripts.
wikiq.header <- c(
  "title", "articleid", "revid", "timestamp",
  "anon", "editor", "editor_id", "minor",
  "text_size", "text_entropy", "text_md5", "reversion",
  "additions_size", "deletions_size", "edits",
  "articles", "users"
)
# helper function to load the TSV files our perl scripts are generating
#
# filename -- path to a tab-separated file with a header row.
#
# Returns a data.frame; strings become factors, empty fields become NA,
# and quoting is disabled because fields may contain quote characters.
load.extracted.df <- function (filename) {
  # header=TRUE spelled out: T is a reassignable binding, not a keyword
  read.delim(filename, header=TRUE, quote="", na.strings="",
             stringsAsFactors=TRUE)
}
# helper function to grab the classes of all columns of a dataframe
# keep this because it's being used but this can just be lapply(d, class)
get.col.classes <- function (d) {
  column.names <- colnames(d)
  sapply(column.names, function (column) class(d[, column]))
}
# convert mediawiki timestamps into POSIXct
#
# ts.string -- character vector of "YYYY-MM-DDTHH:MM:SSZ" stamps
#
# Returns a POSIXct vector in UTC.
timestamp.to.POSIXct <- function (ts.string) {
  # strip the ISO-8601 "T" separator and the trailing "Z" marker
  cleaned <- gsub("Z", "", gsub("T", " ", ts.string))
  as.POSIXct(cleaned, format="%Y-%m-%d %H:%M:%S", tz="UTC")
}
# Read a wikiq TSV from a filename or connection into a data.frame.
#
# con            -- filename or connection passed to read.delim().
# header         -- does the input carry a header row?
# detect.reverts -- if TRUE, add reverted/reverted.by columns by
#                   comparing each revision's neighboring md5 sums.
#
# Returns a data.frame with one row per revision.
read.wikiq <- function (con, header=TRUE, detect.reverts=FALSE) {
  d <- read.delim(con, stringsAsFactors=FALSE, header=header,
                  encoding="UTF-8", quote="")

  # rename date.time to timestamp and remove _
  colnames(d)[colnames(d) == "date.time"] <- "timestamp"
  colnames(d) <- sub("_", ".", colnames(d))

  # NOTE(review): this pattern contains a raw "\xc8" byte and looks
  # encoding-damaged -- presumably it strips stray characters around
  # the timestamp; confirm against the wikiq output format before
  # touching it.
  d$timestamp <- as.POSIXct(sub("^(.*)y(.*)\xc8zy$", "\\1\\2",
                                d$timestamp), tz="UTC")

  # convert reversion to a logical
  d$reversion <- !is.na(d$reversion)

  if (detect.reverts) {
    # reorder so we can now find the order and timestamp
    d <- d[order(d$title, d$timestamp),]

    # generate a list of reverted editors and a list of previous and
    # next md5: an edit counts as reverted when the following revision
    # is a reversion and the md5 after it matches the md5 before it
    d$reverted <- c(d$reversion[2:length(d$reversion)],NA)
    d$md5.next <- c(d$text.md5[2:length(d$reversion)],NA)
    d$md5.prev <- c(NA,d$text.md5[1:(length(d$reversion)-1)])
    d$reverted <- d$reverted & (d$md5.next == d$md5.prev)

    # drop the extra columns and the last edit
    d <- d[!is.na(d$reverted),]
    d <- d[,!colnames(d) %in% c("md5.next", "md5.prev")]

    # create a reverted by variable by shifting up the editors and
    # then NAing nonreverts
    d$reverted.by <- c(d$editor[2:length(d$reversion)], NA)
    d$reverted.by[!d$reverted] <- NA
  }

  # set ip address to the username and create a new variable
  d$ipaddress <- d$editor == ""
  d$editor[d$editor == ""] <- d$editor.id[d$editor == ""]

  # delete the connection
  return(d)
}
# TODO refactor this so that we clean the data BEFORE we read it into R
# ATM, this is set to only work on 14 item issues
# see the vereins wiki for "Philcomputing" and 29 lines that seem to
# have a newline in the editor name

# Read a bzip2-compressed wikiq TSV, dropping malformed rows.
#
# filename -- path to a .bz2 wikiq TSV.
# header, detect.reverts -- passed through to read.wikiq().
#
# The awk filter keeps only rows with exactly 14 tab-separated fields,
# which discards records broken by embedded newlines.
read.bz.wikiq <- function (filename, header=TRUE, detect.reverts=FALSE) {
  con <- pipe(paste("bzcat", filename, "|awk -F'\t' '{if (NF == 14) print;}'"))
  d <- read.wikiq(con, header=header, detect.reverts=detect.reverts)
  # NOTE(review): rm() only removes the local binding; read.delim()
  # inside read.wikiq() opens and closes the connection itself, so this
  # is presumably just tidying -- confirm there is no leak on error.
  rm(con)
  return(d)
}

View File

@@ -0,0 +1,16 @@
Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
Encoding: UTF-8
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace