Initial commit

# new file:   runwikiq.sh

commit 72633c193b
2018-06-02 15:32:19 -07:00
202 changed files with 21929 additions and 0 deletions

28
00_count_editors.R Executable file

@@ -0,0 +1,28 @@
#!/usr/bin/env Rscript
## script that saves the number of unique editors to a wiki. For use with parallelsql
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
source("lib-00-utils.R")
opt <- commandArgs(trailingOnly = TRUE)
input.dir <- "../wikiq_wikia_2010_all_nopersistance/"
output.dir <- "../wikiq_wikia_2010_unique_editors/"
d <- load.wikiq.file(paste0(input.dir,opt[1]))
n.editors <- length(unique(d[anon == FALSE & namespace == 0, editor.id]))
write(n.editors,file=paste0(output.dir,gsub("\\.tsv$",'.editors',opt[1])),append=FALSE)
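## Illustrative invocation (not part of the original file), assuming a wikiq
## tsv filename is passed as the first argument:
##   Rscript 00_count_editors.R somewiki.tsv
## writes the count to ../wikiq_wikia_2010_unique_editors/somewiki.editors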

44
00_select_wikis.R Executable file

@@ -0,0 +1,44 @@
#!/usr/bin/env Rscript
## Script used to choose the top 1% of wikis to analyze
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library("ggplot2")
library("data.table")
counts.dir <- "../wikiq_wikia_2010_unique_editors/"
files <- list.files(counts.dir)
read.count.file <- function(f){
return(read.csv(paste0(counts.dir,f),header=FALSE))
}
dbname <- gsub("\\.editors",'',files)
counts <- c(sapply(files,read.count.file))
counts <- unlist(counts,use.names=FALSE)
dt <- data.table(wiki=dbname,n.editors=counts)
#ggplot(dt,aes(x=n.editors)) + stat_ecdf(geom="step") + scale_x_log10(minor_breaks=10**(1:10/2)) + scale_y_continuous(minor_breaks=1:20/20)
top_1_percentile = quantile(x=dt$n.editors,probs=(1:99)/100)[99]
## let's take all with > 100. This is very close to the top 1%, but it involves nice round numbers :)
wiki.list <- dt[n.editors >= top_1_percentile]
wiki.list[is.na(url),':='(url=paste0("http://",wiki,".wikia.com/"))]
wiki.list$wiki.type="wikia"
fwrite(wiki.list,"selected.wikis.csv")

77
01_build_datasets.R Executable file

@@ -0,0 +1,77 @@
#!/usr/bin/env Rscript
# Top level script for building datasets.
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(data.table)
library(parallel)
plot.distribution <- function(data,varname,save=TRUE){
x = data[[varname]];
print(paste("plotting distribution for",varname))
if(save){
pdf(paste0("plots/",varname,".distribution.pdf"))
}
## overlay histogram, empirical density and normal density
if(class(x) == "logical"){
p0 <- qplot(x)
}
else{
p0 = qplot(x, geom = 'blank') +
geom_line(aes(y = ..density.., colour = 'Empirical'), stat = 'density') +
geom_histogram(aes(y = ..density..), alpha = 0.4,bins=100) +
scale_colour_manual(name = 'Density', values = c('red', 'blue')) +
theme(legend.position = c(0.85, 0.85))
}
print(p0)
if(save){
dev.off()
}
}
if(!exists("wiki.list")){
source("lib-00-utils.R",echo=TRUE)
}
if(!exists("bots") | !exists("admins")){
if(file.exists("bots.RDS") & file.exists("admins.RDS")){
bots = readRDS("bots.RDS")
admins = readRDS("admins.RDS")
}
else {
source("lib-01-generate_userroles.R",echo=TRUE)
}
}
if(!exists("newcomer.dt")){
intermediate.files <- list("newcomers.RDS","wikiweeks.RDS","wiki.stats.RDS","active.editors.RDS")
if(! all(sapply(intermediate.files,function (x) file.exists(x)))){
source("lib-01-build_newcomer_table.R",echo=TRUE)
}
}
plot.distributions = FALSE
if(plot.distributions == TRUE){
library(ggplot2)
## plot distributions for model 1
outcome1 <- c("survives")
predictors1 <- c("is.reverted","is.messaged","is.bot.reverted","is.reverted.messaged","is.admin.reverted","BRD.initiation","BRD.reciprocation")
controls1 <- c("ns0.edits","ns1.edits","ns4.edits","n.other.wikis","week","has.edited.other.wikis","n.edits.other","n.messages","n.editors","total.wiki.length","revert.rate","revert.disc.rate","newcomer.revert.disc.rate","revert.message.rate","newcomer.revert.message.rate","newcomer.edits.rate","bot.revert.rate","bot.revert.prop","newcomer.bot.revert.rate","newcomer.bot.revert.prop","admin.revert.rate","admin.revert.prop","n.ns4.edits","n.ns4.editors","d.ns4.length","ns4.editor.age","age","wiki.age")
for(varname in c(outcome1,predictors1,controls1)){
plot.distribution(newcomers,varname)
}
}

162
02_model_newcomer_survival.R Executable file

@@ -0,0 +1,162 @@
#!/usr/bin/env Rscript
# Fits newcomer retention models
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(scales)
if(!exists("newcomers")){
source("01_build_datasets.R")
}
use.sample <- FALSE
if(use.sample == TRUE){
source("lib-01-sample-datasets.R")
newcomer.ds <- sample.newcomers()
}else{
newcomer.ds <- newcomers
}
library("optimx")
library("lme4")
newcomer.ds <- newcomer.ds[,wiki:=as.factor(wiki.name)]
newcomer.ds <- newcomer.ds[,":="(
wiki.age.log = log1p(as.double(wiki.age,units='weeks')),
is.bot.reverted = ifelse(is.na(is.bot.reverted),FALSE,is.bot.reverted),
is.admin.reverted = ifelse(is.na(is.admin.reverted),FALSE,is.admin.reverted),
year = as.factor(year(time.first.edit)),
month = as.factor(paste0(year(time.first.edit),month(time.first.edit))),
ns0.edits.log = log1p(ns0.edits),
ns1.edits.log = log1p(ns1.edits),
ns4.edits.log = log1p(ns4.edits),
n.other.wikis.log = log1p(n.other.wikis),
n.edits.other.log = log1p(n.edits.other),
n.messages.log = log1p(n.messages),
n.editors.log = log1p(n.editors),
total.wiki.length.log = log1p(total.wiki.length),
n.ns4.edits.log = log1p(n.ns4.edits),
n.ns4.editors.log = log1p(n.ns4.editors),
ns4.editor.age.log = log1p(as.double(ns4.editor.age,units='years')),
d.ns4.length.scaled = scale(d.ns4.length),
newcomer.chars.changed.scaled = scale(newcomer.chars.change),
session.edits.log = log1p(session.edits),
wiki.age = as.double(wiki.age,units='years')
)]
## record summary stats for our analytic variables
newcomer.summary.stats <- list()
newcomer.summary.stats$p.survives <- mean(newcomer.ds$survives)
newcomer.summary.stats$var.survives <- var(newcomer.ds$survives)
outliers <- newcomer.ds[session.edits >= 100]
newcomer.summary.stats$N.outliers <- nrow(outliers)
newcomer.summary.stats$p.first.session.no.outliers <- mean(newcomer.ds[session.edits < 100]$session.edits)
newcomer.summary.stats$var.first.session.no.outliers <- var(newcomer.ds[session.edits < 100]$session.edits)
newcomer.summary.stats$p.reverted <- mean(newcomer.ds$is.reverted)
newcomer.summary.stats$var.reverted <- var(newcomer.ds$is.reverted)
newcomer.summary.stats$p.messaged <- mean(newcomer.ds$is.messaged)
newcomer.summary.stats$var.messaged <- var(newcomer.ds$is.messaged)
newcomer.summary.stats$mean.first.session.edits <- mean(newcomer.ds$session.edits)
newcomer.summary.stats$var.first.session.edits <- var(newcomer.ds$session.edits)
newcomer.summary.stats$med.first.session.edits <- median(newcomer.ds$session.edits)
newcomer.summary.stats$p.bot.reverted <- mean(newcomer.ds$is.bot.reverted)
newcomer.summary.stats$var.bot.reverted <- var(newcomer.ds$is.bot.reverted)
remember(newcomer.summary.stats)
halfak.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki.age + quarter + wiki.name")
newcomer.ds.all <- newcomer.ds
newcomer.ds <- newcomer.ds[n.other.wikis==0]
print('fitting halfak model on all newcomers')
halfak.mod.all.newcomers <- glm(halfak.formula,data=newcomer.ds.all,family=binomial(link=logit))
saveRDS(halfak.mod.all.newcomers,"halfak.mod.all.newcomers.RDS")
remember(extract(halfak.mod.all.newcomers),"halfak.model.all.newcomers",silent=TRUE)
print("fitting halfak model")
halfak.mod <- glm(halfak.formula,data=newcomer.ds,family=binomial(link=logit))
saveRDS(halfak.mod,"halfak.mod.RDS")
remember(extract(halfak.mod),"halfak.model",silent=TRUE)
print('fitting halfak model with weights')
n.total.wikis <- length(unique(newcomer.ds$wiki.name))
weight.per.wiki <- nrow(newcomer.ds)/n.total.wikis
newcomer.ds <- newcomer.ds[,weights:=weight.per.wiki/.N,by=wiki.name]
halfak.mod.weighted <- glm(halfak.formula,data=newcomer.ds,family=binomial(link=logit),weights=newcomer.ds$weights)
saveRDS(halfak.mod.weighted,"halfak.mod.weighted.RDS")
remember(extract(halfak.mod.weighted),"halfak.model.weighted",silent=TRUE)
## print('fit halfak model on a sample')
## sample.size <- 30
## newcomer.ds <- newcomer.ds[,in.sample:=.N >= sample.size, by=wiki.name]
## newcomer.ds.sample <- newcomer.ds[,.SD[sample(.N,min(sample.size,.N))],by=wiki.name]
## halfak.mod.sample <- glm(halfak.formula,data=newcomer.ds.sample,family=binomial(link=logit))
## saveRDS(halfak.mod.sample,"halfak.mod.sample.RDS")
## remember(extract(halfak.mod.sample),"halfak.model.sample",silent=TRUE)
print('fitting RE model')
library("optimx")
print('fitting re model')
re.icc.survives.model <- glmer(as.formula("survives ~ + (1 | wiki) - 1"),data=newcomer.ds,family=binomial(link=logit))
saveRDS(re.icc.survives.model,"re.icc.survives.model.RDS")
varcorrmat <- as.data.table(VarCorr(re.icc.survives.model))
wiki.var <- varcorrmat[grp=='wiki' & var1=="(Intercept)" ,vcov]
group.var <- var(residuals(re.icc.survives.model))
icc <- wiki.var/(group.var + wiki.var)
remember(varcorrmat,'icc.survives.varcormat')
remember(group.var,'icc.survives.group.var')
remember(icc,'icc.survives')
## newcomer.no.pooling.f <- as.formula("survives ~ is.reverted:wiki.name + is.messaged:wiki.name + is.bot.reverted:wiki.name + session.edits.log:wiki.name + wiki.name + quarter:wiki.name + wiki.name:wiki.age - 1")
## newcomer.no.pooling.mod <- glm(newcomer.no.pooling.f,gdata=newcomer.ds,family=binomial(link=logit))
## remember(extract(newcomer.no.pooling.mod),"newcomer.no.pooling.mod",silent=TRUE)
## if( !(exists("halfak.robustnes1.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
## halfak.robustness1.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits.log + wiki + quarter + wiki:wiki.age")
## print("fitting halfak robustness 1 model")
## newcomer.robustness.ds <- newcomer.ds[p.reverted <= 0.05]
## halfak.robustness1.mod <- glm(halfak.robustness1.formula,data=newcomer.robustness.ds,family=binomial(link=logit))
## saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
## remember(extract(halfak.robustness1.mod),"halfak.robustness1.model")
## }
## else if(file.exists("halfak.robustness1.mod.RDS") & !exists("halfak.robustness1.mod")){
## newcomer.no.pooling.mod <- readRDS("halfak.robustness1.mod.RDS")
## }
## else if (exists("halfak.robustness1.mod")){
## saveRDS(halfak.robustness1.mod,"halfak.robustness1.mod.RDS")
## }
## remember(extract(halfak.robustness1.mod),"halfak.robustness1.mod")
## }
## if( !(exists("halfak.robustnes2.mod") | file.exists("halfak.robustness1.mod.RDS")) | refit.models == TRUE){
## halfak.robustness2.formula <- as.formula("survives ~ is.reverted + is.messaged + is.bot.reverted + session.edits .log + wiki + quarter + wiki:wiki.age")
## print("fitting halfak robustness 2 model")
## newcomer.robustness.ds2 <- newcomer.ds[p.reverted <= 0.5]
## halfak.robustness2.mod <- glm(halfak.robustness2.formula,data=newcomer.robustness.ds2,family=binomial(link=logit))
## saveRDS(halfak.robustness1.mod,"halfak.robustness2.mod.RDS")
## remember(extract(halfak.robustness1.mod),"halfak.robustness2.model")
## }
## else if(file.exists("halfak.robustness2.mod.RDS") & !exists("halfak.robustness2.mod")){
## halfak.robustness2.mod <- readRDS("halfak.robustness2.mod.RDS")
## }
## else if (exists("halfak.robustness2.mod")){
## saveRDS(halfak.robustness2.mod,"halfak.robustness2.mod.RDS")
## }
## remember(extract(halfak.robustness2.mod),"halfak.robustness2.mod")
## }

97
03_generate_plots.R Executable file

@@ -0,0 +1,97 @@
#!/usr/bin/env Rscript
# Creates data for plotting
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library("ggplot2")
library("bootstrap")
library("scales")
source("lib-00-utils.R")
if(!exists("newcomers")){
source("01_build_datasets.R")
}
remember(min(all.edits$date.time),"earliest.data.point")
remember(max(all.edits$date.time),"latest.data.point")
p1.data <- newcomers[,.(p.reverted = mean(is.reverted),
var.reverted=var(is.reverted),
p.survives=mean(survives),
var.survives=(var(survives)),
N=.N),
by=.(wiki.name,wiki.age.half.years)]
p1.data <- p1.data[N>1]
p1.data[,N.wikis := .N, by = .(wiki.age.half.years)]
## put p1 data onto sd scales
p1.data[,p.survives.in.sd := p.survives/sd(p.survives),by=.(wiki.name)]
p1.data[,p.reverted.in.sd := p.reverted/sd(p.reverted),by=.(wiki.name)]
p.data <- melt(p1.data,id.vars=c("wiki.name","wiki.age.half.years"),measure.vars=c("p.survives","p.reverted","p.survives.in.sd","p.reverted.in.sd"))
p.stats <- p.data[,as.list(c(setNames(boxplot.stats(value,coef=1.5)$stats,c("min","q1","med","q3","max")),
mu=mean(value),N.wikis=.N)),by=.(wiki.age.half.years,variable)]
remember(p.stats)
p.stats[variable=="p.survives"]$variable="Survives"
p.stats[variable=="p.reverted"]$variable="Reverted"
remember(cor.test(p1.data$wiki.age.half.years,p1.data$p.survives,method='spearman',alternative='less'),"survives.cor.test")
remember(cor.test(p1.data$wiki.age.half.years,p1.data$p.reverted,method='spearman',alternative='greater'),"reverted.cor.test")
xlabels = paste0("Year ", 0:max(p.stats$wiki.age.half.years))
p <- ggplot(p.stats,aes(x=as.factor(wiki.age.half.years),ymin=min,lower=q1,middle=med,upper=q3,ymax=max,width=0.3))
p <- p + geom_boxplot(stat='identity')
p <- p + geom_line(aes(x=wiki.age.half.years+1,y=med), linetype=2)
p <- p + facet_wrap("variable",nrow=2,strip.position="bottom",scales="free")
p <- p + scale_y_continuous(name="Proportion of newcomers",minor_breaks=NULL) + scale_x_discrete(name="Wiki age", labels=xlabels)
p <- p + theme_bw() + theme(legend.position="None")
pdf(width=6,height=6)
print(p)
dev.off()
active.editors <- all.edits[,
.(N.edits=.N,
wiki.age.years=first(wiki.age.years)),
by=.(wiki.name,
editor,
wiki.age.months)]
n.active.editors <- active.editors[N.edits >= 5,
.(N.active.editors = .N,
wiki.age.years=first(wiki.age.years)),
by=.(wiki.name,wiki.age.months)]
n.active.editors[, ":="(N=.N), by=.(wiki.age.months)]
n.active.editors[,":="(max.age=max(wiki.age.months),max.active.editors=max(N.active.editors),sd.units.active.editors=N.active.editors/sd(N.active.editors)),by="wiki.name"]
n.active.editors[,":="(active.editors.pmax=N.active.editors/max.active.editors)]
wiki.age.quantile <- .90
max.age.months <- quantile(n.active.editors$max.age,wiki.age.quantile)
boot <- n.active.editors[is.finite(sd.units.active.editors)&wiki.age.months <= max.age.months,.(thetastar = bootstrap(x=sd.units.active.editors,nboot=5000,mean)$thetastar),by=.(wiki.age.months)]
boot.ci <- boot[,as.list(quantile(thetastar,probs=c(0.025,0.975))),by=.(wiki.age.months)]
names(boot.ci) <- c("wiki.age.months","lower.ci","upper.ci")
plot2.data <- n.active.editors[is.finite(sd.units.active.editors) & wiki.age.months <= max.age.months,.(sd.units.active.editors = mean(sd.units.active.editors),N.active.editors = mean(N.active.editors),wiki.age.years=first(wiki.age.years),N.wikis=.N),by=.(wiki.age.months)]
plot2.data[boot.ci,":="(lower.ci=lower.ci,upper.ci=upper.ci),on="wiki.age.months"]
remember(plot2.data,'plot.active.editors.dt')

149
04_model_namespace4.R Executable file

@@ -0,0 +1,149 @@
#!/usr/bin/env Rscript
# Fits models predicting reversions of namespace 4 edits
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(effects)
library(texreg)
library(lme4)
if(!exists("newcomers")){
source("01_build_datasets.R")
}
nosave <- FALSE
sample <- FALSE
if(sample == TRUE){
source("lib-01-sample-datasets.R")
ns4.ds <- sample.ns4.edits()
weights <- ns4.ds$weight
}else{
ns4.ds <- ns4.reg.edits
}
ns4.ds <- ns4.ds[,":="(wiki.age.log = log1p(as.double(wiki.age,units="years")),
age.log = log1p(as.double(age,units="years")),
wiki.age = as.double(wiki.age,units='years'),
quarter = as.factor(paste0(year(date.time),"_",ceiling(month(date.time)/4))),
age = as.double(age,units='years'))]
ns4.ds <- ns4.ds[,":="(time.first.wikia.edit = min(time.first.edit)),by=.(editor)]
ns4.ds.all.newcomers <- ns4.ds
ns4.ds <- ns4.ds[time.first.wikia.edit == time.first.edit]
ns4.summary.stats <- list()
ns4.summary.stats$p.reverted <- mean(ns4.ds$reverted)
ns4.summary.stats$var.reverted <- var(ns4.ds$reverted)
ns4.summary.stats$mean.editor.age <- mean(ns4.ds$age)
ns4.summary.stats$var.editor.age <- var(ns4.ds$age)
ns4.summary.stats$median.editor.age <- median(ns4.ds$age)
ns4.summary.stats$mean.wiki.age <- mean(ns4.ds$wiki.age)
ns4.summary.stats$var.wiki.age <- var(ns4.ds$wiki.age)
ns4.summary.stats$median.wiki.age <- median(ns4.ds$wiki.age)
remember(ns4.summary.stats)
print('fit morgan model')
f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")
morgan.model <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'))
saveRDS(morgan.model,"morgan.model.RDS")
remember(extract(morgan.model),"morgan.model",silent=TRUE)
print('fit morgan model weights')
f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")
n.total.wikis <- length(unique(ns4.ds$wiki.name))
weight.per.wiki <- nrow(ns4.ds)/n.total.wikis
ns4.ds <- ns4.ds[,weights := weight.per.wiki/.N, by=.(wiki.name)]
morgan.model.weighted <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=ns4.ds$weights)
saveRDS(morgan.model.weighted,"morgan.model.weighted.RDS")
remember(extract(morgan.model.weighted),"morgan.model.weighted",silent=TRUE)
print('fit morgan model weights')
f.morgan <- as.formula("reverted ~ age + wiki.age + quarter + wiki.name")
ns4.ds <- ns4.ds[,N:=.N,by=wiki.name]
ns4.ds.temp <- ns4.ds
min.edits <- 10
remember(print(1 - length(unique(ns4.ds[N>=min.edits]$wiki.name))/length(unique(ns4.ds$wiki.name))),"p.wikis.removed.weighted2")
# remove the bottom 24.1% of wikis
ns4.ds <- ns4.ds[N>=min.edits]
n.total.wikis <- length(unique(ns4.ds$wiki.name))
weight.per.wiki <- nrow(ns4.ds)/n.total.wikis
ns4.ds <- ns4.ds[,weights := weight.per.wiki/.N, by=.(wiki.name)]
morgan.model.weighted2 <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=ns4.ds$weights)
saveRDS(morgan.model.weighted2,"morgan.model.weighted2.RDS")
remember(extract(morgan.model.weighted2),"morgan.model.weighted2",silent=TRUE)
ns4.ds <- ns4.ds.temp
print('fit morgan model all newcomers')
morgan.model.all.newcomers <- glm(f.morgan,data=ns4.ds.all.newcomers,family=binomial(link='logit'))
saveRDS(morgan.model.all.newcomers,"morgan.model.all.newcomers.RDS")
remember(extract(morgan.model.all.newcomers),"morgan.model.all.newcomers",silent=TRUE)
print('fitting RE model')
re.icc.reverted.model <- glmer(as.formula("reverted ~ + (1 | wiki.name) -1 "),data=ns4.ds,family=binomial(link=logit))
saveRDS(re.icc.reverted.model,"re.icc.reverted.model.RDS")
varcorrmat <- as.data.table(VarCorr(re.icc.reverted.model))
wiki.var <- varcorrmat[grp=='wiki.name' & var1=="(Intercept)" ,vcov]
group.var <- var(residuals(re.icc.reverted.model))
icc <- wiki.var/(group.var + wiki.var)
remember(varcorrmat,'icc.reverted.varcorrmat')
remember(group.var,'icc.reverted.group.var')
remember(icc,'icc.reverted')
## print("fit morgan model sample")
## sample.size <- 30
## ns4.ds <- ns4.ds[,in.sample:=(.N >= sample.size),by=wiki.name]
## # DT[,.SD[sample(.N, min(3,.N))],by = a]
## ns4.ds.equal.sample <- ns4.ds[,.SD[sample(.N,min(sample.size,.N))], by=wiki.name]
## morgan.model.sampled <- glm(f.morgan,data=ns4.ds.equal.sample,family=binomial(link='logit'))
## saveRDS(morgan.model.sampled,"morgan.model.sampled.RDS")
## remember(extract(morgan.model.sampled),"morgan.model.sampled",silent=TRUE)
## ns4.model2.formula <- as.formula("reverted ~ age.log + wiki.age + quarter")
## ns4.model2 <- glm(ns4.model2,data=ns4.ds,family=binomial(link='logit'),weights=weights)
## remember(extract(ns4.model2),"ns4.model2")
## print('fit morgan no pooling model')
## f.morgan <- as.formula("reverted ~ age.log + wiki.age + quarter + wiki.name:age.log + wiki.name:wiki.age")
## morgan.model <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=weights)
## remember(extract(morgan.model),"morgan.model")
## re.ns4.model <- glmer(as.formula("reverted ~ age.log + wiki.age + quarter | wiki.name"),data=ns4.ds,family=binomial(link='logit'),weights=weights)
## remember(extract(re.ns4.model),'re.ns4.model')
## print('fit morgan.robustness.1 model')
## f.morgan.robustness.1 <- as.formula("reverted ~ age.log + wiki.age + quarter + wiki.name")
## ns4.reg.edits.robustness <- build.namespace4.dataset(all.edits[p.reverted < 0.5])
## ns4.reg.edits.robustness[,":="(wiki.age.log = log1p(as.double(wiki.age,units="weeks")),
## age.log = log1p(as.double(age,units="weeks")),
## wiki.age = as.double(wiki.age,units='weeks'),
## quarter = as.factor(paste0(year(date.time),"_",ceiling(month(date.time)/4))))]
## morgan.robustness.1.model <- glm(f.morgan.robustness.1,data=pns4.reg.edits.robustness,family=binomial(link='logit'),weights=weights)
## saveRDS(morgan.robustness.1.model,"morgan.robustness.1.model.RDS")
## remember(extract(morgan.robustness.1.model),"morgan.robustness.1.model")
## ns4.ds[,":="(wiki.age.log = log1p(as.numeric(wiki.age,units="weeks")), age.log = log1p(as.numeric(age,units="weeks")))]
## f.ns4.2 <- as.formula("reverted ~ age.log + wiki.age.log + age.log|wiki.age.log + wiki.name")
## ns4.2.model <- glm(f.morgan,data=ns4.ds,family=binomial(link='logit'),weights=weights)
## remember(extract(ns4.2.model))
## summary statistics for namespace 4 edits

53
05_power_simulation.R Executable file

@@ -0,0 +1,53 @@
#!/usr/bin/env Rscript
# Perform power analysis to assess whether we have enough data to study bots
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
source("lib-00-utils.R")
library(effects)
library(texreg)
if(!exists("r")){
source("lib-00-utils.R")
source("01_build_datasets.R")
}
p.outcome <- r$newcomer.summary.stats$p.survives
p.dv <- r$newcomer.summary.stats$p.bot.reverted
n <- r$halfak.model@gof[5]
sample.ds <- function(n,p.outcome,p.dv,eff = -0.01){
dv <- rbinom(n=n,size=1,prob=p.dv)
iv <- rbinom(n,size=1,prob=p.outcome)
m1 <- glm(iv ~ 1, family=binomial(link='logit'))
eta <- eff*dv + coef(m1)[1]
p <- exp(eta)/(1+exp(eta))
tmp <- runif(n)
y <- (tmp < p)
fit <- glm(y ~ dv,family=binomial(link='logit'))
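# the last expression is the function's return value: the p-value on the dv coefficient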
summary(fit)$coefficients[2,4]
}
eff <- -0.68
remember(exp(-eff),"power.analysis.effect")
pwr.test.sig.level <- 0.05
remember(pwr.test.sig.level)
n.power.sim <- 1000
remember(n.power.sim)
out <- replicate(n.power.sim,sample.ds(n,p.outcome,p.dv,eff=eff))
remember(mean(out<pwr.test.sig.level),"pwr.test")

2
RCommunityData/.Rbuildignore Normal file

@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$

3
RCommunityData/.gitignore vendored Normal file

@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData

9
RCommunityData/DESCRIPTION Normal file

@@ -0,0 +1,9 @@
Package: RCommunityData
Title: library of functions used in communitydata packages
Version: 0.1
Authors@R: person("Benjamin Mako", "Hill", email = "mako@atdot.cc", role = c("aut", "cre"))
Description: library of functions used in communitydata packages
Depends: R (>= 3.0)
License: GPLv3+
Encoding: UTF-8
LazyData: true

2
RCommunityData/NAMESPACE Normal file

@@ -0,0 +1,2 @@
# Generated by roxygen2: fake comment so roxygen2 overwrites silently.
exportPattern("^[^\\.]")

17
RCommunityData/R/hhi.R Normal file

@@ -0,0 +1,17 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu
## functions to compute normalized and non-normalized Herfindahl indexes
hhi <- function (x) {
x <- x / sum(x)
sum(x**2)
}
hhi.norm <- function (x) {
n <- length(x)
h <- hhi(x)
(h - 1/n)/(1-1/n)
}
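## Worked example (illustrative, not part of the original file):
## hhi(c(50, 25, 25))      # shares become c(0.5, 0.25, 0.25), so hhi = 0.375
## hhi.norm(c(50, 25, 25)) # (0.375 - 1/3) / (1 - 1/3) = 0.0625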


@@ -0,0 +1,24 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu
# load a file if a variable is missing
load.if.missing <- function (var.name, file.name) {
if (!exists(var.name)) {
load(file.name, parent.frame())
# check to see if we're dealing with a data.table because, if we
# are, we need to do some nasty back and forth
if (class(eval(as.name(var.name)))[1] == "data.table") {
# gnarly function that loads resorts things within the parent
# frame to get around the bug in data.table
assign(var.name,
data.table(as.data.frame(eval(as.name(var.name))),
key=attr(eval(as.name(var.name)), "sorted")),
parent.frame())
}
}
}


@@ -0,0 +1,59 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu
## functions to deal with namespace information
#####################################################################
load.wikia.namespaces <- function () {
# load namespace data
wikia.namespaces <- read.delim("~/data/wikia_namespaces.tsv",
stringsAsFactors=TRUE, header=FALSE)
colnames(wikia.namespaces) <- c("wiki", "ns.num", "ns.string")
wikia.namespaces$ns.num <- as.factor(wikia.namespaces$ns.num)
return(wikia.namespaces)
}
# enwiki - move to barnstars directory
# TODO: TEST
load.enwiki.namespaces <- function(){
enwiki.ns.num <- c(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 100, 101, 108, 109)
names(enwiki.ns.num) <- c( "Media", "Special", "", "Talk", "User", "User talk",
"Wikipedia", "Wikipedia talk","File", "File talk",
"MediaWiki", "MediaWiki talk", "Template", "Template talk",
"Help", "Help talk", "Category", "Category talk",
"Portal", "Portal talk", "Book","Book talk")
}
# function to take a list of article titles and a wiki name and return
# a list of numbered namespaces
titles.to.ns.num <- function (page.titles, wiki) {
# load wikia namespace data from disk if it does not exist
if (!exists("wikia.namespaces")) {
wikia.namespaces <- load.wikia.namespaces()
}
# page.titles <- d$title # DEBUG
ns.df <- wikia.namespaces[wikia.namespaces$wiki == wiki,
c("ns.num", "ns.string")]
namespaces <- as.character(ns.df$ns.num)
names(namespaces) <- ns.df$ns.string
# drop the zero, we'll deal with it later
namespaces <- namespaces [!namespaces == 0]
# change underscores to spaces (necessary?)
page.titles <- gsub('_', ' ', page.titles)
page.ns <- rep("0", length(page.titles))
for (ns in names(namespaces)) {
page.ns[grepl(paste('^', ns, ':', sep=""), page.titles)] <- namespaces[ns]
}
# return the list of namespaces as a factor
return(as.factor(page.ns))
}
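## Illustrative call (the wiki name is hypothetical): if the namespace table
## for "somewiki" maps "Talk" to 1, then
## titles.to.ns.num(c("Talk:Foo", "Foo"), "somewiki") returns the factor c("1", "0")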


@@ -0,0 +1,184 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu
# privileges of interest:
# a shared variable that gets used everywhere
generate.admin.addrm <- function (logevents, current.admins) {
# convert types of a few variables
logevents$ancient <- logevents$ancient == "true"
logevents$timestamp <- timestamp.to.POSIXct(logevents$timestamp)
logevents$rights.new[is.na(logevents$rights.new)] <- ""
logevents$rights.old[is.na(logevents$rights.old)] <- ""
# TODO do wikia wikis have these =?
# in WP, all of these are negated by one day
logevents <- logevents[!(logevents$ancient & logevents$comment == "="),]
##########################################
### Parsing logevents file
#########################################
# separate out moderns & ancients and the necessary columns
ancients <- logevents[logevents$ancient,c("title","comment","timestamp")]
moderns <- logevents[!logevents$ancient,
c("title","rights.new","rights.old","timestamp")]
# function that looks at rights.old, rights.new and returns a value of
# privilege, add/remove, and timestamp for each user
parse.moderns <- function (i, d) {
user <- sub('^User:', "", d[i,"title"])
change.time <- d[i,"timestamp"]
rights.new <- d[i,"rights.new"]
rights.old <- d[i,"rights.old"]
# create a vector of new and old rights:
destring <- function (x) { strsplit(as.character(x), ", ")[[1]] }
# create a list of privileges that are mentioned
privileges <- unique(c(destring(rights.new),
destring(rights.old)))
# create vectors of the privileges that were added and removed
added <- privileges[privileges %in% destring(rights.new) &
!(privileges %in% destring(rights.old))]
removed <- privileges[!(privileges %in% destring(rights.new)) &
privileges %in% destring(rights.old)]
# assemble the data frame of: role,action,user,timestamp
data.frame(user=rep(user, length(c(added,removed))),
role=c(added, removed),
action=c(rep("added",length(added)),
rep("removed", length(removed))),
timestamp=rep(change.time, length(c(added,removed))),
era=rep("modern", length(c(added,removed))),
stringsAsFactors=FALSE)
}
# if there are log events, and there are non-ancients (not all are ancients), we parse them
if (dim(logevents)[1] & !all(logevents$ancient)) {
moderns.parsed <- do.call("rbind",
lapply(1:dim(moderns)[1], parse.moderns, moderns))
} else {
moderns.parsed = NULL
}
# another function to handle processing the ancients:
parse.ancient <- function (i, d) {
user <- sub('^.*?:', '', d[i,"title"])
comment <- d[i, "comment"]
change.time <- d[i, "timestamp"]
added <- unlist(strsplit(unlist(strsplit(comment, '(\\+|\\=)')), ', '))
# clean any leadin, trailing whitespace
added <- gsub("^\\s+|\\s+$", "", added)
data.frame(user=user,
role=added,
action="added",
timestamp=change.time,
era="ancient",
stringsAsFactors=FALSE)
}
# if there are any ancients, we parse them
if (any(logevents$ancient)) {
ancients.parsed <- do.call("rbind",
lapply(1:dim(ancients)[1], parse.ancient, ancients))
} else {
ancients.parsed = NULL
}
combined <- rbind(moderns.parsed, ancients.parsed)
##########################################
### Parsing current.admins file
#########################################
# turn each of the columns after the first two into logical
# function to process pre.ancients
parse.current.admins <- function (i, d) {
user <- d[i, "username"]
roles <- gsub("^\\s+|\\s+$", "", strsplit(d[i, "groups"], ",")[[1]])
o <- data.frame(user=user, role=roles, stringsAsFactors=FALSE)
colnames(o) <- c("user", "role")
return(o)
}
## handle the case where there are no admins. This can happen on Wikipedia
if(dim(current.admins)[1] != 0){
current.admins.parsed <- do.call("rbind",
lapply(1:dim(current.admins)[1],
parse.current.admins, current.admins))
}
else{
current.admins.parsed <- NULL
}
# select pre-ancients as people who have a given right *today* but
# were never seen as having it added
is.pre.ancients <- function (i, d, combined) {
user <- d[i, "user"]
role <- d[i, "role"]
# look to see if we've see any events with this user and role added:
# if we see none, this is pre-ancient
!any(combined$user == user &
combined$role == role &
combined$action == "added")
}
if(!is.null(current.admins.parsed)){
# create the list of pre-ancients (people role combinations we have
# not seen in the logevents data)
pre.ancients <- current.admins.parsed[sapply(1:dim(current.admins.parsed)[1],
is.pre.ancients,
current.admins.parsed,
combined),]
}
else{
pre.ancients <- NULL
}
# make a list of people who have been removed
combined.removed <- combined[combined$action == "removed",]
if (!is.null(combined.removed)) {
if (dim(combined.removed)[1] > 0) {
combined.removed <- combined.removed[sapply(1:dim(combined.removed)[1],
function (i,d) {
user <- d[i,"user"]
role <- d[i,"role"]
timestamp <- d[i,"timestamp"]
# was the person added before they were removed? OR in the pre-ancients
any(combined$user == user &
combined$role == role &
combined$action == "added" &
combined$timestamp <= timestamp) | (user %in% pre.ancients$user)
}, combined.removed),c("user", "role")]
}
}
pre.ancients <- rbind(pre.ancients, combined.removed)
# give them the earliest ancient timestamp minus 1 day
# and then add the pre.ancients to the combined data frame
if(!is.null(pre.ancients)){
pre.ancients$action <- "added"
pre.ancients$timestamp <- as.POSIXct("2000-01-01 00:00:00") # min(combined$timestamp) - 60 * 1440
pre.ancients$era <- "pre.ancient"
combined <- rbind(combined, pre.ancients)
}
# remove redundant actions
combined <- combined[!duplicated(combined),]
return(combined)
}

86
RCommunityData/R/wikiq.R Normal file

@@ -0,0 +1,86 @@
# Community Data Science Collective R Utilities
#
# Copyright (c) 2010-2016 Benjamin Mako Hill and Aaron Shaw
# mako@atdot.cc, aaronshaw@northwestern.edu
# loads simple utility functions for use in the subsequent files
# store this for re-use across various scripts
wikiq.header <- c("title", "articleid", "revid", "timestamp", "anon",
"editor", "editor_id", "minor", "text_size",
"text_entropy", "text_md5", "reversion",
"additions_size", "deletions_size", "edits",
"articles", "users")
# helper function to load the TSV files our perl scripts are generating
load.extracted.df <- function (filename) {
read.delim(filename, header=T, quote="", na.strings="", stringsAsFactors=TRUE)
}
# helper function to grab the classes of all columns of a dataframe
# keep this because it's being used but this can just be lapply(d, class)
get.col.classes <- function (d) {
sapply(colnames(d), function (col) { class(d[,col]) })
}
# convert mediawiki timestamps into POSIXct
timestamp.to.POSIXct <- function (ts.string) {
ts.string <- gsub("T", " ", ts.string)
ts.string <- gsub("Z", "", ts.string)
return(as.POSIXct(ts.string, format="%Y-%m-%d %H:%M:%S", tz="UTC"))
}
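## e.g. timestamp.to.POSIXct("2010-04-10T12:00:00Z") returns
## as.POSIXct("2010-04-10 12:00:00", tz="UTC")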
read.wikiq <- function (con, header=TRUE, detect.reverts=FALSE) {
d <- read.delim(con, stringsAsFactors=FALSE, header=header,
encoding="UTF-8", quote="")
# rename date.time to timestamp and remove _
colnames(d)[colnames(d) == "date.time"] <- "timestamp"
colnames(d) <- sub("_", ".", colnames(d))
d$timestamp <- as.POSIXct(sub("^(.*)y(.*)\xc8zy$", "\\1\\2",
d$timestamp), tz="UTC")
# convert reversion to a logical
d$reversion <- !is.na(d$reversion)
if (detect.reverts) {
# reorder by title and timestamp so we can compare adjacent revisions
d <- d[order(d$title, d$timestamp),]
# generate a list of reverted editors and a list of previous and next md5
d$reverted <- c(d$reversion[2:length(d$reversion)],NA)
d$md5.next <- c(d$text.md5[2:length(d$reversion)],NA)
d$md5.prev <- c(NA,d$text.md5[1:(length(d$reversion)-1)])
d$reverted <- d$reverted & (d$md5.next == d$md5.prev)
# drop the extra columns and the last edit
d <- d[!is.na(d$reverted),]
d <- d[,!colnames(d) %in% c("md5.next", "md5.prev")]
# create a reverted by variable by shifting up the editors and
# then NAing nonreverts
d$reverted.by <- c(d$editor[2:length(d$reversion)], NA)
d$reverted.by[!d$reverted] <- NA
}
# flag IP editors and use the IP address (stored in editor.id) as the username
d$ipaddress <- d$editor == ""
d$editor[d$editor == ""] <- d$editor.id[d$editor == ""]
# delete the connection
return(d)
}
# TODO refactor this so that we clean the data BEFORE we read it into R
# ATM, this is set to only work on rows with 14 fields
# see the vereins wiki for "Philcomputing" and 29 lines that seem to
# have a newline in the editor name
read.bz.wikiq <- function (filename, header=TRUE, detect.reverts=FALSE) {
con <- pipe(paste("bzcat", filename, "|awk -F'\t' '{if (NF == 14) print;}'"))
d <- read.wikiq(con, header=header, detect.reverts=detect.reverts)
rm(con)
return(d)
}

16
RCommunityData/RCommunityData.Rproj Normal file

@@ -0,0 +1,16 @@
Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
Encoding: UTF-8
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace

147
README.md Normal file

@@ -0,0 +1,147 @@
Copyright (C) 2018 Nathan TeBlunthuis.
Permission is granted to copy, distribute and/or modify this document
under the terms of the GNU Free Documentation License, Version 1.3
or any later version published by the Free Software Foundation;
with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
A copy of the license is included in the file entitled "fdl-1.3.md".
# Replication data for "Revisiting 'The Rise and Decline' in a Population of Peer Production Projects" #
## Overview ##
This archive contains code and data for reproducing the analysis for
"Replication Data for Revisiting 'The Rise and Decline' in a
Population of Peer Production Projects". Depending on what you hope to
do with the data you probabbly do not want to download all of the
files. Depending on your computation resources you may not be able to
run all stages of the analysis.
The code for all stages of the analysis, including typesetting the
manuscript, is in code.tar.
If you only want to run the final analysis or to play with datasets
used in the analysis of the paper, you want intermediate_data.7z or
the uncompressed tab and csv files.
The data files are created in a four stage process. The first stage
uses the program "wikiq" to create tsv files of edit data for each
wiki from the mediawiki xml dumps. The second stage generates the
all.edits.RDS file, which contains edit metadata for all of the
wikis. This file is expensive to generate and at 1.5GB is pretty
big. The third stage builds smaller intermediate files that contain
the analytical variables from all.edits.RDS. The fourth stage uses
the intermediate files to generate smaller RDS files that contain the
results. Finally, knitr and latex typeset the manuscript.
A stage only runs if its outputs do not already exist. So if the
intermediate files exist, they will not be regenerated and only the
final analysis will run. The exception is that stage 4, fitting
models and generating plots, always runs.
If you only want to replicate from the second stage onward, you want
wikiq_tsvs.7z. If you want to replicate everything, you want
wikia_mediawiki_xml_dumps.7z.001 and wikia_mediawiki_xml_dumps.7z.002.
These instructions work backwards: from building the manuscript using
knitr, to loading the datasets and running the analysis, to building
the intermediate datasets.
## Building the manuscript using knitr ##
This requires working latex, latexmk, and knitr
installations. Depending on your operating system you might install
these packages in different ways. On Debian Linux you can run `apt
install r-cran-knitr latexmk texlive-latex-extra`. Alternatively, you
can upload the necessary files to a project on Sharelatex.com or
Overleaf.com.
1. Download `code.tar`. This has everything you need to typeset the manuscript.
2. Unpack the tar archive. On a unix system this can be done by running `tar xf code.tar`.
3. Navigate to code/paper_source.
4. Install R dependencies. In R, run `install.packages(c("data.table","scales","ggplot2","lubridate","texreg"))`
5. On a unix system you should be able to run `make` to build the
manuscript `generalizable_wiki.pdf`. Otherwise you should try
uploading all of the files (including the tables, figure, and knitr
folders) to a new project on ShareLatex.com.
## Loading intermediate datasets ##
The intermediate datasets are found in the `intermediate_data.7z`
archive. They can be extracted on a unix system using the command `7z
x intermediate_data.7z`. The files are 95MB uncompressed. These are
RDS (R data set) files and can be loaded in R using `readRDS`. For
example `newcomer.ds <- readRDS("newcomers.RDS")`. If you wish to
work with these datasets using a tool other than R, you might prefer
to work with the .tab files.
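Assuming the .tab files are plain tab-separated text with a header
row, they can be read in R with `data.table::fread`, e.g. `newcomers
<- data.table::fread("newcomers.tab")` (the filename here is
illustrative).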
## Running the analysis ##
Fitting the models may not work on machines with less than 32GB of
RAM. If you have trouble, you may find the functions in
lib-01-sample-datasets.R useful to create stratified samples of data
for fitting models. See line 89 of 02_model_newcomer_survival.R for an
example.
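For example, running `source("lib-01-sample-datasets.R")` followed by
`newcomer.ds <- sample.newcomers()` builds a sampled newcomer
dataset; this mirrors the `use.sample` branch near the top of
`02_model_newcomer_survival.R`.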
1. Download `code.tar` and `intermediate_data.7z` to your working
folder and extract both archives. On a unix system this can be done
with the command `tar xf code.tar && 7z x intermediate_data.7z`.
2. Install R
dependencies. `install.packages(c("data.table","ggplot2","urltools","texreg","optimx","lme4","bootstrap","scales","effects","lubridate","devtools","roxygen2"))`.
3. On a unix system you can simply run `regen.all.sh` to fit the
models, build the plots and create the RDS files.
## Generating datasets ##
### Building the intermediate files ###
The intermediate files are generated from all.edits.RDS. This process requires about 20GB of memory.
1. Download `all.edits.RDS`, `userroles_data.7z`,`selected.wikis.csv`,
and `code.tar`. Unpack `code.tar` and `userroles_data.7z`. On a
unix system this can be done using `tar xf code.tar && 7z x
userroles_data.7z`.
2. Install R dependencies. In R run
`install.packages(c("data.table","ggplot2","urltools","texreg","optimx","lme4","bootstrap","scales","effects","lubridate","devtools","roxygen2"))`.
3. Run `01_build_datasets.R`.
### Building all.edits.RDS ###
The intermediate RDS files used in the analysis are created from
`all.edits.RDS`. To rebuild `all.edits.RDS`, run
01_build_datasets.R with the intermediate RDS files and
`all.edits.RDS` absent from the working
directory. `all.edits.RDS` is built from the tsv files generated
by wikiq. This may take several hours. By default building the dataset
will use all available CPU cores. If you want to change this, modify
line 26 of `lib-01-build_newcomer_table.R`.
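(For reference, `lib-00-utils.R` sets `options(mc.cores = 16)`;
lowering that value, e.g. `options(mc.cores = 4)`, is one way to
reduce parallelism.)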
1. Download selected.wikis.csv, userroles_data.7z, wikiq_tsvs.7z, and
code.tar. Unpack the files. On a unix system this can be done by
running `7z x userroles_data.7z && 7z x wikiq_tsvs.7z && tar xf
code.tar`.
2. Run `01_build_datasets.R` to generate all.edits.RDS and the intermediate files.
### Running Wikiq to generate tsv files ###
If you want to regenerate the datasets all the way from the xml dumps
and data from the Wikia api you will have to run the python script
`wikiq`. This is a fairly computationally intensive process. It may
take over a day unless you can run the computations in parallel.
1. Download `code.tar`, `wikia_mediawiki_xml_dumps.7z.001`,
`wikia_mediawiki_xml_dumps.7z.002`, and
`userroles_data.7z`. Extract the archives. On a Unix system this
can be done by running `tar xf code.tar && 7z x
wikia_mediawiki_xml_dumps.7z.001 && 7z x userroles_data.7z`.
2. Have python3 and python3-pip installed. Install `argparse` with pip3: `pip3 install argparse`.
3. Edit `runwikiq.sh` to set N_THREADS.
4. Run `runwikiq.sh` to generate the tsv files.
### Obtaining Bot and Admin data from the Wikia API ###
For the purposes of supporting an audit of our research project, this
repository includes the code that we used to obtain Bot and Admin data
from the Wikia API. Unfortunately, since we ran the script, the API
has changed and this code does not work.
Our research group maintains a tool for scraping the Wikia API
available at https://code.communitydata.cc/wikia_userroles_scraper. This can
be used to download user roles for the wikis in this dataset. Follow
the instructions found in that package.

172
lib-00-utils.R Normal file

@@ -0,0 +1,172 @@
# Library containing helper functions
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(parallel)
library(urltools)
library(data.table)
library(texreg)
## load wikiq data for all wikis in the wiki list
## this wikiq data doesn't have persistent word revisions
## It doesn't collapse user edits either. We identify user sessions as well.
load.wikiq.file <- function(path){
d <- fread(paste0(path),
colClasses=list(character=c("reverteds", "date_time", "editor", "title")),
na.string="", stringsAsFactors=TRUE, quote="",drop=c("sha1","minor"))
gc()
setnames(d, gsub('_', '.', colnames(d)))
setkey(d, "revid")
d$date.time <- as.POSIXct(as.character(d$date.time),
format="%Y-%m-%d %H:%M:%S",
tz="UTC")
d[, ':='(editor = as.factor(url_decode(as.character(editor))), title = as.factor(url_decode(as.character(title))))]
d[d$editor == "127.0.0.1","anon"] <- FALSE
# drop edits made before mediawiki was written
d <- d[d$date.time > as.POSIXct("2002-01-22",timezone="UTC"),]
## drop wikia edits made after 2010-04-10, when data was collected
if(wiki.list$wiki.type == "wikia"){
d <- d[d$date.time < as.POSIXct("2010-04-10",timezone="UTC"),]
}
# created "reverted" which captures whether an edit has been identity
# reverted within the revert RADIUS (currently 15 edits).
if (!any(d$revert)) {
d$reverted <- FALSE
## we need to reorder the columns in this case
## the merge in the other case also reorders columns
setcolorder(d,c("revid",names(d)[!grepl("revid",names(d))]))
} else {
reverteds <- d$reverteds[d$revert]
if (!any(grepl(",", d$reverteds))) {
reverteds <- unique(as.integer(as.character(d$reverteds)))
} else {
reverteds <- unique(as.integer(unlist(strsplit(as.character(reverteds), ","))))
}
reverteds <- data.table(revid=reverteds, reverted=TRUE)
d <- merge(d, reverteds, all.x=TRUE)
d$reverted[is.na(d$reverted)] <- FALSE
}
# "new.id" indicates whether this is a first-time editor
setkey(d, "date.time")
d$new.account <- !duplicated(d$editor)
d$new.account[is.na(d$editor)] <- FALSE
d$total.edits <- length(d$revid)
d$total.sessions <- seq(1, nrow(d))
d$total.editors <- cumsum(d$new.account)
d$total.pages <- cumsum(!duplicated(d$articleid))
## add the wiki name to the dt
## remove edits not in the namespaces we care about
d <- d[namespace %in% c(0,1,3,4),]
return(d)
}
load.wikiq.files <- function(i,wiki.list, path="wikiq_wikia_2010_all_nopersistence/"){
wiki.filename = wiki.list[i,filename]
wiki <- wiki.list[i,wiki]
print(wiki)
d <- load.wikiq.file(paste0(path,wiki.filename))
d$wiki.name <- rep(wiki,nrow(d))
d$wiki.type <- rep(wiki.list[i,wiki.type],nrow(d))
d[,time.first.edit := min(date.time),by=.(editor.id, wiki.name)]
return(d)
}
remember <- function (v, k, silent=FALSE) {
if (!exists("r")){
rfilename = "remember.RDS"
if(file.exists(rfilename)){
r <<- readRDS(rfilename)
}
else
r <<- list()
}
if (missing(k)) {
k <- deparse(substitute(v))
}
## save to the global r variable/list
r[[k]] <<- v
if (!silent) {
print(r[[k]])
flush.console()
}
saveRDS(r,"remember.RDS")
invisible(r[[k]])
}
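## Illustrative usage: remember(mean(newcomer.ds$survives), "p.survives") prints
## the value, stores it as r$p.survives, and writes the updated list to remember.RDS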
## make sure that appendix and nosave are always defined
if (!exists("appendix")) { appendix <- FALSE }
if (!exists("nosave")) { nosave <- FALSE }
if(!exists("plot.distribtuions")){plot.distributions <- FALSE}
basedir <- "."
setwd(basedir)
include.wikipedia <- FALSE
if (!exists("wiki.list")) {
subdir <- "userroles_data/"
if (!exists("missing.wikis")){
deleted.wikis <- fread(paste0(subdir,"allusers_deleted_merge.txt"),header=FALSE,col.names=c("wiki"))
deleted.wikis <- unique(deleted.wikis$wiki)
notauthorized.wikis <- fread(paste0(subdir,"allusers_notauthorized_merge.txt"),header=FALSE,col.names=c("wiki"))
notauthorized.wikis <- unique(notauthorized.wikis$wiki)
missing.wikis = c(deleted.wikis, notauthorized.wikis)
remember(deleted.wikis)
remember(notauthorized.wikis)
}
wiki.list <- fread("selected.wikis.csv")
wiki.list <- wiki.list[! (wiki %in% missing.wikis) ]
wiki.list[wiki.type=="wikia",filename:=paste0(wiki,".tsv")]
if(include.wikipedia){
matchidx <- wiki.list[wiki.type=="wikipedia",regexec("https://(.*)\\.wikipedia.org",url)]
lang <- sapply(regmatches(wiki.list[wiki.type=="wikipedia",url],matchidx),function (l) l[2])
lang <- gsub("-","_",lang)
wiki.list[wiki.type=="wikipedia",lang := lang]
wiki.list[wiki.type=="wikipedia",filename:=paste0(lang,"_wikipedia.tsv")]
}
else{
wiki.list <- wiki.list[wiki.type != "wikipedia"]
}
# wiki.list[,lang := NULL]
rm(missing.wikis)
}
if (!file.exists("wikis.used")){
write(wiki.list$wiki,"wikis.used")
}
options(mc.cores = 16)

845
lib-01-build_newcomer_table.R Normal file

@@ -0,0 +1,845 @@
# Library containing code for processing wikiq tsvs into datasets
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(urltools)
library(lubridate)
### is it more efficient to develop inside the loop or outside?
## with group by outside mclapply
## user system elapsed
## 3.743 8.112 6.219
## user system elapsed
## 609.715 592.603 638.172
## with group by inside mclapply
## user system elapsed
## 3.670 8.302 5.780
## user system elapsed
## 739.826 408.396 596.346
## conclusion: do as much outside mclapply as possible
build.newcomer.table.step1 <- function(wiki.list,
session.window.length = duration(1,units="hours"),
newcomer.period = duration(2*30,units="days"),
newcomer.sunset = duration(180,units="days"),
n.early.period.sessions = 1){
d.list <- mclapply(1:nrow(wiki.list),load.wikiq.files,wiki.list=wiki.list,mc.preschedule=F)
# d.list <- lapply(1:nrow(wiki.list),wiki.list=wiki.list,load.wikiq.files)
all.edits <- rbindlist(d.list)
all.edits[,
":="(time.first.edit = min(date.time),
time.last.edit = max(date.time)),
by=.(editor.id, wiki.name)]
all.edits[,
":="(editor=gsub("\"","",editor),
title=gsub("\"","",title),
reverteds=gsub("\"","",reverteds))]
all.edits <- all.edits[editor != "Default"]
all.edits[,month:=floor_date(date.time,unit="month")]
all.edits[,,by=.(wiki.name,editor)]
setkey(all.edits,wiki.name,editor.id,date.time)
## fix the definition of session to edits that have less than 1 hour together
all.edits[,":="(time.since.last.edit = diff(c(first(time.first.edit),date.time),lag=1,differences=1),
time.till.next.edit = diff(c(date.time,last(time.last.edit)),lag=1,differences=1),
editor.tenure =as.duration(max(date.time)-min(date.time))),
by=.(editor.id,wiki.name)]
all.edits[,":="(new.session = time.since.last.edit > session.window.length),by=.(editor.id,wiki.name)]
all.edits[,":="(nth.session = cumsum(new.session)),by=.(editor.id,wiki.name)]
all.edits[,":="(in.early.session = nth.session < n.early.period.sessions)]
all.edits[,
":="(is.reverted = any(reverted),
is.deleted = any(deleted),
p.reverted = mean(reverted & namespace ==0),
n.first.session=nrow(.SD[in.early.session==TRUE])),
by=.(editor.id,wiki.name)]
all.edits[,":="(age = as.duration(date.time - time.first.edit))]
all.edits[,":="(last.wiki.edit = max(date.time)),by=.(wiki.name)]
all.edits[,":="(is.newcomer = (age < newcomer.period) & (as.duration(last.wiki.edit - time.first.edit) > as.duration(newcomer.sunset)) & !anon)]
## did rejecting editors leave a comment on the talk page?
return(all.edits)
}
add.userroles <- function(all.edits,bots,admins){
bots[,":="(wiki.name = wiki,
editor = user
),
by=.(wiki,user)
]
admins[,":="(wiki.name = wiki,
editor = user),
by=.(wiki,user)]
all.edits[bots,
":="(
is.bot = i.is.bot
),
on=.(wiki.name,
editor,
date.time >= role.period.begin,
date.time <= role.period.end)
]
all.edits[admins,
":="(
is.admin = i.is.admin
),
on=.(wiki.name,
editor,
date.time >= role.period.begin,
date.time <= role.period.end)
]
all.edits[,":="(is.bot = ifelse(is.na(is.bot),FALSE,is.bot),
is.admin = ifelse(is.na(is.admin),FALSE,is.admin))]
all.edits[,":="(is.newcomer = (is.newcomer & !is.bot))]
return(all.edits)
}
identify.revert.messages <- function(all.edits, discussion.window = as.difftime(7,units="days"),week.length=as.difftime(7,units="days")){
all.edits[,user.talk:=as.factor(paste0("User talk:",as.character(all.edits$editor)))]
## join the talk page edits with the reverted edits
all.edits[namespace==0,talk:=as.factor(paste0("Talk:",as.character(all.edits[namespace==0]$title)))]
print(" identifying reverts")
all.edits[!is.na(reverteds),reverted.edits := lapply(strsplit(reverteds,","),strtoi)]
all.edits[!is.na(reverteds),N.reverteds := lapply(reverted.edits,length)]
ns.edits = all.edits[namespace==0 | namespace==4]
reverted.lookup <- ns.edits[!is.na(reverteds),
.(revid = unlist(reverted.edits),
wiki.name = rep(wiki.name,N.reverteds),
reverted.by = rep(editor,N.reverteds),
reverted.by.bot = rep(is.bot, N.reverteds),
reverted.by.admin = rep(is.admin, N.reverteds),
revert.date.time = rep(date.time,N.reverteds),
revert.id = rep(revid,N.reverteds))]
reverted.edits <- ns.edits[reverted==TRUE]
reverted.edits[reverted.lookup,
":="(reverted.by = i.reverted.by,
reverted.by.bot = i.reverted.by.bot,
reverted.by.admin = i.reverted.by.admin,
revert.date.time = i.revert.date.time,
revert.id = i.revert.id),
on=.(wiki.name,revid)]
reverted.edits[,message.window.end:= revert.date.time + discussion.window]
## merge back revert info to all.edits
all.edits[reverted.edits,":="(
reverted.by = i.reverted.by,
reverted.by.bot = i.reverted.by.bot,
reverted.by.admin = i.reverted.by.admin,
revert.date.time = i.revert.date.time,
revert.id = i.revert.id,
message.window.end = message.window.end),
on = .(wiki.name, revid)]
print(" done")
print(" identifying editor talk page edits")
ns0.edits = all.edits[namespace==0]
## we want talkers who talk before the end of the window
talk.page.edits = all.edits[namespace==1]
talk.page.edits[,talk:=title]
## we only need to keep the key identifier for each revert
## use editor + title instead of revid since editors may have more than
## one edit reverted by a given revert.id.
## key = wiki.name,editor,title,revert.id,
setkeyv(reverted.edits,c("wiki.name","editor","title","revert.id"))
## condition where editor discusses after being reverted
editor.talks <- reverted.edits[talk.page.edits,
.(
wiki.name,
editor = x.editor,
revert.id = x.revert.id,
talk.id = i.revid,
talk.date.time=i.date.time
)
,on=.(editor,
wiki.name,
talk,
revert.date.time<date.time,
message.window.end>=date.time)
,nomatch=0L]
editor.talks <- editor.talks[,
.(
editor.talks = TRUE,
time.editor.talks = min(talk.date.time),
editor.talks.revid = min(talk.id)
),
by = .(wiki.name,editor,revert.id)
]
## merge back reverted edits to all.edits
all.edits[editor.talks,
":="(editor.talks = editor.talks,
time.editor.talks = time.editor.talks,
editor.talks.revid=editor.talks.revid),
on=.(wiki.name,editor,revert.id)]
## tidy up
rm(editor.talks, reverted.lookup)
print(" done")
print(" identifying reverter talk page edits")
all.edits[,":="(response.window.end = time.editor.talks + discussion.window)]
all.edits[(reverted==TRUE & is.na(editor.talks)), editor.talks := FALSE]
ns0.edits = all.edits[namespace==0]
reverted.edits <- ns0.edits[reverted==TRUE]
talk.page.edits <- all.edits[namespace==1]
talk.page.edits[,":="(talk = title,reverted.by=editor)]
# the key is still wiki.name, editor, revert.id
reverter.talks <- reverted.edits[talk.page.edits,
.(
wiki.name = wiki.name,
editor = x.editor,
revert.id = x.revert.id,
revert.date.time = x.revert.date.time,
time.reverter.talks = i.date.time,
reverter.talk.id = i.revid
),
on=.(reverted.by,
wiki.name,
talk,
revert.date.time<date.time,
response.window.end>=date.time),
nomatch=0L]
reverter.talks <- reverter.talks[time.reverter.talks > revert.date.time,
.(
reverter.talks = TRUE,
time.reverter.talks = min(time.reverter.talks),
reverter.talk.id = min(reverter.talk.id)
),
by=.(wiki.name,editor,revert.id)
]
## merge back reverted.edits to all.edits
all.edits[reverter.talks,
":="(reverter.talks = reverter.talks,
time.reverter.talks = time.reverter.talks,
reverter.talk.id = reverter.talk.id),
on=.(wiki.name,editor,revert.id)]
## tidy up
rm(reverter.talks,talk.page.edits)
all.edits[(reverted == TRUE) & (is.na(reverter.talks)), reverter.talks := FALSE]
# if the editor didn't talk first, the time window is different
all.edits[reverter.talks == TRUE,
editor.talks.first := (time.editor.talks < time.reverter.talks)]
all.edits[(reverter.talks == TRUE) & (editor.talks.first==FALSE),
reverter.talks := time.reverter.talks < (date.time + discussion.window)]
print(" done")
print(" identifying User talk page edits")
## now do the same thing but for user talk pages
## did the reverter post on the editor's user talk page?
## key is wiki.name, title, reverted.by, revert.id
ns0.edits = all.edits[namespace==0]
user.talk.edits = all.edits[namespace==3]
user.talk.edits[,":="(reverted.by=editor,user.talk=title)]
reverted.edits = ns0.edits[reverted==TRUE]
reverter.messages = reverted.edits[user.talk.edits,
.(wiki.name = x.wiki.name,
title = x.title,
revert.id = x.revert.id,
editor = x.editor,
reverted.by = i.reverted.by,
time.reverter.messages=i.date.time,
reverter.messages.id=i.revid),
on=.(wiki.name,
reverted.by,
user.talk,
revert.date.time <= date.time,
message.window.end >= date.time
),
nomatch=0L]
reverter.messages = reverter.messages[,.(reverter.messages = TRUE,
time.reverter.messages = min(time.reverter.messages),
reverter.message.id = min(reverter.messages.id)),
by=.(wiki.name, editor, reverted.by, revert.id)]
reverted.edits[reverter.messages,":="(reverter.messages = reverter.messages,
time.reverter.messages = time.reverter.messages,
reverter.message.id = reverter.message.id),
on=.(wiki.name, editor, revert.id)]
reverted.edits[is.na(reverter.messages), reverter.messages := FALSE]
all.edits[reverted.edits,":="(reverter.messages = reverter.messages,
time.reverter.messages = time.reverter.messages,
reverter.message.id = reverter.message.id),
on=.(wiki.name, editor, revert.id)]
## set some wiki-level variables
print(" creating wiki windows")
setorderv(all.edits,cols=c("wiki.name","date.time","articleid"),order=1L)
all.edits[,":="(chars.change = diff(c(0L,text.chars),lag=1,differences=1),
creates.article = (date.time == min(date.time))
),by=.(wiki.name,articleid)]
setorderv(all.edits,cols=c("wiki.name","date.time","articleid"),order=1L)
# Some wikis were created by Wikia itself, so wiki age computed from the first edit can be misleading unless that editor is removed
all.edits[,":="(wiki.birth.date = min(date.time)),by=.(wiki.name)]
all.edits[,":="(total.wiki.length = cumsum(chars.change),
n.articles = cumsum(creates.article),
wiki.age = as.duration(date.time - wiki.birth.date),
year = year(date.time)
),by=.(wiki.name)]
all.edits[,":="(wiki.age.months = floor(as.double(wiki.age,units='days')/30),
wiki.age.years = floor(as.double(wiki.age,units='years')))]
## generate breaks at precisely 1 week +/- the first edit.
date.range <- all.edits[,.(first.edit = min(date.time),last.edit = max(date.time)), by = .(wiki.name)]
window.breaks <- date.range[,.(breaks = seq(trunc(first.edit,"days"),
trunc(last.edit,"days"),
by=week.length),
break.next = seq(trunc(first.edit+week.length,"days"),
trunc(last.edit+week.length,"days"),
by=week.length)),
by=.(wiki.name)]
window.breaks[,
":="(i.break = seq_along(breaks))
,by=.(wiki.name)]
all.edits[window.breaks,
":="(week = i.break
),
on=.(wiki.name, date.time <=break.next,date.time >=breaks)]
print(" done")
## tidy up
all.edits[,":="(reverted.edits = NULL,
N.reverteds = NULL,
user = NULL,
user.talk = NULL,
talk=NULL,
message.window.end=NULL,
response.window.end=NULL)]
print(" done")
rm(reverted.edits,reverter.messages,user.talk.edits,ns0.edits)
return(all.edits)
}
build.newcomers <- function(all.edits,
newcomer.period = duration(60,unit="days"),
newcomer.sunset= duration(30*6,unit="days")
){
setkeyv(all.edits,'date.time')
all.edits[,":="(time.last.edit.to.wiki = max(date.time)), by=.(wiki.name)]
## measure inter-edit gaps in days so units are consistent across pages
all.edits <- all.edits[,time.till.page.edit := c(as.numeric(diff(date.time),units='days'),NA_real_),by=.(wiki.name,articleid)]
all.edits <- all.edits[,last.edit.to.page :=is.na(time.till.page.edit)]
all.edits[last.edit.to.page == TRUE,time.till.page.edit := as.numeric(time.last.edit.to.wiki-date.time,units='days')]
all.edits <- all.edits[,time.till.page.edit := log1p(time.till.page.edit)]
editor.variables <- all.edits[,
.(survives = any((age > newcomer.period) & (age < newcomer.sunset)),
  anon = first(anon),
  is.bot = any(is.bot),
  is.admin = any(is.admin)),
by = .(wiki.name,editor)
]
first.session.edits <- all.edits[in.early.session==TRUE]
first.session.edits[,":="(end.newcomer.period = time.first.edit + newcomer.period)]
print(" aggregating newcomer activity within wikis")
newcomers <- first.session.edits[namespace == 0,
.(
is.reverted = any(reverted & reverted.by != editor),
p.reverted = first(p.reverted),
is.bot.reverted = any(reverted.by.bot),
is.admin.reverted = any(reverted.by.admin),
is.reverted.messaged = any(reverter.messages |
reverter.talks,na.rm=TRUE),
reverter.talks = any(reverter.talks, na.rm=TRUE),
reverter.messages = any(reverter.messages, na.rm=TRUE),
editor.talks = any(editor.talks,na.rm=TRUE),
time.next.page.edit = min(time.till.next.edit, na.rm=TRUE),
BRD.initiation = any(editor.talks &
(editor.talks.first |
!reverter.talks), na.rm = TRUE),
BRD.reciprocation = any(editor.talks &
editor.talks.first &
reverter.talks, na.rm = TRUE),
reverter.initates.BRD = any(reverter.talks & (!editor.talks.first |
is.na(editor.talks.first)),na.rm=TRUE),
time.first.edit = first(time.first.edit),
time.till.page.edit = min(time.till.page.edit),
last.edit.to.page = all(last.edit.to.page),
end.newcomer.period = first(end.newcomer.period),
week = first(week),
year = first(year(time.first.edit)),
newcomer.edits = .N,
session.edits = first(n.first.session),
ns0.edits = sum(namespace == 0),
ns1.edits = sum(namespace == 1),
ns4.edits = sum(namespace == 4),
newcomer.chars.change = sum(chars.change),
newcomer.creates.article = any(creates.article),
wiki.type = first(wiki.type),
wiki.age = first(wiki.age)
),
by = .(wiki.name, editor)
]
newcomers[editor.variables,":="(survives = survives,is.bot=is.bot,is.admin=is.admin), on=.(wiki.name,editor)]
newcomers <- newcomers[!is.bot & !is.admin]
print(" done")
print(" identifying newcomer activity on other wikis")
newcomer.prior.wikis <- first.session.edits[newcomers,
.(
editor = editor,
wiki.name = i.wiki.name,
other.wiki = x.wiki.name,
time.first.edit.this = i.time.first.edit,
time.first.edit.other = x.time.first.edit
),
on=.(wiki.type,editor,time.first.edit < time.first.edit),
nomatch=0L,
allow.cartesian = TRUE
]
# using < time.first.edit should exclude edits to this wiki
newcomer.prior.wikis <- newcomer.prior.wikis[,.(n.edits.other = .N),
by=.(editor,wiki.name,other.wiki)]
newcomer.prior.wikis <- newcomer.prior.wikis[,
.(n.other.wikis = .N,
n.edits.other = sum(n.edits.other)),
by=.(wiki.name,editor)]
newcomer.prior.wikis <- newcomer.prior.wikis[newcomers,
.(
wiki.name=wiki.name,
editor=editor,
n.other.wikis = n.other.wikis,
n.edits.other = n.edits.other,
has.edited.other.wikis = (n.other.wikis > 0) & (!is.na(n.other.wikis))),
on=.(wiki.name,editor),
nomatch=NA]
newcomers <- newcomers[newcomer.prior.wikis,
":="(n.other.wikis = ifelse(is.na(i.n.other.wikis),0,i.n.other.wikis),
n.edits.other = ifelse(is.na(i.n.edits.other),0,i.n.edits.other),
has.edited.other.wikis = (i.n.other.wikis > 0) & (!is.na(i.n.other.wikis))),
on=.(wiki.name, editor)
]
newcomers[,":="(has.edited.other.wikis = ifelse(is.na(has.edited.other.wikis),FALSE,has.edited.other.wikis),
n.edits.other = ifelse(is.na(n.edits.other),0,n.edits.other),
n.other.wikis = ifelse(is.na(n.other.wikis),0,n.other.wikis)
)]
print(" done")
print(" identifying all messages")
user.talk.edits <- all.edits[namespace==3]
user.talk.edits[,user.talk:=title]
newcomers[,user.talk:= as.factor(paste0("User talk:",as.character(editor)))]
newcomer.messages <- user.talk.edits[newcomers,
.(
editor = i.editor,
n.messages = .N,
end.newcomer.period = i.end.newcomer.period
),
on=.(wiki.name,user.talk,date.time <= end.newcomer.period),
by=.EACHI,
nomatch=0L]
newcomer.messages <- newcomer.messages[newcomers,
.(wiki.name,
editor,
n.messages = x.n.messages,
is.messaged = (x.n.messages > 0) & (!is.na(x.n.messages))),
on=.(wiki.name,editor),
nomatch = NA]
newcomers <- newcomers[newcomer.messages,
":="(n.messages = ifelse(is.na(i.n.messages),0L,i.n.messages),
is.messaged = ifelse(is.na(i.n.messages),FALSE,i.is.messaged)),
on=.(wiki.name,editor)]
last.edit <- max(all.edits$date.time)
last.wikia.edit <- max(all.edits[wiki.type=="wikia",date.time])
newcomers <- newcomers[time.first.edit < last.edit - as.difftime(60,units="days")]
newcomers <- newcomers[(wiki.type == "wikia") & (time.first.edit < (last.wikia.edit - as.difftime(60,units="days")))]
print(" done")
return(newcomers)
}
build.namespace4.dataset <- function(all.edits, week.length = as.difftime(7,units="days")){
ns4.reg.edits <- all.edits[(namespace==4) & (anon==FALSE)]
return(ns4.reg.edits)
}
build.wiki.level.variables <- function(all.edits, week.length = as.difftime(7,units="days")){
wiki.data <- all.edits[,.(n.editors = length(unique(editor)),
total.wiki.length=last(total.wiki.length)
)
,by=.(wiki.name,week)]
wiki.ns4.data <- all.edits[namespace==4,
.(n.ns4.edits = .N,
n.ns4.editors = length(unique(editor)),
d.ns4.length = sum(chars.change),
ns4.editor.age = mean(age)
),
by=.(wiki.name, week)]
wiki.ns0.data <- all.edits[namespace==0,
.(revert.rate = mean(reverted,na.rm=TRUE),
newcomer.revert.rate = sum((reverted & is.newcomer),na.rm=TRUE)/sum(is.newcomer,na.rm=TRUE),
revert.disc.rate = sum((reverted & reverter.talks),na.rm=TRUE)/sum(reverted,na.rm=TRUE),
newcomer.revert.disc.rate = sum((reverted & reverter.talks & is.newcomer),na.rm=TRUE)/ sum(reverted & is.newcomer,na.rm=TRUE),
revert.message.rate = sum((reverted & reverter.messages),na.rm=TRUE)/sum(reverted,na.rm=TRUE),
newcomer.revert.message.rate = sum((reverted & reverter.messages & is.newcomer),na.rm=TRUE)/sum((reverted & is.newcomer),na.rm=TRUE),
newcomer.edits.rate = mean(is.newcomer,na.rm=TRUE),
bot.revert.rate = mean(reverted.by.bot,na.rm=TRUE),
bot.revert.prop = sum(reverted.by.bot,na.rm=TRUE)/sum(reverted,na.rm=TRUE),
newcomer.bot.revert.rate = mean((reverted.by.bot & is.newcomer),na.rm=TRUE),
newcomer.bot.revert.prop = sum((reverted.by.bot & is.newcomer),na.rm=TRUE)/sum((reverted & is.newcomer),na.rm=TRUE),
admin.revert.rate = mean(reverted.by.admin,na.rm=TRUE),
admin.revert.prop = sum(reverted.by.admin,na.rm=TRUE)/sum(reverted,na.rm=TRUE),
year = year(first(date.time)),
month = month(first(date.time))),
by=.(wiki.name,week)]
## replace NAs with 0
wiki.ns0.data[,
":="(
# revert.rate = ifelse(is.na(revert.rate),0,revert.rate),
revert.disc.rate = ifelse(is.na(revert.disc.rate),0,revert.disc.rate),
newcomer.revert.disc.rate = ifelse(is.na(newcomer.revert.disc.rate),0,newcomer.revert.disc.rate),
revert.message.rate = ifelse(is.na(revert.message.rate),0,revert.message.rate),
newcomer.revert.message.rate = ifelse(is.na(newcomer.revert.message.rate),0,newcomer.revert.message.rate),
newcomer.edits.rate = ifelse(is.na(newcomer.edits.rate),0,newcomer.edits.rate),
bot.revert.rate = ifelse(is.na(bot.revert.rate),0,bot.revert.rate),
bot.revert.prop = ifelse(is.na(bot.revert.prop),0,bot.revert.prop),
newcomer.bot.revert.rate = ifelse(is.na(newcomer.bot.revert.rate),0,newcomer.bot.revert.rate),
newcomer.bot.revert.prop = ifelse(is.na(newcomer.bot.revert.prop),0,newcomer.bot.revert.prop),
admin.revert.rate = ifelse(is.na(admin.revert.rate),0,admin.revert.rate),
admin.revert.prop = ifelse(is.na(admin.revert.prop),0,admin.revert.prop)),
]
## bring it together
wiki.data[wiki.ns0.data,
":="(
revert.rate = i.revert.rate,
revert.disc.rate = i.revert.disc.rate,
newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
revert.message.rate = i.revert.message.rate,
newcomer.revert.message.rate = i.newcomer.revert.message.rate,
newcomer.edits.rate = i.newcomer.edits.rate,
bot.revert.rate = i.bot.revert.rate,
bot.revert.prop = i.bot.revert.prop,
newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
admin.revert.rate = i.admin.revert.rate,
admin.revert.prop = i.admin.revert.prop),
on=.(wiki.name,week)]
wiki.data[wiki.ns4.data,
":="(
n.ns4.edits = i.n.ns4.edits,
n.ns4.editors = i.n.ns4.editors,
d.ns4.length = i.d.ns4.length,
ns4.editor.age = i.ns4.editor.age
),
on=.(wiki.name,week)]
# create variables for community size in standard deviation units
return(wiki.data)
}
load.all.edits <- function(){
if(!exists("all.edits")){
file.name <- "all.edits.RDS"
if(!file.exists(file.name)){
print("loading wikiq data")
all.edits <- build.newcomer.table.step1(wiki.list, newcomer.period = newcomer.period)
print("done")
print("adding user role data")
all.edits <- add.userroles(all.edits,bots=bots,admins=admins)
print("done")
print("identifying reverts and messages")
all.edits <- identify.revert.messages(all.edits,week.length=as.difftime(7,units="days"))
print("done")
if(!nosave){
print("saving work")
saveRDS(all.edits,file.name)
print("done")
}
} else{
print("loading wikiq data with reverts and messages")
all.edits <- readRDS(file.name)
print("done")
}
remember(min(all.edits$date.time),"earliest.data.point")
remember(max(all.edits$date.time),"latest.data.point")
## make all.edits a global variable
all.edits <<- all.edits
}
}
newcomer.period = duration(2*30,unit="days")
newcomer.sunset = duration(30*6,unit="days")
week.length=duration(7,unit="days")
remember(newcomer.period)
remember(newcomer.sunset)
remember(week.length)
## try loading newcomers
if(!exists("newcomers")){
file.name2 <- "newcomers.RDS"
if(file.exists(file.name2)){
newcomers <- readRDS(file.name2)
} else{
print("building newcomers table")
load.all.edits()
newcomers <- build.newcomers(all.edits,
newcomer.sunset = newcomer.sunset,
newcomer.period=newcomer.period)
print("done")
print("saving work")
if(!nosave){
saveRDS(newcomers,file.name2)
}
}
}
if(!exists("ns4.reg.edits")){
file.name <- "ns4.reg.edits.RDS"
if(file.exists(file.name)){
ns4.reg.edits <- readRDS(file.name)
} else{
print("building ns4 edits table")
## create table of namespace 4 edits from all edits
load.all.edits()
ns4.reg.edits <- build.namespace4.dataset(all.edits)
print("done")
print("saving work")
if(!nosave){
saveRDS(ns4.reg.edits,file.name)
}
}
}
if(!exists("wiki.data")){
file.name3 <- "wikiweeks.RDS"
if(!file.exists(file.name3)){
print("building wiki level variable")
load.all.edits()
wiki.data <- build.wiki.level.variables(all.edits, week.length=week.length)
print("done")
print("saving work")
if(!nosave){
saveRDS(wiki.data,file.name3)
}
print("done")
}
else{
wiki.data <- readRDS(file.name3)
}
}
#wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
#remember(nrow(wikis.to.remove),"n.wikis.insufficient.newcomers")
#newcomers <- newcomers[!(wiki.name %in% wikis.to.remove)]
#all.edits <- all.edits[!(wiki.name %in% wikis.to.remove)]
if(!exists("wiki.stats")){
file.name <- "wiki.stats.RDS"
if(!file.exists(file.name)){
load.all.edits()
editor.tenures <- all.edits[,.(tenure=first(editor.tenure)),by=.(wiki.name,editor)]
wiki.stats <- all.edits[,.(total.editors = length(unique(editor)),
total.edits = .N,
total.reverts = sum(reverted),
total.bot.reverts = sum(reverted.by.bot,na.rm=TRUE),
total.ns4.edits = nrow(.SD[namespace==4]),
med.edit.tenure = median(editor.tenure)
),by=.(wiki.name)]
med.editor.tenure <- editor.tenures[,.(med.editor.tenure=median(tenure)),by=.(wiki.name)]
wiki.stats[med.editor.tenure,med.tenure := med.editor.tenure,on="wiki.name"]
newcomer.stats <- newcomers[,.(retention.rate = mean(survives),
reverted.newcomers = sum(is.reverted)
),by=.(wiki.name)]
wiki.stats <- wiki.stats[newcomer.stats,':='(retention.rate = retention.rate, reverted.newcomers = reverted.newcomers), on="wiki.name"]
remember(wiki.stats,silent=TRUE)
saveRDS(wiki.stats,file.name)
} else {
wiki.stats <- readRDS("wiki.stats.RDS")
}
}
row1 <- c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits")
row2 <- c("med.editor.tenure","retention.rate")
m.wiki.stats <- melt(wiki.stats,id='wiki.name',measure.vars = c("total.editors","total.reverts","total.bot.reverts","total.ns4.edits"))
m.wiki.stats[variable %in% row1, ":="(row = 1,col=which(row1 == variable,useNames=F)),by=variable]
m.wiki.stats[variable %in% row2, ":="(row = 2,col=which(row2 == variable,useNames=F)),by=variable]
m.wiki.stats <- m.wiki.stats[value != 0 | variable != "total.bot.reverts"]
m.wiki.stats <- m.wiki.stats[value == 0 & variable != "total.bot.reverts", value := 1]
friendly.var <- function(varname){
sapply(as.character(varname),function(f) switch(f,
total.editors='Editors',
total.reverts='Reverts',
total.bot.reverts='Bot reverts',
total.ns4.edits='Edits to the project namespace'))
}
var.id <- function(varname){
sapply(as.character(varname),function(f) switch(f,
total.editors=1,
total.reverts=2,
total.bot.reverts=3,
total.ns4.edits=4))
}
med.line.width <- 1
m.wiki.stats[,variable := friendly.var(variable)]
m.wiki.stats <- m.wiki.stats[,variable:=factor(variable,levels=c('Editors',"Reverts","Bot reverts","Edits to the project namespace"))]
spoke.data <- m.wiki.stats[,.(y = median(value)),by=variable]
remember(m.wiki.stats)
remember(spoke.data)
remember(nrow(wiki.stats),"n.wikia.wikis")
## join wiki-level variables with newcomer variables to get ready to model newcomer retention.
newcomers <- newcomers[wiki.data,
":="(
wiki.name=i.wiki.name,
week = i.week,
n.editors = i.n.editors,
total.wiki.length = i.total.wiki.length,
revert.rate = i.revert.rate,
revert.disc.rate = i.revert.disc.rate,
newcomer.revert.disc.rate = i.newcomer.revert.disc.rate,
revert.message.rate = i.revert.message.rate,
newcomer.revert.message.rate = i.newcomer.revert.message.rate,
newcomer.edits.rate = i.newcomer.edits.rate,
bot.revert.rate = i.bot.revert.rate,
bot.revert.prop = i.bot.revert.prop,
newcomer.bot.revert.rate = i.newcomer.bot.revert.rate,
newcomer.bot.revert.prop = i.newcomer.bot.revert.prop,
admin.revert.rate = i.admin.revert.rate,
admin.revert.prop = i.admin.revert.prop,
n.ns4.edits = i.n.ns4.edits,
n.ns4.editors = i.n.ns4.editors,
d.ns4.length = i.d.ns4.length,
ns4.editor.age = i.ns4.editor.age,
wiki.age.weeks = as.double(wiki.age,units='days')/7,
wiki.age.months = floor(as.double(wiki.age,units='days')/30),
wiki.age.half.years = floor(as.double(wiki.age,units='years')*2),
wiki.age.years = floor(as.double(wiki.age,units='years')),
quarter = factor(floor_date(time.first.edit,unit="3 months"))
),
on=.(wiki.name,week)
]
survival.data <- newcomers[,.(survival.rate = mean(survives),
                              n.newcomers = .N),
                           by = .(wiki.name, week)]
wiki.data <- wiki.data[survival.data,
":="(
survival.rate = survival.rate,
n.newcomers = n.newcomers),
on = .(wiki.name,week)]
file.name <- "active.editors.RDS"
if(!file.exists(file.name)){
load.all.edits()
active.editors <- all.edits[,
.(N.edits=.N,
wiki.age.years=first(wiki.age.years)),
by=.(wiki.name,
editor,
wiki.age.months)]
saveRDS(active.editors, file.name)
} else {
active.editors <- readRDS(file.name)
}

View File

@@ -0,0 +1,85 @@
# Processes data from the Wikia API to identify bots and admins
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
library(devtools)
load_all("RCommunityData")
# Get files for a wiki
load.rights.files <- function (filename) {
wiki <- gsub('\\.tsv$', '', filename)
print(wiki)
logevents <- read.delim(paste("logevents-2017/", filename, sep=""),
stringsAsFactors=FALSE, encoding="UTF-8", quote="")
current.userroles <- read.delim(paste("userlist-2017/", filename, sep=""),
stringsAsFactors=FALSE, na.string="",
encoding="UTF-8", header=TRUE)
d <- generate.admin.addrm(logevents, current.userroles)
d$wiki <- wiki
return(d)
}
setwd("userroles_data/")
wiki.files = paste0(wiki.list$wiki,".tsv")
userroles <- rbindlist(lapply(wiki.files, load.rights.files))
userroles$blocked <- grepl('^<span class="listusers_blockeduser">(.*?)$', userroles$role)
userroles$role <- gsub('^<span class="listusers_blockeduser">(.*?)$','\\1', userroles$role)
userroles$role <- gsub('^(.*?)</span>$','\\1', userroles$role)
userroles[, is.action.admin := (role %in% c("sysop", "bureaucrat","sysop,bureaucrat","staff","admin","fanonadmin","steward"))]
userroles[, is.action.bot := (role %in% c("bot", "fyzbot","bot-global"))]
bots = userroles[is.action.bot==TRUE]
admins = userroles[is.action.admin==TRUE]
setorder(bots,"timestamp")
setorder(admins,"timestamp")
## we want to keep track of when the roles changed
## assume nobody was a bot or admin at the beginning of Mediawiki
## userroles[,':='(
## prev.isbot = ifelse(is.na(prev.isbot),(isbot & action=="removed"),prev.isbot)
bots[,
":="(
role.period.begin = timestamp,
role.period.end = shift(timestamp,fill=as.POSIXct("2017-01-01"),type="lead"))
,by = .(wiki,user)
]
bots[,":="(is.bot = (action == "added"))]
admins[,
":="(
role.period.begin = timestamp,
role.period.end = shift(timestamp,fill=as.POSIXct("2017-01-01"),type="lead"))
,by = .(wiki,user)
]
admins[,":="(is.admin = (action == "added") )]
# save data to an output file for knitr
setwd("..");
rm(load.rights.files)
rm(wiki.files,userroles)
if (!nosave) {
saveRDS(bots, file="bots.RDS")
saveRDS(admins, file="admins.RDS")
saveRDS(r, file="lib-01-generate_userroles.RDS")
}

57
lib-01-sample-datasets.R Normal file
View File

@@ -0,0 +1,57 @@
# Functions for creating samples of datasets
# Copyright (C) 2018 Nathan TeBlunthuis
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
sample.by.wiki <- function(dt,outcome,N.per.wiki=30){
set.seed(0)
sample.params <- dt[,.N,by=wiki.name]
sample.params[,group.sd := dt[,.(group.sd=sd(.SD[[outcome]])),by=wiki.name]$group.sd]
sample.params[,p.in.group := N / nrow(dt)]
sample.params[,min.N := min(N)]
sample.params[,n.from.group := pmin(min.N/(1-group.sd), N)]
sample.params[,p.sampled := n.from.group/N]
sample.params[,weight := 1/p.sampled]
dt[sample.params,":="(prob=p.sampled,weight=weight),on=.(wiki.name)]
sample.idx <- sample(nrow(dt),size=sum(sample.params$n.from.group,na.rm=TRUE),prob=dt$prob)
return(dt[sample.idx])
}
sample.newcomers <- function()
{
wikis.to.remove <- newcomers[,.N,by="wiki.name"][N<30]$wiki.name
remember(length(wikis.to.remove),"n.wikis.insufficient.newcomers")
newcomers.presample <- newcomers[!(wiki.name %in% wikis.to.remove)]
newcomers.sample <- sample.by.wiki(newcomers.presample,"survives")
return(newcomers.sample)
}
sample.ns4.edits <- function(){
wikis.to.keep <- ns4.reg.edits[,.(.N,N.reverts=sum(reverted)),by=wiki.name][(N>30)&(N.reverts > 30)]
ns4.reg.edits.sub <- ns4.reg.edits[wiki.name %in% wikis.to.keep$wiki.name]
ns4.reg.edits.sample <- sample.by.wiki(ns4.reg.edits.sub,"reverted")
return(ns4.reg.edits.sample)
}
sample.wiki.data <- function(){
## just choose 100 random wikis
wikis.to.keep <- sample(unique(wiki.data$wiki.name),100)
wiki.data.sample <- wiki.data[wiki.name %in% wikis.to.keep]
return(wiki.data.sample)
}

5
mediawiki_dump_tools/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
*.xml.gz
*.7z
*.xml.bz2
*.xml.xz
*.swp

3
mediawiki_dump_tools/.gitmodules vendored Normal file
View File

@@ -0,0 +1,3 @@
[submodule "Mediawiki-Utilities"]
path = Mediawiki-Utilities
url = https://github.com/halfak/Mediawiki-Utilities.git

View File

@@ -0,0 +1,46 @@
# Demo files
demo_*
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# Temporary text editor files
*~
# C extensions
*.so
# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
#lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
# Sphinx documentation
doc/_build/
doc/.buildfile
*.toctree

View File

@@ -0,0 +1,19 @@
v0.4.4
======
Adds API helper for persistence tracking and example script.
v0.4.0
======
Adds api.collections.users
v0.3.8
======
Adds support for spaces in XML dump filenames when using the dump mapper.
v0.3.7
======
Fixes pickling issues in Timestamp

View File

@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2014 Aaron Halfaker
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1 @@
include LICENSE README.rst

View File

@@ -0,0 +1,25 @@
===================
MediaWiki Utilities
===================
MediaWiki Utilities is an open source (MIT Licensed) library developed by Aaron Halfaker for extracting and processing data from MediaWiki installations, slave databases and xml dumps.
**Install with pip:** ``pip install mediawiki-utilities``
**Note:** *Use of this library requires Python 3 or later.*
**Documentation:** http://pythonhosted.org/mediawiki-utilities/
About the author
================
:name:
Aaron Halfaker
:email:
aaron.halfaker@gmail.com
:website:
http://halfaker.info --
http://en.wikipedia.org/wiki/User:EpochFail
Contributors
============
None yet. See http://github.com/halfak/mediawiki-utilities. Pull requests are encouraged.

View File

@@ -0,0 +1,63 @@
2014-06-02
After some reading, it looks like py3 will do something reasonable with re-raised errors, so I'm just going to let the error be re-raised and call it good.
2014-05-31
I figured out that you just plain can't get a stack trace out of a multiprocessing.Process in such a way that you can re-associate it with its exception on the other side. I'm now working on putting together a picklable container exception that I can use to manage and format the exceptions that come out of a mapping function. It's not going great.
2014-04-08
I've been extending the API. I added list=deletedrevs and tested (fixed) the api.Session.login() method. It all seems to work now. I also did some minor cleanup on lib.title.Parser to make the method names more explicit.
I'd like to start tracking changes so that I can build changelists to go with new versions. For now, I'll keep track of substantial changes here.
* Released 0.2.1
* Added list=deletedrevs to api module
2014-03-27
I just fixed up the structure for lib.reverts.database.check() and check_row(). You can give check_row() a database row or check() a rev_id and page_id. The functions should then either return None or the first reverting revision they encounter.
I like this pattern. Lib gets to reference core, but not vice versa. I need to talk to the Wikimetrics people about implementing some of the metrics within a new lib. Yet, one of the cool things about libs is that they don't necessarily need to be packaged with core. So you could write something that makes use of core and other libs as a standalone package first and incorporate it later. :D
2014-03-20
Just a quick update today. I realized that database.DB.add_args was setting
default values that won't make sense for anyone but me personally. I cleared that up and added a way to set your own defaults.
2014-03-18
Refactoring! I've got a user. He immediately found problems. So I'm fixing them aggressively. I just renamed the library back to "mw". I also renamed the dump processing module to "xml_dump". I hope that these name changes will make more sense.
I also moved the revert detection functionality out of the database module and into the lib.reverts module. I think that this makes more sense. If it is a core functionality, it should live in core. If it is a library, it should only have other libraries depend on it. If I need to write a magical DB abstractor in lib, so be it.
2014-02-08
It's time to kill `mw.lib.changes`. I just don't see that working as a core
part of this library. It might make sense to build up another library
to handle changes. I'll have to get back to that at some other time.
2013-12-23
Still hacking on `mw.lib.changes`. It's the same set of issues described in
the last log. I'm making progress building a params parser. I think that my strategy is going to be to let the user handle params parsing themselves with a new `types.Protection` type.
Oh! And I did get `types.TimestampType` extended to have a `strptime` method.
That's all nice and tested.
Note that I think it might be a good idea to consolidate all defaults for
better documentation.
Anyway. All tests are passing. It's time to work on something else for a
little while.
2013-12-19
Still working on `mw.lib.changes`. I like the structure for the most part. It looks like I'm going to have to join `revision` and `logging` to `recentchanges` in order to construct an appropriate `change.Change` from a row. That means I'm going to need a funny new method on `database.RecentChanges`. That's going to confuse people. Boo.
I also need to figure out a way to configure for the lame timestamp format that appears in blocks and page protections. I think I'm going to extend `types.TimestampType` to have a `strptime` method.
2013-12-18
Tests passing. HistoricalMap was fine. Will be code-complete once lib.changes is done. Still need to figure out how I'm going to configure a title parser and pass it into the change constructor. Also, I rediscovered how stupid the recentchanges table is.
OK.. New lame thing. So, when you "protect" a page, the log keeps the following type of value in log_params:
``\u200e[edit=autoconfirmed] (expires 03:20, 21 November 2013 (UTC))``
That date format... It's not the long or short format for `Timestamp`. I think it is a custom format that changes on a wiki-to-wiki basis.
I feel sad. This made my day worse. It's important to remind myself of the fact that MediaWiki was not designed to allow me to reverse engineer it.
2013-12-17
Test on revert detector failing since simplifying restructure. I'm not sure what the issue is, but I suspect that I broke something in util.ordered.HistoricalMap. -halfak

View File

@@ -0,0 +1,5 @@
python3-mediawiki-utilities (0.4.16) UNRELEASED; urgency=medium
* Initial version of the package
-- yuvipanda <yuvipanda@riseup.net> Tue, 04 Aug 2015 16:42:51 -0700

View File

@@ -0,0 +1 @@
9

View File

@@ -0,0 +1,18 @@
Source: python3-mediawiki-utilities
Maintainer: Aaron Halfakar <aaron.halfakar@gmail.com>
Section: python
Priority: optional
Build-Depends: python3-setuptools, python3-all, debhelper (>= 9), python3-nose, python3-pymysql, python3-requests
Standards-Version: 3.9.6
Package: python3-mediawiki-utilities
Architecture: all
Depends: ${misc:Depends}, ${python3:Depends}
Description: Utilities for extracting and processing MediaWiki data
 Provides a python3 package for extracting and processing data from
 MediaWiki installations, slave databases and XML dumps.

View File

@@ -0,0 +1,26 @@
Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: mediawiki-utilities
Files: *
Copyright: 2014 Aaron Halfaker <aaron.halfaker@gmail.com>
License: MIT
License: MIT
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
.
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@@ -0,0 +1,4 @@
#!/usr/bin/make -f
%:
dh $@ --with python3 --buildsystem=pybuild

View File

@@ -0,0 +1,182 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS = -v
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/mediawiki-utilities.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/mediawiki-utilities.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/mediawiki-utilities"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/mediawiki-utilities"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
htmlzip: html
cd _build/html/ && \
zip -r ../../html.zip * && \
cd ../../

View File

@@ -0,0 +1,267 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# mediawiki-utilities documentation build configuration file, created by
# sphinx-quickstart on Thu Apr 10 17:31:47 2014.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys
import os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath('../'))
import mw
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.doctest',
'sphinx.ext.todo',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'mediawiki-utilities'
copyright = '2014, Aaron Halfaker'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = mw.__version__
# The full version, including alpha/beta/rc tags.
release = mw.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'mediawiki-utilitiesdoc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
('index', 'mediawiki-utilities.tex', 'mediawiki-utilities Documentation',
'Aaron Halfaker', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'mediawiki-utilities', 'mediawiki-utilities Documentation',
['Aaron Halfaker'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output -------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'mediawiki-utilities', 'mediawiki-utilities Documentation',
'Aaron Halfaker', 'mediawiki-utilities', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
#texinfo_no_detailmenu = False

View File

@@ -0,0 +1,77 @@
.. _mw.api:
===================================
mw.api -- MediaWiki API abstraction
===================================
This module contains a set of utilities for interacting with the MediaWiki API.
Here's an example of a common usage pattern:
>>> from mw import api
>>>
>>> session = api.Session("https://en.wikipedia.org/w/api.php")
>>>
>>> revisions = session.revisions.query(
... properties={'ids', 'content'},
... titles={"User:EpochFail"},
... direction="newer",
... limit=3
... )
>>>
>>> for rev in revisions:
... print(
... "rev_id={0}, length={1} characters".format(
... rev['revid'],
... len(rev.get('*', ""))
... )
... )
...
rev_id=190055192, length=124 characters
rev_id=276121340, length=132 characters
rev_id=276121389, length=124 characters
Session
=======
.. autoclass:: mw.api.Session
:members:
:member-order: bysource
Collections
===========
.. autoclass:: mw.api.DeletedRevisions
:members:
.. autoclass:: mw.api.Pages
:members:
.. autoclass:: mw.api.RecentChanges
:members:
.. autoclass:: mw.api.Revisions
:members:
.. autoclass:: mw.api.SiteInfo
:members:
.. autoclass:: mw.api.UserContribs
:members:
Errors
======
.. autoclass:: mw.api.errors.APIError
:members:
:inherited-members:
.. autoclass:: mw.api.errors.AuthenticationError
:members:
:inherited-members:
.. autoclass:: mw.api.errors.MalformedResponse
:members:
:inherited-members:

View File

@@ -0,0 +1,53 @@
.. _mw.database:
=========================================
mw.database -- MySQL database abstraction
=========================================
This module contains a set of utilities for interacting with MediaWiki databases.
Here's an example of a common usage pattern:
::
from mw import database
db = database.DB.from_params(
host="s1-analytics-slave.eqiad.wmnet",
read_default_file="~/.my.cnf",
user="research",
db="enwiki"
)
revisions = db.revisions.query(user_id=9133062)
for rev_row in revisions:
rev_row['rev_id']
DB
======
.. autoclass:: mw.database.DB
:members:
:member-order: bysource
Collections
===========
.. autoclass:: mw.database.Archives
:members:
.. autoclass:: mw.database.AllRevisions
:members:
.. autoclass:: mw.database.Pages
:members:
.. autoclass:: mw.database.RecentChanges
:members:
.. autoclass:: mw.database.Revisions
:members:
.. autoclass:: mw.database.Users
:members:

View File

@@ -0,0 +1,52 @@
.. _mw.xml_dump:
==================================
mw.xml_dump -- XML dump processing
==================================
.. automodule:: mw.xml_dump
The map() function
==================
.. autofunction:: mw.xml_dump.map
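Here's a sketch of a common usage pattern, closely following the library's
README; the dump file paths here are hypothetical:
::
from mw import xml_dump
files = ["dump1.xml", "dump2.xml.gz"]  # hypothetical dump paths
def page_info(dump, path):
    # runs in a worker process once per dump file
    for page in dump:
        yield page.id, page.namespace, page.title
for page_id, namespace, title in xml_dump.map(files, page_info):
    print("\t".join([str(page_id), str(namespace), title]))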
Iteration
=========
.. autoclass:: mw.xml_dump.Iterator
:members:
:member-order: bysource
.. autoclass:: mw.xml_dump.Page
:members:
:member-order: bysource
.. autoclass:: mw.xml_dump.Redirect
:members:
:member-order: bysource
.. autoclass:: mw.xml_dump.Revision
:members:
:member-order: bysource
.. autoclass:: mw.xml_dump.Comment
:members:
:member-order: bysource
.. autoclass:: mw.xml_dump.Contributor
:members:
:member-order: bysource
.. autoclass:: mw.xml_dump.Text
:members:
:member-order: bysource
Errors
======
.. autoclass:: mw.xml_dump.errors.FileTypeError
:members:
.. autoclass:: mw.xml_dump.errors.MalformedXML
:members:

View File

@@ -0,0 +1,100 @@
.. mediawiki-utilities documentation master file, created by
sphinx-quickstart on Thu Apr 10 17:31:47 2014.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
===================
MediaWiki Utilities
===================
MediaWiki Utilities is an open source (MIT Licensed) library developed by Aaron Halfaker for extracting and processing data from MediaWiki installations, slave databases and xml dumps.
**Install with pip:** ``pip install mediawiki-utilities``
**Note:** *Use of this library requires Python 3 or later.*
Types
=====
:ref:`mw.Timestamp <mw.types>`
A simple datatype for handling MediaWiki's various time formats.
Core modules
============
:ref:`mw.api <mw.api>`
A set of utilities for interacting with MediaWiki's web API.
* :class:`~mw.api.Session` -- Constructs an API session with a MediaWiki installation. Contains convenience methods for accessing ``prop=revisions``, ``list=usercontribs``, ``meta=siteinfo``, ``list=deletedrevs`` and ``list=recentchanges``.
:ref:`mw.database <mw.database>`
A set of utilities for interacting with MediaWiki's database.
* :class:`~mw.database.DB` -- Constructs a mysql database connector with convenience methods for accessing ``revision``, ``archive``, ``page``, ``user``, and ``recentchanges``.
:ref:`mw.xml_dump <mw.xml_dump>`
A set of utilities for processing MediaWiki's XML database dumps quickly and without dealing with streaming XML.
* :func:`~mw.xml_dump.map` -- Applies a function to a set of dump files (:class:`~mw.xml_dump.Iterator`) using :class:`multiprocessing` and aggregates the output.
* :class:`~mw.xml_dump.Iterator` -- Constructs an iterator over a standard XML dump. Dumps contain site_info and pages. Pages contain metadata and revisions. Revisions contain metadata and text. This is probably why you are here.
Libraries
=========
:ref:`mw.lib.persistence <mw.lib.persistence>`
A set of utilities for tracking the persistence of content between revisions.
* :class:`~mw.lib.persistence.State` -- Constructs an object that represents the current content persistence state of a page. Reports useful details about the persistence of content when updated.
:ref:`mw.lib.reverts <mw.lib.reverts>`
A set of utilities for performing revert detection
* :func:`~mw.lib.reverts.detect` -- Detects reverts in a sequence of revision events. (A usage sketch follows this list.)
* :class:`~mw.lib.reverts.Detector` -- Constructs an identity revert detector that can be updated manually over the history of a page.
:ref:`mw.lib.sessions <mw.lib.sessions>`
A set of utilities for grouping revisions and other events into sessions
* :func:`~mw.lib.sessions.cluster` -- Clusters a sequence of user actions into sessions.
* :class:`~mw.lib.sessions.Cache` -- Constructs a cache of recent user actions that can be updated manually in order to detect sessions.
:ref:`mw.lib.title <mw.lib.title>`
A set of utilities for normalizing and parsing page titles
* :func:`~mw.lib.title.normalize` -- Normalizes a page title.
* :class:`~mw.lib.title.Parser` -- Constructs a parser with a set of namespaces that can be used to parse and normalize page titles.
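
For example, revert detection over API-fetched revisions looks like this
(adapted from the examples shipped with this repository):

::

    from mw.api import Session
    from mw.lib import reverts

    api_session = Session("https://en.wikipedia.org/w/api.php")
    revs = api_session.revisions.query(
        titles={"User:EpochFail"},
        properties={'ids', 'sha1'},
        direction="newer"
    )

    # reverts.detect() consumes (checksum, revision) event pairs
    rev_events = ((rev['sha1'], rev) for rev in revs)

    for revert in reverts.detect(rev_events):
        print("{0} reverted back to {1}".format(revert.reverting['revid'],
                                                revert.reverted_to['revid']))
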
About the author
================
:name:
Aaron Halfaker
:email:
aaron.halfaker@gmail.com
:website:
http://halfaker.info --
http://en.wikipedia.org/wiki/User:EpochFail
Contributors
============
None yet. See http://github.com/halfak/mediawiki-utilities. Pull requests are encouraged.
Indices and tables
==================
.. toctree::
:maxdepth: 2
types
core/api
core/database
core/xml_dump
lib/persistence
lib/reverts
lib/sessions
lib/title
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

View File

@@ -0,0 +1,35 @@
.. _mw.lib.persistence:
=========================================================
mw.lib.persistence -- tracking content between revisions
=========================================================
.. autoclass:: mw.lib.persistence.State
:members:
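
Here's a sketch of scoring token persistence through the API convenience
module, adapted from the examples shipped with this repository:

::

    import re

    from mw.api import Session
    from mw.lib import persistence

    session = Session("https://en.wikipedia.org/w/api.php")

    rev, tokens_added, future_revs = persistence.api.score(
        session, 560561013, properties={'user'})

    words_re = re.compile(r"\w+", re.UNICODE)

    for token in tokens_added:
        if words_re.search(token.text):
            print("'{0}' survived:".format(token.text))
            for frev in token.revisions:
                print("\t{revid} by {user}".format(**frev))
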
Tokenization
============
.. autoclass:: mw.lib.persistence.Tokens
:members:
.. autoclass:: mw.lib.persistence.Token
:members:
.. automodule:: mw.lib.persistence.tokenization
:members:
:member-order: bysource
Difference
==========
.. automodule:: mw.lib.persistence.difference
:members:
:member-order: bysource
Constants
=========
.. automodule:: mw.lib.persistence.defaults
:members:
:member-order: bysource

View File

@@ -0,0 +1,30 @@
.. _mw.lib.reverts:
=============================================
mw.lib.reverts -- detecting reverts
=============================================
.. automodule:: mw.lib.reverts
.. autofunction:: mw.lib.reverts.detect
.. autoclass:: mw.lib.reverts.Revert
.. autoclass:: mw.lib.reverts.Detector
:members:
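
The ``Detector`` can also be driven by hand over a page's history. This is
a sketch that assumes the ``process(checksum, revision)`` update method;
see the member documentation above for the authoritative signature:

::

    from mw.lib import reverts

    detector = reverts.Detector()

    # Feed (checksum, revision) pairs in chronological order
    detector.process("aaa", {'rev_id': 1})
    detector.process("bbb", {'rev_id': 2})
    revert = detector.process("aaa", {'rev_id': 3})  # identity revert to rev 1

    if revert is not None:
        print("{0} reverted back to {1}".format(revert.reverting['rev_id'],
                                                revert.reverted_to['rev_id']))
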
Convenience functions
=====================
.. automodule:: mw.lib.reverts.api
:members:
:member-order: bysource
.. automodule:: mw.lib.reverts.database
:members:
:member-order: bysource
Constants
=========
.. automodule:: mw.lib.reverts.defaults
:members:

View File

@@ -0,0 +1,18 @@
.. _mw.lib.sessions:
===================================
mw.lib.sessions -- event clustering
===================================
.. autofunction:: mw.lib.sessions.cluster
.. autoclass:: mw.lib.sessions.Session
.. autoclass:: mw.lib.sessions.Cache
:members:
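
For example, clustering a user's contributions into sessions (adapted from
the examples shipped with this repository):

::

    from mw.api import Session
    from mw.lib import sessions

    api_session = Session("https://en.wikipedia.org/w/api.php")
    revs = api_session.user_contribs.query(
        user={"TestAccountForMWUtils"},
        direction="newer"
    )

    # cluster() consumes (user, timestamp, event) triples
    rev_events = ((rev['user'], rev['timestamp'], rev) for rev in revs)

    for user, session in sessions.cluster(rev_events):
        print("{0}'s session with {1} revisions".format(user, len(session)))
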
Constants
=========
.. automodule:: mw.lib.sessions.defaults
:members:

View File

@@ -0,0 +1,15 @@
.. _mw.lib.title:
============================================================
mw.lib.title -- parsing and normalizing titles
============================================================
.. autofunction:: mw.lib.title.normalize
Title parser
================
.. autoclass:: mw.lib.title.Parser
:members:
:member-order: bysource
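
A sketch of normalization and parsing, adapted from the examples shipped
with this repository:

::

    from mw.api import Session
    from mw.lib import title

    title.normalize("foo bar")
    # > "Foo_bar"

    # Construct a title parser from the API's site info
    api_session = Session("https://en.wikipedia.org/w/api.php")
    parser = title.Parser.from_api(api_session)

    parser.parse("user:epochFail")  # handles normalization
    # > 2, "EpochFail"
    parser.parse("WT:foobar")       # handles namespace aliases
    # > 5, "Foobar"
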

View File

@@ -0,0 +1,11 @@
.. _mw.types:
========================
mw.types -- common types
========================
.. autoclass:: mw.Timestamp
:members:
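
A few representative conversions, adapted from the examples shipped with
this repository:

::

    from mw import Timestamp

    str(Timestamp(1234567890))              # seconds since Unix epoch
    # > '20090213233130'
    int(Timestamp("20090213233130"))        # database format
    # > 1234567890
    int(Timestamp("2009-02-13T23:31:30Z"))  # API format
    # > 1234567890
    Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
    # > 1
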
.. autoclass:: mw.Namespace
:members:

View File

@@ -0,0 +1,37 @@
"""
Prints the rev_id, characters and hash of all revisions to Willy_on_Wheels.
"""
import getpass
import hashlib
import os
import sys
sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw import api
api_session = api.Session("https://en.wikipedia.org/w/api.php")
print("(EN) Wikipedia credentials...")
username = input("Username: ")
password = getpass.getpass("Password: ")
api_session.login(username, password)
revisions = api_session.deleted_revisions.query(
properties={'ids', 'content'},
titles={'Willy on Wheels'},
direction="newer"
)
for rev in revisions:
print(
"{0} ({1} chars): {2}".format(
rev['revid'],
len(rev.get('*', "")),
hashlib.sha1(bytes(rev.get('*', ""), 'utf8')).hexdigest()
)
)

View File

@@ -0,0 +1,19 @@
"""
Prints the rev_id of all revisions to User:TestAccountForMWUtils.
"""
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw import api
api_session = api.Session("https://en.wikipedia.org/w/api.php")
revisions = api_session.revisions.query(
properties={'ids'},
titles={'User:TestAccountForMWUtils'}
)
for rev in revisions:
print(rev['revid'])

View File

@@ -0,0 +1,30 @@
"""
Prints the rc_id, type, timestamp and hash of the 10 oldest edits in recent_changes.
"""
import os
import sys
sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw import api
api_session = api.Session("https://en.wikipedia.org/w/api.php")
changes = api_session.recent_changes.query(
type={'edit', 'new'},
properties={'ids', 'sha1', 'timestamp'},
direction="newer",
limit=10
)
for change in changes:
print(
"{0} ({1}) @ {2}: {3}".format(
change['rcid'],
change['type'],
change['timestamp'],
change.get('sha1', "")
)
)

View File

@@ -0,0 +1,28 @@
"""
Prints the rev_id, characters and hash of all revisions to User:EpochFail.
"""
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))
import hashlib
from mw import api
api_session = api.Session("https://en.wikipedia.org/w/api.php")
revisions = api_session.revisions.query(
properties={'ids', 'content'},
titles={"User:EpochFail"},
direction="newer",
limit=51
)
for rev in revisions:
print(
"{0} ({1} chars): {2}".format(
rev['revid'],
len(rev.get('*', "")),
hashlib.sha1(bytes(rev.get('*', ""), 'utf8')).hexdigest()
)
)

View File

@@ -0,0 +1,20 @@
"""
Prints the rev_id, characters and hash of all revisions to User:EpochFail.
"""
import os
import sys
sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw import api
api_session = api.Session("https://en.wikipedia.org/w/api.php")
user_docs = api_session.users.query(
users=["EpochFail", "Halfak (WMF)"]
)
for user_doc in user_docs:
print(user_doc)

View File

@@ -0,0 +1,31 @@
"""
"""
import os
import sys
sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw import database
db = database.DB.from_params(
host="analytics-store.eqiad.wmnet",
read_default_file="~/.my.cnf",
user="research",
db="enwiki"
)
users = db.users.query(
registered_after="20140101000000",
direction="newer",
limit=10
)
for user in users:
print("{user_id}:{user_name} -- {user_editcount} edits".format(**user))

View File

@@ -0,0 +1,59 @@
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.8/ http://www.mediawiki.org/xml/export-0.8.xsd"
version="0.8" xml:lang="en">
<siteinfo>
<sitename>Wikipedia</sitename>
<base>http://en.wikipedia.org/wiki/Main_Page</base>
<generator>MediaWiki 1.22wmf2</generator>
<case>first-letter</case>
<namespaces>
<namespace key="0" case="first-letter" />
<namespace key="1" case="first-letter">Talk</namespace>
</namespaces>
</siteinfo>
<page>
<title>Foo</title>
<ns>0</ns>
<id>1</id>
<revision>
<id>1</id>
<timestamp>2004-08-09T09:04:08Z</timestamp>
<contributor>
<username>Gen0cide</username>
<id>92182</id>
</contributor>
<text xml:space="preserve">Revision 1 text</text>
<sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
<model>wikitext</model>
<format>text/x-wiki</format>
</revision>
<revision>
<id>2</id>
<timestamp>2004-08-10T09:04:08Z</timestamp>
<contributor>
<ip>222.152.210.109</ip>
</contributor>
<text xml:space="preserve">Revision 2 text</text>
<sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
<model>wikitext</model>
<comment>Comment 2</comment>
<format>text/x-wiki</format>
</revision>
</page>
<page>
<title>Bar</title>
<ns>1</ns>
<id>2</id>
<revision>
<id>3</id>
<timestamp>2004-08-11T09:04:08Z</timestamp>
<contributor>
<ip>222.152.210.22</ip>
</contributor>
<text xml:space="preserve">Revision 3 text</text>
<sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
<model>wikitext</model>
<format>text/x-wiki</format>
</revision>
</page>
</mediawiki>

View File

@@ -0,0 +1,31 @@
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.8/ http://www.mediawiki.org/xml/export-0.8.xsd"
version="0.8" xml:lang="en">
<siteinfo>
<sitename>Wikipedia</sitename>
<base>http://en.wikipedia.org/wiki/Main_Page</base>
<generator>MediaWiki 1.22wmf2</generator>
<case>first-letter</case>
<namespaces>
<namespace key="0" case="first-letter" />
<namespace key="1" case="first-letter">Talk</namespace>
</namespaces>
</siteinfo>
<page>
<title>Herp</title>
<ns>1</ns>
<id>2</id>
<revision>
<id>4</id>
<timestamp>2004-08-11T09:04:08Z</timestamp>
<contributor>
<id>10</id>
<name>FOobar!?</name>
</contributor>
<text xml:space="preserve">Revision 4 text</text>
<sha1>g9chqqg94myzq11c56ixvq7o1yg75n9</sha1>
<model>wikitext</model>
<format>text/x-wiki</format>
</revision>
</page>
</mediawiki>

View File

@@ -0,0 +1,19 @@
import pprint
import re
from mw.api import Session
from mw.lib import persistence
session = Session("https://en.wikipedia.org/w/api.php")
rev, tokens_added, future_revs = persistence.api.score(session, 560561013,
properties={'user'})
words_re = re.compile(r"\w+", re.UNICODE)
print("Words added")
for token in tokens_added:
if words_re.search(token.text):
print("'{0}' survived:".format(token.text))
for frev in token.revisions:
print("\t{revid} by {user}".format(**frev))

View File

@@ -0,0 +1,18 @@
"""
Prints the reverting rev_id, the reverted rev_id and the reverted-to rev_id of
all reverted revisions made by user "PermaNoob".
"""
from mw.api import Session
from mw.lib import reverts
session = Session("https://en.wikipedia.org/w/api.php")
revisions = session.user_contribs.query(user={"PermaNoob"}, direction="newer")
for rev in revisions:
revert = reverts.api.check_rev(session, rev, window=60*60*24*2)
if revert is not None:
print("{0} reverted {1} to {2}".format(
revert.reverting['revid'],
rev['revid'],
revert.reverted_to['revid'])
)

View File

@@ -0,0 +1,23 @@
"""
Prints the reverting rev_id, the reverted rev_id and the reverted-to rev_id of
all reverted revisions made by the user with ID 9133062.
"""
from mw.database import DB
from mw.lib import reverts
db = DB.from_params(
host="s1-analytics-slave.eqiad.wmnet",
read_default_file="~/.my.cnf",
user="research",
db="enwiki"
)
revisions = db.revisions.query(user_id=9133062)
for rev_row in revisions:
revert = reverts.database.check_row(db, rev_row)
if revert is not None:
print("{0} reverted {1} to {2}".format(
revert.reverting['rev_id'],
rev_row['rev_id'],
revert.reverted_to['rev_id'])
)

View File

@@ -0,0 +1,21 @@
"""
Prints all reverted revisions of User:EpochFail.
"""
from mw.api import Session
from mw.lib import reverts
# Gather a page's revisions from the API
api_session = Session("https://en.wikipedia.org/w/api.php")
revs = api_session.revisions.query(
titles={"User:EpochFail"},
properties={'ids', 'sha1'},
direction="newer"
)
# Create a revision event iterator
rev_events = ((rev['sha1'], rev) for rev in revs)
# Detect and print reverts
for revert in reverts.detect(rev_events):
print("{0} reverted back to {1}".format(revert.reverting['revid'],
revert.reverted_to['revid']))

View File

@@ -0,0 +1,17 @@
"""
Prints out session information for user "TestAccountForMWUtils".
"""
from mw.api import Session
from mw.lib import sessions
# Gather a user's revisions from the API
api_session = Session("https://en.wikipedia.org/w/api.php")
revs = api_session.user_contribs.query(
user={"TestAccountForMWUtils"},
direction="newer"
)
rev_events = ((rev['user'], rev['timestamp'], rev) for rev in revs)
# Extract and print sessions
for user, session in sessions.cluster(rev_events):
print("{0}'s session with {1} revisions".format(user, len(session)))

View File

@@ -0,0 +1,26 @@
"""
Demonstrates title normalization and parsing.
"""
import sys
import os
sys.path.insert(0, os.path.abspath(os.getcwd()))
from mw.api import Session
from mw.lib import title
# Normalize titles
title.normalize("foo bar")
# > "Foo_bar"
# Construct a title parser from the API
api_session = Session("https://en.wikipedia.org/w/api.php")
parser = title.Parser.from_api(api_session)
# Handles normalization
parser.parse("user:epochFail")
# > 2, "EpochFail"
# Handles namespace aliases
parser.parse("WT:foobar")
# > 5, "Foobar"

View File

@@ -0,0 +1,27 @@
"""
Demonstrates some simple Timestamp operations
"""
from mw import Timestamp
# Seconds since Unix Epoch
str(Timestamp(1234567890))
# > '20090213233130'
# Database format
int(Timestamp("20090213233130"))
# > 1234567890
# API format
int(Timestamp("2009-02-13T23:31:30Z"))
# > 1234567890
# Difference in seconds
Timestamp("2009-02-13T23:31:31Z") - Timestamp(1234567890)
# > 1
# strptime and strftime
Timestamp(1234567890).strftime("%Y foobar")
# > '2009 foobar'
str(Timestamp.strptime("2009 derp 10", "%Y derp %m"))
# > '20091001000000'

View File

@@ -0,0 +1,14 @@
"""
Prints out all rev_ids that appear in dump.xml.
"""
from mw.xml_dump import Iterator
# Construct dump file iterator
dump = Iterator.from_file(open("examples/dump.xml"))
# Iterate through pages
for page in dump:
# Iterate through a page's revisions
for revision in page:
print(revision.id)

View File

@@ -0,0 +1,15 @@
"""
Processes two dump files.
"""
from mw import xml_dump
files = ["examples/dump.xml", "examples/dump2.xml"]
def page_info(dump, path):
for page in dump:
yield page.id, page.namespace, page.title
for page_id, page_namespace, page_title in xml_dump.map(files, page_info):
print("\t".join([str(page_id), str(page_namespace), page_title]))

View File

@@ -0,0 +1,3 @@
from .types import Timestamp, Namespace
__version__ = "0.4.18"

View File

@@ -0,0 +1,5 @@
from . import errors
from .session import Session
from .collections import Pages, RecentChanges, Revisions, SiteInfo, \
UserContribs, DeletedRevisions

View File

@@ -0,0 +1,7 @@
from .deleted_revisions import DeletedRevisions
from .pages import Pages
from .recent_changes import RecentChanges
from .revisions import Revisions
from .site_info import SiteInfo
from .user_contribs import UserContribs
from .users import Users

View File

@@ -0,0 +1,68 @@
import re
class Collection:
"""
Represents a collection of items that can be queried via the API. This is
    an abstract base class that should be extended.
"""
TIMESTAMP = re.compile(r"[0-9]{4}-?[0-9]{2}-?[0-9]{2}T?" +
r"[0-9]{2}:?[0-9]{2}:?[0-9]{2}Z?")
"""
A regular expression for matching the API's timestamp format.
"""
DIRECTIONS = {'newer', 'older'}
"""
A set of potential direction names.
"""
def __init__(self, session):
"""
:Parameters:
session : `mw.api.Session`
An api session to use for post & get.
"""
self.session = session
def _check_direction(self, direction):
if direction is None:
return direction
else:
direction = str(direction)
assert direction in {None} | self.DIRECTIONS, \
"Direction must be one of {0}".format(self.DIRECTIONS)
return direction
def _check_timestamp(self, timestamp):
if timestamp is None:
return timestamp
else:
timestamp = str(timestamp)
if not self.TIMESTAMP.match(timestamp):
raise TypeError(
"{0} is not formatted like ".format(repr(timestamp)) +
"a MediaWiki timestamp."
)
return timestamp
def _items(self, items, none=True, levels=None, type=lambda val: val):
if none and items is None:
return None
else:
items = {str(type(item)) for item in items}
if levels is not None:
levels = {str(level) for level in levels}
assert len(items - levels) == 0, \
"items {0} not in levels {1}".format(
items - levels, levels)
return "|".join(items)

View File

@@ -0,0 +1,150 @@
import logging
import sys
from ...types import Timestamp
from ...util import none_or
from ..errors import MalformedResponse
from .collection import Collection
logger = logging.getLogger("mw.api.collections.deletedrevs")
class DeletedRevisions(Collection):
PROPERTIES = {'ids', 'flags', 'timestamp', 'user', 'userid', 'size',
'sha1', 'contentmodel', 'comment', 'parsedcomment', 'content',
'tags'}
# TODO:
# This is *not* the right way to do this, but it should work for all queries.
MAX_REVISIONS = 500
def get(self, rev_id, *args, **kwargs):
rev_id = int(rev_id)
revs = list(self.query(revids={rev_id}, **kwargs))
if len(revs) < 1:
raise KeyError(rev_id)
else:
return revs[0]
def query(self, *args, limit=sys.maxsize, **kwargs):
"""
Queries deleted revisions.
See https://www.mediawiki.org/wiki/API:Deletedrevs
:Parameters:
titles : set(str)
A set of page names to query (note that namespace prefix is expected)
start : :class:`mw.Timestamp`
A timestamp to start querying from
end : :class:`mw.Timestamp`
A timestamp to end querying
from_title : str
A title from which to start querying (alphabetically)
to_title : str
A title from which to stop querying (alphabetically)
prefix : str
A title prefix to match on
drcontinue : str
                When more results are available, use this to continue. Note:
                this may only work if drdir is set to "newer".
unique : bool
List only one revision for each page
tag : str
Only list revision tagged with this tag
user : str
Only list revisions saved by this user_text
excludeuser : str
Do not list revision saved by this user_text
namespace : int
Only list pages in this namespace (id)
limit : int
Limit the number of results
direction : str
"newer" or "older"
properties : set(str)
A list of properties to include in the results:
* ids - The ID of the revision.
* flags - Revision flags (minor).
* timestamp - The timestamp of the revision.
* user - User that made the revision.
* userid - User ID of the revision creator.
* size - Length (bytes) of the revision.
* sha1 - SHA-1 (base 16) of the revision.
* contentmodel - Content model ID of the revision.
* comment - Comment by the user for the revision.
* parsedcomment - Parsed comment by the user for the revision.
* content - Text of the revision.
* tags - Tags for the revision.
"""
        # `limit` means something different here
kwargs['limit'] = min(limit, self.MAX_REVISIONS)
revisions_yielded = 0
done = False
        while not done and revisions_yielded < limit:
rev_docs, query_continue = self._query(*args, **kwargs)
for doc in rev_docs:
yield doc
revisions_yielded += 1
if revisions_yielded >= limit:
break
if query_continue != "" and len(rev_docs) > 0:
kwargs['query_continue'] = query_continue
else:
done = True
def _query(self, titles=None, pageids=None, revids=None,
start=None, end=None, query_continue=None, unique=None, tag=None,
user=None, excludeuser=None, namespace=None, limit=None,
properties=None, direction=None):
params = {
'action': "query",
'prop': "deletedrevisions"
}
params['titles'] = self._items(titles)
params['pageids'] = self._items(pageids)
params['revids'] = self._items(revids)
params['drvprop'] = self._items(properties, levels=self.PROPERTIES)
params['drvlimit'] = none_or(limit, int)
params['drvstart'] = self._check_timestamp(start)
params['drvend'] = self._check_timestamp(end)
params['drvdir'] = self._check_direction(direction)
params['drvuser'] = none_or(user, str)
        params['drvexcludeuser'] = none_or(excludeuser, str)
params['drvtag'] = none_or(tag, str)
params.update(query_continue or {'continue': ""})
        doc = self.session.get(params)
try:
if 'continue' in doc:
query_continue = doc['continue']
else:
query_continue = ''
pages = doc['query']['pages'].values()
rev_docs = []
for page_doc in pages:
                page_rev_docs = page_doc.get('deletedrevisions', [])
                page_doc.pop('deletedrevisions', None)
for rev_doc in page_rev_docs:
rev_doc['page'] = page_doc
rev_docs.extend(page_rev_docs)
return rev_docs, query_continue
except KeyError as e:
raise MalformedResponse(str(e), doc)

View File

@@ -0,0 +1,50 @@
import logging
from ...util import none_or
from .collection import Collection
logger = logging.getLogger("mw.api.collections.pages")
class Pages(Collection):
"""
TODO
"""
def _edit(self, title=None, pageid=None, section=None, sectiontitle=None,
text=None, token=None, summary=None, minor=None,
notminor=None, bot=None, basetimestamp=None,
starttimestamp=None, recreate=None, createonly=None,
nocreate=None, watch=None, unwatch=None, watchlist=None,
md5=None, prependtext=None, appendtext=None, undo=None,
undoafter=None, redirect=None, contentformat=None,
contentmodel=None, assert_=None, nassert=None,
captchaword=None, captchaid=None):
params = {
'action': "edit"
}
params['title'] = none_or(title, str)
params['pageid'] = none_or(pageid, int)
params['section'] = none_or(section, int, levels={'new'})
params['sectiontitle'] = none_or(sectiontitle, str)
params['text'] = none_or(text, str)
params['token'] = none_or(token, str)
params['summary'] = none_or(summary, str)
params['minor'] = none_or(minor, bool)
params['notminor'] = none_or(notminor, bool)
params['bot'] = none_or(bot, bool)
params['basetimestamp'] = self._check_timestamp(basetimestamp)
params['starttimestamp'] = self._check_timestamp(starttimestamp)
params['recreate'] = none_or(recreate, bool)
params['createonly'] = none_or(createonly, bool)
params['nocreate'] = none_or(nocreate, bool)
params['watch'] = none_or(watch, bool)
params['unwatch'] = none_or(unwatch, bool)
params['watchlist'] = none_or(watchlist, bool)
params['md5'] = none_or(md5, str)
params['prependtext'] = none_or(prependtext, str)
params['appendtext'] = none_or(appendtext, str)
params['undo'] = none_or(undo, int)
params['undoafter'] = none_or(undoafter, int)
# TODO finish this

View File

@@ -0,0 +1,192 @@
import logging
import re
from ...util import none_or
from ..errors import MalformedResponse
from .collection import Collection
logger = logging.getLogger("mw.api.collections.recent_changes")
class RecentChanges(Collection):
"""
Recent changes (revisions, page creations, registrations, moves, etc.)
"""
RCCONTINUE = re.compile(r"([0-9]{4}-[0-9]{2}-[0-9]{2}T" +
r"[0-9]{2}:[0-9]{2}:[0-9]{2}Z|" +
r"[0-9]{14})" +
r"\|[0-9]+")
PROPERTIES = {'user', 'userid', 'comment', 'timestamp', 'title',
'ids', 'sizes', 'redirect', 'flags', 'loginfo',
'tags', 'sha1'}
SHOW = {'minor', '!minor', 'bot', '!bot', 'anon', '!anon',
'redirect', '!redirect', 'patrolled', '!patrolled'}
TYPES = {'edit', 'external', 'new', 'log'}
DIRECTIONS = {'newer', 'older'}
MAX_CHANGES = 50
def _check_rccontinue(self, rccontinue):
if rccontinue is None:
return None
elif self.RCCONTINUE.match(rccontinue):
return rccontinue
else:
raise TypeError(
"rccontinue {0} is not formatted correctly ".format(rccontinue) +
"'%Y-%m-%dT%H:%M:%SZ|<last_rcid>'"
)
def query(self, *args, limit=None, **kwargs):
"""
Enumerate recent changes.
See `<https://www.mediawiki.org/wiki/API:Recentchanges>`_
:Parameters:
start : :class:`mw.Timestamp`
The timestamp to start enumerating from
end : :class:`mw.Timestamp`
The timestamp to end enumerating
direction :
"newer" or "older"
namespace : int
Filter log entries to only this namespace(s)
user : str
Only list changes by this user
excludeuser : str
Don't list changes by this user
tag : str
Only list changes tagged with this tag
properties : set(str)
Include additional pieces of information
* user - Adds the user responsible for the edit and tags if they are an IP
* userid - Adds the user id responsible for the edit
* comment - Adds the comment for the edit
* parsedcomment - Adds the parsed comment for the edit
* flags - Adds flags for the edit
* timestamp - Adds timestamp of the edit
* title - Adds the page title of the edit
* ids - Adds the page ID, recent changes ID and the new and old revision ID
* sizes - Adds the new and old page length in bytes
* redirect - Tags edit if page is a redirect
* patrolled - Tags patrollable edits as being patrolled or unpatrolled
* loginfo - Adds log information (logid, logtype, etc) to log entries
* tags - Lists tags for the entry
* sha1 - Adds the content checksum for entries associated with a revision
token : set(str)
Which tokens to obtain for each change
* patrol
show : set(str)
Show only items that meet this criteria. For example, to see
only minor edits done by logged-in users, set
show={'minor', '!anon'}.
* minor
* !minor
* bot
* !bot
* anon
* !anon
* redirect
* !redirect
* patrolled
* !patrolled
* unpatrolled
limit : int
How many total changes to return
type : set(str)
Which types of changes to show
* edit
* external
* new
* log
toponly : bool
Only list changes which are the latest revision
rccontinue : str
Use this to continue loading results from where you last left off
"""
limit = none_or(limit, int)
changes_yielded = 0
done = False
while not done:
if limit is None:
kwargs['limit'] = self.MAX_CHANGES
else:
kwargs['limit'] = min(limit - changes_yielded, self.MAX_CHANGES)
rc_docs, rccontinue = self._query(*args, **kwargs)
for doc in rc_docs:
yield doc
changes_yielded += 1
if limit is not None and changes_yielded >= limit:
done = True
break
if rccontinue is not None and len(rc_docs) > 0:
kwargs['rccontinue'] = rccontinue
else:
done = True
def _query(self, start=None, end=None, direction=None, namespace=None,
user=None, excludeuser=None, tag=None, properties=None,
token=None, show=None, limit=None, type=None,
toponly=None, rccontinue=None):
params = {
'action': "query",
'list': "recentchanges"
}
params['rcstart'] = none_or(start, str)
params['rcend'] = none_or(end, str)
assert direction in {None} | self.DIRECTIONS, \
"Direction must be one of {0}".format(self.DIRECTIONS)
params['rcdir'] = direction
params['rcnamespace'] = none_or(namespace, int)
params['rcuser'] = none_or(user, str)
params['rcexcludeuser'] = none_or(excludeuser, str)
params['rctag'] = none_or(tag, str)
params['rcprop'] = self._items(properties, levels=self.PROPERTIES)
        params['rctoken'] = self._items(token)
params['rcshow'] = self._items(show, levels=self.SHOW)
params['rclimit'] = none_or(limit, int)
        params['rctype'] = self._items(type, levels=self.TYPES)
params['rctoponly'] = none_or(toponly, bool)
params['rccontinue'] = self._check_rccontinue(rccontinue)
doc = self.session.get(params)
try:
rc_docs = doc['query']['recentchanges']
if 'query-continue' in doc:
rccontinue = \
doc['query-continue']['recentchanges']['rccontinue']
elif len(rc_docs) > 0:
rccontinue = "|".join([rc_docs[-1]['timestamp'],
str(rc_docs[-1]['rcid'] + 1)])
else:
pass # Leave it be
except KeyError as e:
raise MalformedResponse(str(e), doc)
return rc_docs, rccontinue

View File

@@ -0,0 +1,220 @@
import logging
from ...util import none_or
from ..errors import MalformedResponse
from .collection import Collection
logger = logging.getLogger("mw.api.collections.revisions")
class Revisions(Collection):
"""
    A collection of revisions indexed by title, page_id and user_text.
    Note that revisions of deleted pages are queryable via
    :class:`mw.api.DeletedRevisions`.
"""
PROPERTIES = {'ids', 'flags', 'timestamp', 'user', 'userid', 'size',
'sha1', 'contentmodel', 'comment', 'parsedcomment',
'content', 'tags', 'flagged'}
DIFF_TO = {'prev', 'next', 'cur'}
# This is *not* the right way to do this, but it should work for all queries.
MAX_REVISIONS = 50
def get(self, rev_id, **kwargs):
"""
        Get a single revision based on its ID. Throws a :py:class:`KeyError`
if the rev_id cannot be found.
:Parameters:
rev_id : int
Revision ID
``**kwargs``
Passed to :py:meth:`query`
:Returns:
A single rev dict
"""
rev_id = int(rev_id)
revs = list(self.query(revids={rev_id}, **kwargs))
if len(revs) < 1:
raise KeyError(rev_id)
else:
return revs[0]
def query(self, *args, limit=None, **kwargs):
"""
Get revision information.
See `<https://www.mediawiki.org/wiki/API:Properties#revisions_.2F_rv>`_
:Parameters:
properties : set(str)
Which properties to get for each revision:
* ids - The ID of the revision
* flags - Revision flags (minor)
* timestamp - The timestamp of the revision
* user - User that made the revision
* userid - User id of revision creator
* size - Length (bytes) of the revision
* sha1 - SHA-1 (base 16) of the revision
* contentmodel - Content model id
* comment - Comment by the user for revision
* parsedcomment - Parsed comment by the user for the revision
* content - Text of the revision
* tags - Tags for the revision
limit : int
Limit how many revisions will be returned
No more than 500 (5000 for bots) allowed
start_id : int
From which revision id to start enumeration (enum)
end_id : int
Stop revision enumeration on this revid
start : :class:`mw.Timestamp`
From which revision timestamp to start enumeration (enum)
end : :class:`mw.Timestamp`
Enumerate up to this timestamp
direction : str
"newer" or "older"
user : str
Only include revisions made by user_text
            excludeuser : str
                Exclude revisions made by this user
tag : str
Only list revisions tagged with this tag
            expandtemplates : bool
                Expand templates in revision content (requires "content" property)
            generatexml : bool
                Generate XML parse tree for revision content (requires "content" property)
            parse : bool
                Parse revision content (requires "content" property)
section : int
Only retrieve the content of this section number
token : set(str)
Which tokens to obtain for each revision
* rollback - See `<https://www.mediawiki.org/wiki/API:Edit_-_Rollback#Token>`_
rvcontinue : str
When more results are available, use this to continue
diffto : int
Revision ID to diff each revision to. Use "prev", "next" and
"cur" for the previous, next and current revision respectively
difftotext : str
Text to diff each revision to. Only diffs a limited number of
revisions. Overrides diffto. If section is set, only that
section will be diffed against this text
contentformat : str
Serialization format used for difftotext and expected for output of content
* text/x-wiki
* text/javascript
* text/css
* text/plain
* application/json
:Returns:
An iterator of rev dicts returned from the API.
"""
revisions_yielded = 0
done = False
while not done:
            if limit is None:
kwargs['limit'] = self.MAX_REVISIONS
else:
kwargs['limit'] = min(limit - revisions_yielded, self.MAX_REVISIONS)
rev_docs, rvcontinue = self._query(*args, **kwargs)
for doc in rev_docs:
yield doc
revisions_yielded += 1
                if limit is not None and revisions_yielded >= limit:
done = True
break
            if rvcontinue is not None and len(rev_docs) > 0:
kwargs['rvcontinue'] = rvcontinue
else:
done = True
def _query(self, revids=None, titles=None, pageids=None, properties=None,
limit=None, start_id=None, end_id=None, start=None,
end=None, direction=None, user=None, excludeuser=None,
tag=None, expandtemplates=None, generatexml=None,
parse=None, section=None, token=None, rvcontinue=None,
diffto=None, difftotext=None, contentformat=None):
params = {
'action': "query",
'prop': "revisions",
'rawcontinue': ''
}
params['revids'] = self._items(revids, type=int)
params['titles'] = self._items(titles)
params['pageids'] = self._items(pageids, type=int)
params['rvprop'] = self._items(properties, levels=self.PROPERTIES)
        if revids is None:  # rvlimit may not be used together with revids
params['rvlimit'] = none_or(limit, int)
params['rvstartid'] = none_or(start_id, int)
params['rvendid'] = none_or(end_id, int)
params['rvstart'] = self._check_timestamp(start)
params['rvend'] = self._check_timestamp(end)
params['rvdir'] = self._check_direction(direction)
params['rvuser'] = none_or(user, str)
        params['rvexcludeuser'] = none_or(excludeuser, str)
params['rvtag'] = none_or(tag, str)
params['rvexpandtemplates'] = none_or(expandtemplates, bool)
params['rvgeneratexml'] = none_or(generatexml, bool)
params['rvparse'] = none_or(parse, bool)
params['rvsection'] = none_or(section, int)
params['rvtoken'] = none_or(token, str)
params['rvcontinue'] = none_or(rvcontinue, str)
params['rvdiffto'] = self._check_diffto(diffto)
params['rvdifftotext'] = none_or(difftotext, str)
params['rvcontentformat'] = none_or(contentformat, str)
doc = self.session.get(params)
try:
if 'query-continue' in doc:
rvcontinue = doc['query-continue']['revisions']['rvcontinue']
else:
rvcontinue = None
pages = doc['query'].get('pages', {}).values()
rev_docs = []
for page_doc in pages:
if 'missing' in page_doc or 'revisions' not in page_doc: continue
page_rev_docs = page_doc['revisions']
del page_doc['revisions']
for rev_doc in page_rev_docs:
rev_doc['page'] = page_doc
rev_docs.extend(page_rev_docs)
return rev_docs, rvcontinue
except KeyError as e:
raise MalformedResponse(str(e), doc)
def _check_diffto(self, diffto):
        if diffto is None or diffto in self.DIFF_TO:
return diffto
else:
return int(diffto)

View File

@@ -0,0 +1,81 @@
import logging
from ..errors import MalformedResponse
from .collection import Collection
logger = logging.getLogger("mw.api.collections.site_info")
class SiteInfo(Collection):
"""
General information about the site.
"""
PROPERTIES = {'general', 'namespaces', 'namespacealiases',
'specialpagealiases', 'magicwords', 'interwikimap',
'dbrepllag', 'statistics', 'usergroups', 'extensions',
'fileextensions', 'rightsinfo', 'languages', 'skins',
'extensiontags', 'functionhooks', 'showhooks',
'variables', 'protocols'}
FILTERIW = {'local', '!local'}
def query(self, properties=None, filteriw=None, showalldb=None,
numberinggroup=None, inlanguagecode=None):
"""
General information about the site.
See `<https://www.mediawiki.org/wiki/API:Meta#siteinfo_.2F_si>`_
:Parameters:
properties: set(str)
Which sysinfo properties to get:
* general - Overall system information
* namespaces - List of registered namespaces and their canonical names
* namespacealiases - List of registered namespace aliases
* specialpagealiases - List of special page aliases
* magicwords - List of magic words and their aliases
* statistics - Returns site statistics
                * interwikimap - Returns interwiki map (optionally filtered, optionally localised by using siinlanguagecode)
* dbrepllag - Returns database server with the highest replication lag
* usergroups - Returns user groups and the associated permissions
* extensions - Returns extensions installed on the wiki
* fileextensions - Returns list of file extensions allowed to be uploaded
* rightsinfo - Returns wiki rights (license) information if available
* restrictions - Returns information on available restriction (protection) types
                * languages - Returns a list of languages MediaWiki supports (optionally localised by using siinlanguagecode)
* skins - Returns a list of all enabled skins
* extensiontags - Returns a list of parser extension tags
* functionhooks - Returns a list of parser function hooks
* showhooks - Returns a list of all subscribed hooks (contents of $wgHooks)
* variables - Returns a list of variable IDs
* protocols - Returns a list of protocols that are allowed in external links.
* defaultoptions - Returns the default values for user preferences.
filteriw : str
"local" or "!local" Return only local or only nonlocal entries of the interwiki map
showalldb : bool
List all database servers, not just the one lagging the most
            numberinggroup : bool
                Lists the number of users in user groups
            inlanguagecode : str
                Language code for localised language names (best effort; use CLDR extension)
"""
siprop = self._items(properties, levels=self.PROPERTIES)
doc = self.session.get(
{
'action': "query",
'meta': "siteinfo",
'siprop': siprop,
'sifilteriw': filteriw,
'sishowalldb': showalldb,
'sinumberinggroup': numberinggroup,
'siinlanguagecode': inlanguagecode
}
)
try:
return doc['query']
except KeyError as e:
raise MalformedResponse(str(e), doc)

View File

@@ -0,0 +1,132 @@
import logging
from ...util import none_or
from ..errors import MalformedResponse
from .collection import Collection
logger = logging.getLogger("mw.api.collections.user_contribs")
class UserContribs(Collection):
"""
    A collection of revisions indexed by user.
"""
PROPERTIES = {'ids', 'title', 'timestamp', 'comment', 'parsedcomment',
'size', 'sizediff', 'flags', 'patrolled', 'tags'}
SHOW = {'minor', '!minor', 'patrolled', '!patrolled'}
MAX_REVISIONS = 50
def query(self, *args, limit=None, **kwargs):
"""
Get a user's revisions.
See `<https://www.mediawiki.org/wiki/API:Usercontribs>`_
:Parameters:
limit : int
The maximum number of contributions to return.
start : :class:`mw.Timestamp`
The start timestamp to return from
end : :class:`mw.Timestamp`
The end timestamp to return to
user : set(str)
The users to retrieve contributions for. Maximum number of values 50 (500 for bots)
userprefix : set(str)
Retrieve contributions for all users whose names begin with this value.
direction : str
"newer" or "older"
namespace : int
Only list contributions in these namespaces
properties :
Include additional pieces of information
* ids - Adds the page ID and revision ID
* title - Adds the title and namespace ID of the page
* timestamp - Adds the timestamp of the edit
* comment - Adds the comment of the edit
* parsedcomment - Adds the parsed comment of the edit
* size - Adds the new size of the edit
* sizediff - Adds the size delta of the edit against its parent
* flags - Adds flags of the edit
* patrolled - Tags patrolled edits
* tags - Lists tags for the edit
show : set(str)
                Show only items that meet these criteria, e.g. non-minor edits
                only: ucshow=!minor.
                NOTE: If ucshow=patrolled or ucshow=!patrolled is set, revisions
                older than $wgRCMaxAge (2592000) won't be shown
                * minor
                * !minor
                * patrolled
                * !patrolled
                * top
                * !top
                * new
                * !new
tag : str
Only list revisions tagged with this tag
toponly : bool
DEPRECATED! Only list changes which are the latest revision
"""
limit = none_or(limit, int)
revisions_yielded = 0
done = False
while not done:
if limit is None:
kwargs['limit'] = self.MAX_REVISIONS
else:
kwargs['limit'] = min(limit - revisions_yielded, self.MAX_REVISIONS)
uc_docs, uccontinue = self._query(*args, **kwargs)
for doc in uc_docs:
yield doc
revisions_yielded += 1
if limit is not None and revisions_yielded >= limit:
done = True
break
if uccontinue is None or len(uc_docs) == 0:
done = True
else:
kwargs['uccontinue'] = uccontinue
def _query(self, user=None, userprefix=None, limit=None, start=None,
end=None, direction=None, namespace=None, properties=None,
show=None, tag=None, toponly=None,
uccontinue=None):
params = {
'action': "query",
'list': "usercontribs"
}
params['uclimit'] = none_or(limit, int)
params['ucstart'] = self._check_timestamp(start)
params['ucend'] = self._check_timestamp(end)
if uccontinue is not None:
params.update(uccontinue)
params['ucuser'] = self._items(user, type=str)
params['ucuserprefix'] = self._items(userprefix, type=str)
params['ucdir'] = self._check_direction(direction)
params['ucnamespace'] = none_or(namespace, int)
params['ucprop'] = self._items(properties, levels=self.PROPERTIES)
params['ucshow'] = self._items(show, levels=self.SHOW)
doc = self.session.get(params)
try:
if 'query-continue' in doc:
uccontinue = doc['query-continue']['usercontribs']
else:
uccontinue = None
uc_docs = doc['query']['usercontribs']
return uc_docs, uccontinue
except KeyError as e:
raise MalformedResponse(str(e), doc)

View File

@@ -0,0 +1,83 @@
import logging
from ...util import none_or
from ..errors import MalformedResponse
from .collection import Collection
logger = logging.getLogger("mw.api.collections.users")
class Users(Collection):
"""
A collection of information about users
"""
PROPERTIES = {'blockinfo', 'implicitgroups', 'groups', 'registration',
'emailable', 'editcount', 'gender'}
SHOW = {'minor', '!minor', 'patrolled', '!patrolled'}
MAX_REVISIONS = 50
def query(self, *args, **kwargs):
"""
Get a user's metadata.
See `<https://www.mediawiki.org/wiki/API:Users>`_
:Parameters:
            users : set(str)
                The usernames of the users to be retrieved.
properties : set(str)
Include additional pieces of information
                * blockinfo - Tags if the user is blocked, by whom, and for what reason
                * groups - Lists all the groups the user(s) belongs to
                * implicitgroups - Lists all the groups a user is automatically a member of
                * rights - Lists all the rights the user(s) has
                * editcount - Adds the user's edit count
                * registration - Adds the user's registration timestamp
                * emailable - Tags if the user can and wants to receive email through [[Special:Emailuser]]
                * gender - Tags the gender of the user. Returns "male", "female", or "unknown"
"""
done = False
while not done:
us_docs, query_continue = self._query(*args, **kwargs)
for doc in us_docs:
yield doc
if query_continue is None or len(us_docs) == 0:
done = True
else:
kwargs['query_continue'] = query_continue
def _query(self, users, query_continue=None, properties=None):
params = {
'action': "query",
'list': "users"
}
params['ususers'] = self._items(users, type=str)
params['usprop'] = self._items(properties, levels=self.PROPERTIES)
if query_continue is not None:
params.update(query_continue)
doc = self.session.get(params)
try:
if 'query-continue' in doc:
query_continue = doc['query-continue']['users']
else:
query_continue = None
us_docs = doc['query']['users']
return us_docs, query_continue
except KeyError as e:
raise MalformedResponse(str(e), doc)

View File

@@ -0,0 +1,48 @@
class DocError(Exception):
def __init__(self, message, doc):
super().__init__(message)
self.doc = doc
"""
The document returned by the API that brought about this error.
"""
class APIError(DocError):
def __init__(self, doc):
        code = doc.get('error', {}).get('code')
        message = doc.get('error', {}).get('info')  # the API reports its message under 'info'
        super().__init__("{0}:{1}".format(code, message), doc)
self.code = code
"""
The error code returned by the api -- if available.
"""
self.message = message
"""
The error message returned by the api -- if available.
"""
class AuthenticationError(DocError):
def __init__(self, doc):
result = doc['login']['result']
super().__init__(result, doc)
self.result = result
"""
The result code of an authentication attempt.
"""
class MalformedResponse(DocError):
def __init__(self, key, doc):
super().__init__("Expected to find '{0}' in result.".format(key), doc)
self.key = key
"""
The expected, but missing key from the API call.
"""

View File

@@ -0,0 +1,134 @@
import logging
from ..util import api
from .collections import (DeletedRevisions, Pages, RecentChanges, Revisions,
SiteInfo, UserContribs, Users)
from .errors import APIError, AuthenticationError, MalformedResponse
logger = logging.getLogger("mw.api.session")
DEFAULT_USER_AGENT = "MediaWiki-Utilities"
"""
The default User-Agent to be sent with requests to the API.
"""
class Session(api.Session):
"""
Represents a connection to a MediaWiki API.
Cookies and other session information is preserved.
:Parameters:
uri : str
The base URI for the API to use. Usually ends in "api.php"
user_agent : str
The User-Agent to be sent with requests. Will raise a warning if
left to default value.
"""
def __init__(self, uri, *args, user_agent=DEFAULT_USER_AGENT, **kwargs):
"""
Constructs a new :class:`Session`.
"""
if user_agent == DEFAULT_USER_AGENT:
logger.warning("Sending requests with default User-Agent. " +
"Set 'user_agent' on api.Session to quiet this " +
"message.")
if 'headers' in kwargs:
kwargs['headers']['User-Agent'] = str(user_agent)
else:
kwargs['headers'] = {'User-Agent': str(user_agent)}
super().__init__(uri, *args, **kwargs)
self.pages = Pages(self)
"""
An instance of :class:`mw.api.Pages`.
"""
self.revisions = Revisions(self)
"""
An instance of :class:`mw.api.Revisions`.
"""
self.recent_changes = RecentChanges(self)
"""
An instance of :class:`mw.api.RecentChanges`.
"""
self.site_info = SiteInfo(self)
"""
An instance of :class:`mw.api.SiteInfo`.
"""
self.user_contribs = UserContribs(self)
"""
An instance of :class:`mw.api.UserContribs`.
"""
self.users = Users(self)
"""
An instance of :class:`mw.api.Users`.
"""
self.deleted_revisions = DeletedRevisions(self)
"""
An instance of :class:`mw.api.DeletedRevisions`.
"""
def login(self, username, password, token=None):
"""
Performs a login operation. This method usually makes two requests to
API -- one to get a token and one to use the token to log in. If
authentication fails, this method will throw an
:class:`.errors.AuthenticationError`.
:Parameters:
username : str
Your username
password : str
Your password
:Returns:
The response in a json :py:class:`dict`
"""
doc = self.post(
{
'action': "login",
'lgname': username,
'lgpassword': password,
'lgtoken': token, # If None, we'll be getting a token
}
)
try:
if doc['login']['result'] == "Success":
return doc
elif doc['login']['result'] == "NeedToken":
if token is not None:
                    # Whoops. We've been here before. Better error out.
raise AuthenticationError(doc)
else:
token = doc['login']['token']
return self.login(username, password, token=token)
else:
raise AuthenticationError(doc)
except KeyError as e:
            raise MalformedResponse(str(e), doc)
def request(self, type, params, **kwargs):
params.update({'format': "json"})
doc = super().request(type, params, **kwargs).json()
if 'error' in doc:
raise APIError(doc)
return doc

View File

@@ -0,0 +1,4 @@
# from . import errors
from .db import DB
from .collections import Pages, RecentChanges, Revisions, Archives, \
AllRevisions, Users

View File

@@ -0,0 +1,4 @@
from .pages import Pages
from .recent_changes import RecentChanges
from .revisions import Revisions, Archives, AllRevisions
from .users import Users

View File

@@ -0,0 +1,11 @@
class Collection:
DIRECTIONS = {'newer', 'older'}
def __init__(self, db):
self.db = db
def __str__(self):
return self.__repr__()
def __repr__(self):
return "{0}({1})".format(self.__class__.__name__, repr(self.db))

View File

@@ -0,0 +1,65 @@
import logging
from ...util import none_or
from .collection import Collection
logger = logging.getLogger("mw.database.collections.pages")
class Pages(Collection):
def get(self, page_id=None, namespace_title=None, rev_id=None):
"""
Gets a single page based on a legitimate identifier of the page. Note
that namespace_title expects a tuple of namespace ID and title.
:Parameters:
page_id : int
Page ID
namespace_title : ( int, str )
the page's namespace ID and title
rev_id : int
a revision ID included in the page's history
:Returns:
            A page row
"""
page_id = none_or(page_id, int)
namespace_title = none_or(namespace_title, tuple)
rev_id = none_or(rev_id, int)
query = """
SELECT page.*
FROM page
"""
values = []
        if page_id is not None:
            query += """
                WHERE page_id = %s
            """
            values.append(page_id)
        elif namespace_title is not None:
            namespace, title = namespace_title
            query += " WHERE page_namespace = %s and page_title = %s "
            values.extend([int(namespace), str(title)])
        elif rev_id is not None:
            query += """
                WHERE page_id = (SELECT rev_page FROM revision WHERE rev_id = %s)
            """
            values.append(rev_id)
        else:
            raise TypeError("Must specify a page identifier.")
cursor = self.db.shared_connection.cursor()
cursor.execute(
query,
values
)
for row in cursor:
return row

View File

@@ -0,0 +1,128 @@
import logging
import time
from ...types import Timestamp
from ...util import none_or
from .collection import Collection
logger = logging.getLogger("mw.database.collections.pages")
class RecentChanges(Collection):
# (https://www.mediawiki.org/wiki/Manual:Recentchanges_table)
TYPES = {
'edit': 0, # edit of existing page
'new': 1, # new page
'move': 2, # Marked as obsolete
'log': 3, # log action (introduced in MediaWiki 1.2)
'move_over_redirect': 4, # Marked as obsolete
'external': 5 # An external recent change. Primarily used by Wikidata
}
def listen(self, last=None, types=None, max_wait=5):
"""
Listens to the recent changes table. Given no parameters, this function
will return an iterator over the entire recentchanges table and then
continue to "listen" for new changes to come in every 5 seconds.
:Parameters:
last : dict
a recentchanges row to pick up after
types : set ( str )
a set of recentchanges types to filter for
max_wait : float
the maximum number of seconds to wait between repeated queries
:Returns:
A never-ending iterator over change rows.
"""
        while True:
            if last is not None:
                after = last['rc_timestamp']
                after_id = last['rc_id']
            else:
                after = None
                after_id = None
            start = time.time()
            rcs = self.query(after=after, after_id=after_id, types=types,
                             direction="newer")
            for rc in rcs:
                yield rc
                last = rc  # pick up after this row on the next query
            # Never pass a negative duration to sleep()
            time.sleep(max(0, max_wait - (time.time() - start)))
def query(self, before=None, after=None, before_id=None, after_id=None,
types=None, direction=None, limit=None):
"""
Queries the ``recentchanges`` table. See
`<https://www.mediawiki.org/wiki/Manual:Recentchanges_table>`_
:Parameters:
before : :class:`mw.Timestamp`
The maximum timestamp
after : :class:`mw.Timestamp`
The minimum timestamp
            before_id : int
                The maximum ``rc_id``
            after_id : int
                The minimum ``rc_id``
types : set ( str )
Which types of changes to return?
* ``edit`` -- Edits to existing pages
* ``new`` -- Edits that create new pages
* ``move`` -- (obsolete)
* ``log`` -- Log actions (introduced in MediaWiki 1.2)
* ``move_over_redirect`` -- (obsolete)
* ``external`` -- An external recent change. Primarily used by Wikidata
direction : str
"older" or "newer"
limit : int
limit the number of records returned
"""
before = none_or(before, Timestamp)
after = none_or(after, Timestamp)
before_id = none_or(before_id, int)
after_id = none_or(after_id, int)
types = none_or(types, levels=self.TYPES)
direction = none_or(direction, levels=self.DIRECTIONS)
limit = none_or(limit, int)
query = """
SELECT * FROM recentchanges
WHERE 1
"""
values = []
        if before is not None:
            query += " AND rc_timestamp < %s "
            values.append(before.short_format())
        if after is not None:
            query += " AND rc_timestamp > %s "
            values.append(after.short_format())
        if before_id is not None:
            query += " AND rc_id < %s "
            values.append(before_id)
        if after_id is not None:
            query += " AND rc_id > %s "
            values.append(after_id)
        if types is not None:
            query += " AND rc_type IN ({0}) ".format(
                ",".join(str(self.TYPES[t]) for t in types)
            )
        if direction is not None:
            direction = ("ASC " if direction == "newer" else "DESC ")
            query += " ORDER BY rc_timestamp {0}, rc_id {0}".format(direction)
        if limit is not None:
            query += " LIMIT %s "
            values.append(limit)
        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, values)
for row in cursor:
yield row

View File

@@ -0,0 +1,410 @@
import logging
import time
from itertools import chain
from ...types import Timestamp
from ...util import iteration, none_or
from .collection import Collection
logger = logging.getLogger("mw.database.collections.revisions")
class AllRevisions(Collection):
def get(self, rev_id, include_page=False):
"""
        Gets a single revision by ID. Checks both the ``revision`` and
``archive`` tables. This method throws a :class:`KeyError` if a
revision cannot be found.
:Parameters:
rev_id : int
Revision ID
include_page : bool
Join revision returned against ``page``
:Returns:
A revision row
"""
rev_id = int(rev_id)
try:
rev_row = self.db.revisions.get(rev_id, include_page=include_page)
        except KeyError:
            rev_row = self.db.archives.get(rev_id)
return rev_row
def query(self, *args, **kwargs):
"""
Queries revisions (excludes revisions to deleted pages)
:Parameters:
page_id : int
Page identifier. Filter revisions to this page.
user_id : int
User identifier. Filter revisions to those made by this user.
user_text : str
User text (user_name or IP address). Filter revisions to those
made by this user.
before : :class:`mw.Timestamp`
Filter revisions to those made before this timestamp.
after : :class:`mw.Timestamp`
Filter revisions to those made after this timestamp.
before_id : int
Filter revisions to those with an ID before this ID
after_id : int
Filter revisions to those with an ID after this ID
direction : str
"newer" or "older"
limit : int
Limit the number of results
include_page : bool
Join revisions returned against ``page``
:Returns:
An iterator over revision rows.
"""
revisions = self.db.revisions.query(*args, **kwargs)
archives = self.db.archives.query(*args, **kwargs)
if 'direction' in kwargs:
direction = kwargs['direction']
if direction not in self.DIRECTIONS:
raise TypeError("direction must be in {0}".format(self.DIRECTIONS))
if direction == "newer":
collated_revisions = iteration.sequence(
revisions,
archives,
compare=lambda r1, r2:\
(r1['rev_timestamp'], r1['rev_id']) <=
(r2['rev_timestamp'], r2['rev_id'])
)
else: # direction == "older"
collated_revisions = iteration.sequence(
revisions,
archives,
compare=lambda r1, r2:\
(r1['rev_timestamp'], r1['rev_id']) >=
(r2['rev_timestamp'], r2['rev_id'])
)
else:
collated_revisions = chain(revisions, archives)
if 'limit' in kwargs:
limit = kwargs['limit']
for i, rev in enumerate(collated_revisions):
yield rev
                if i + 1 >= limit:
break
else:
for rev in collated_revisions:
yield rev
class Revisions(Collection):
def get(self, rev_id, include_page=False):
"""
        Gets a single revision by ID. Checks the ``revision`` table. This
method throws a :class:`KeyError` if a revision cannot be found.
:Parameters:
rev_id : int
Revision ID
include_page : bool
Join revision returned against ``page``
:Returns:
A revision row
"""
rev_id = int(rev_id)
query = """
SELECT *, FALSE AS archived FROM revision
"""
if include_page:
query += """
INNER JOIN page ON page_id = rev_page
"""
query += " WHERE rev_id = %s"
        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, [rev_id])
for row in cursor:
return row
raise KeyError(rev_id)
def query(self, page_id=None, user_id=None, user_text=None,
before=None, after=None, before_id=None, after_id=None,
direction=None, limit=None, include_page=False):
"""
Queries revisions (excludes revisions to deleted pages)
:Parameters:
page_id : int
Page identifier. Filter revisions to this page.
user_id : int
User identifier. Filter revisions to those made by this user.
user_text : str
User text (user_name or IP address). Filter revisions to those
made by this user.
before : :class:`mw.Timestamp`
Filter revisions to those made before this timestamp.
after : :class:`mw.Timestamp`
Filter revisions to those made after this timestamp.
before_id : int
Filter revisions to those with an ID before this ID
after_id : int
Filter revisions to those with an ID after this ID
direction : str
"newer" or "older"
limit : int
Limit the number of results
include_page : bool
Join revisions returned against ``page``
:Returns:
An iterator over revision rows.
"""
start_time = time.time()
page_id = none_or(page_id, int)
user_id = none_or(user_id, int)
user_text = none_or(user_text, str)
before = none_or(before, Timestamp)
after = none_or(after, Timestamp)
before_id = none_or(before_id, int)
after_id = none_or(after_id, int)
direction = none_or(direction, levels=self.DIRECTIONS)
include_page = bool(include_page)
query = """
SELECT *, FALSE AS archived FROM revision
"""
if include_page:
query += """
INNER JOIN page ON page_id = rev_page
"""
query += """
WHERE 1
"""
values = []
if page_id is not None:
query += " AND rev_page = %s "
values.append(page_id)
if user_id is not None:
query += " AND rev_user = %s "
values.append(user_id)
if user_text is not None:
query += " AND rev_user_text = %s "
values.append(user_text)
if before is not None:
query += " AND rev_timestamp < %s "
values.append(before.short_format())
if after is not None:
query += " AND rev_timestamp > %s "
values.append(after.short_format())
if before_id is not None:
query += " AND rev_id < %s "
values.append(before_id)
if after_id is not None:
query += " AND rev_id > %s "
values.append(after_id)
if direction is not None:
direction = ("ASC " if direction == "newer" else "DESC ")
            if before_id is not None or after_id is not None:
query += " ORDER BY rev_id {0}, rev_timestamp {0}".format(direction)
else:
query += " ORDER BY rev_timestamp {0}, rev_id {0}".format(direction)
if limit is not None:
query += " LIMIT %s "
values.append(limit)
cursor = self.db.shared_connection.cursor()
cursor.execute(query, values)
count = 0
for row in cursor:
yield row
count += 1
logger.debug("%s revisions read in %s seconds" % (count, time.time() - start_time))
class Archives(Collection):
def get(self, rev_id):
"""
        Gets a single revision by ID. Checks the ``archive`` table. This
method throws a :class:`KeyError` if a revision cannot be found.
:Parameters:
rev_id : int
Revision ID
:Returns:
A revision row
"""
rev_id = int(rev_id)
query = """
SELECT
ar_id,
ar_rev_id AS rev_id,
ar_page_id AS rev_page,
ar_page_id AS page_id,
ar_title AS page_title,
ar_namespace AS page_namespace,
ar_text_id AS rev_text_id,
ar_comment AS rev_comment,
ar_user AS rev_user,
ar_user_text AS rev_user_text,
ar_timestamp AS rev_timestamp,
ar_minor_edit AS rev_minor_edit,
ar_deleted AS rev_deleted,
ar_len AS rev_len,
ar_parent_id AS rev_parent_id,
ar_sha1 AS rev_sha1,
TRUE AS archived
FROM archive
WHERE ar_rev_id = %s
"""
        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, [rev_id])
for row in cursor:
return row
raise KeyError(rev_id)
def query(self, page_id=None, user_id=None, user_text=None,
before=None, after=None, before_id=None, after_id=None,
before_ar_id=None, after_ar_id=None,
direction=None, limit=None, include_page=True):
"""
Queries archived revisions (revisions of deleted pages)
:Parameters:
page_id : int
Page identifier. Filter revisions to this page.
user_id : int
User identifier. Filter revisions to those made by this user.
user_text : str
User text (user_name or IP address). Filter revisions to those
made by this user.
before : :class:`mw.Timestamp`
Filter revisions to those made before this timestamp.
after : :class:`mw.Timestamp`
Filter revisions to those made after this timestamp.
before_id : int
Filter revisions to those with an ID before this ID
after_id : int
Filter revisions to those with an ID after this ID
direction : str
"newer" or "older"
limit : int
Limit the number of results
include_page : bool
                This parameter is ignored. It is only here for compatibility
                with :class:`mw.database.Revisions`.
:Returns:
An iterator over revision rows.
"""
        page_id = none_or(page_id, int)
        user_id = none_or(user_id, int)
        user_text = none_or(user_text, str)
        before = none_or(before, Timestamp)
        after = none_or(after, Timestamp)
        before_id = none_or(before_id, int)
        after_id = none_or(after_id, int)
        before_ar_id = none_or(before_ar_id, int)
        after_ar_id = none_or(after_ar_id, int)
        direction = none_or(direction, levels=self.DIRECTIONS)
        limit = none_or(limit, int)
start_time = time.time()
cursor = self.db.shared_connection.cursor()
query = """
SELECT
ar_id,
ar_rev_id AS rev_id,
ar_page_id AS rev_page,
ar_page_id AS page_id,
ar_title AS page_title,
ar_namespace AS page_namespace,
ar_text_id AS rev_text_id,
ar_comment AS rev_comment,
ar_user AS rev_user,
ar_user_text AS rev_user_text,
ar_timestamp AS rev_timestamp,
ar_minor_edit AS rev_minor_edit,
ar_deleted AS rev_deleted,
ar_len AS rev_len,
ar_parent_id AS rev_parent_id,
ar_sha1 AS rev_sha1,
TRUE AS archived
FROM archive
"""
query += """
WHERE 1
"""
values = []
if page_id is not None:
query += " AND ar_page_id = %s "
values.append(page_id)
if user_id is not None:
query += " AND ar_user = %s "
values.append(user_id)
if user_text is not None:
query += " AND ar_user_text = %s "
values.append(user_text)
if before is not None:
query += " AND ar_timestamp < %s "
values.append(before.short_format())
if after is not None:
query += " AND ar_timestamp > %s "
values.append(after.short_format())
if before_id is not None:
query += " AND ar_rev_id < %s "
values.append(before_id)
if after_id is not None:
query += " AND ar_rev_id > %s "
values.append(after_id)
if before_ar_id is not None:
query += " AND ar_id < ? "
values.append(before_ar_id)
if after_ar_id is not None:
query += " AND ar_id > ? "
values.append(after_ar_id)
if direction is not None:
dir = ("ASC " if direction == "newer" else "DESC ")
if before is not None or after is not None:
query += " ORDER BY ar_timestamp {0}, ar_rev_id {0}".format(dir)
elif before_id is not None or after_id is not None:
query += " ORDER BY ar_rev_id {0}, ar_timestamp {0}".format(dir)
else:
query += " ORDER BY ar_id {0}".format(dir)
if limit is not None:
query += " LIMIT %s "
values.append(limit)
cursor.execute(query, values)
count = 0
for row in cursor:
yield row
count += 1
logger.debug("%s revisions read in %s seconds" % (count, time.time() - start_time))

View File

@@ -0,0 +1,154 @@
import logging
import time
from ...types import Timestamp
from ...util import none_or
from .collection import Collection
logger = logging.getLogger("mw.database.collections.users")
class Users(Collection):
CREATION_ACTIONS = {'newusers', 'create', 'create2', 'autocreate',
'byemail'}
def get(self, user_id=None, user_name=None):
"""
Gets a single user row from the database. Raises a :class:`KeyError`
if a user cannot be found.
:Parameters:
user_id : int
User ID
user_name : str
User's name
:Returns:
A user row.
"""
user_id = none_or(user_id, int)
user_name = none_or(user_name, str)
query = """
SELECT user.*
FROM user
"""
values = []
if user_id is not None:
query += """
WHERE user_id = %s
"""
values.append(user_id)
elif user_name is not None:
query += """
WHERE user_name = %s
"""
values.append(user_name)
else:
raise TypeError("Must specify a user identifier.")
cursor = self.db.shared_connection.cursor()
cursor.execute(
query,
values
)
for row in cursor:
return row
raise KeyError(user_id if user_id is not None else user_name)
def query(self, registered_before=None, registered_after=None,
before_id=None, after_id=None, limit=None,
direction=None, self_created_only=False):
"""
Queries users based on various filtering parameters.
:Parameters:
registered_before : :class:`mw.Timestamp`
A timestamp to search before (inclusive)
registered_after : :class:`mw.Timestamp`
A timestamp to search after (inclusive)
before_id : int
A user_id to search before (inclusive)
after_id : int
                A user_id to search after (inclusive)
direction : str
"newer" or "older"
limit : int
Limit the results to at most this number
            self_created_only : bool
                Limit results to self-created user accounts
:Returns:
an iterator over ``user`` table rows
"""
start_time = time.time()
registered_before = none_or(registered_before, Timestamp)
registered_after = none_or(registered_after, Timestamp)
        before_id = none_or(before_id, int)
        after_id = none_or(after_id, int)
direction = none_or(direction, levels=self.DIRECTIONS)
limit = none_or(limit, int)
self_created_only = bool(self_created_only)
query = """
SELECT user.*
FROM user
"""
values = []
if self_created_only:
query += """
INNER JOIN logging ON
                    log_user = user_id AND
log_type = "newusers" AND
log_action = "create"
"""
query += "WHERE 1 "
if registered_before is not None:
query += "AND user_registration <= %s "
values.append(registered_before.short_format())
if registered_after is not None:
query += "AND user_registration >= %s "
values.append(registered_after.short_format())
if before_id is not None:
query += "AND user_id <= %s "
values.append(before_id)
if after_id is not None:
query += "AND user_id >= %s "
values.append(after_id)
query += "GROUP BY user_id " # In case of duplicate log events
if direction is not None:
if registered_before is not None or registered_after is not None:
if direction == "newer":
query += "ORDER BY user_registration ASC "
else:
query += "ORDER BY user_registration DESC "
else:
if direction == "newer":
query += "ORDER BY user_id ASC "
else:
query += "ORDER BY user_id DESC "
if limit is not None:
query += "LIMIT %s "
values.append(limit)
cursor = self.db.shared_connection.cursor()
cursor.execute(query, values)
count = 0
for row in cursor:
yield row
count += 1
logger.debug("%s users queried in %s seconds" % (count, time.time() - start_time))

View File

@@ -0,0 +1,134 @@
import getpass
import logging
import os
import pymysql
import pymysql.cursors
from .collections import AllRevisions, Archives, Pages, Revisions, Users
logger = logging.getLogger("mw.database.db")
class DB:
"""
Represents a connection to a MySQL database.
:Parameters:
        connection : :class:`pymysql.connections.Connection`
            A connection to a MediaWiki database
"""
def __init__(self, connection):
self.shared_connection = connection
self.shared_connection.cursorclass = pymysql.cursors.DictCursor
self.revisions = Revisions(self)
"""
An instance of :class:`mw.database.Revisions`.
"""
self.archives = Archives(self)
"""
An instance of :class:`mw.database.Archives`.
"""
self.all_revisions = AllRevisions(self)
"""
An instance of :class:`mw.database.AllRevisions`.
"""
self.pages = Pages(self)
"""
An instance of :class:`mw.database.Pages`.
"""
self.users = Users(self)
"""
An instance of :class:`mw.database.Users`.
"""
    def __repr__(self):
        # DB stores no args/kwargs of its own, so represent the connection.
        return "%s(%r)" % (
            self.__class__.__name__,
            self.shared_connection
        )
def __str__(self):
return self.__repr__()
@classmethod
def add_arguments(cls, parser, defaults=None):
"""
Adds the arguments to an :class:`argparse.ArgumentParser` in order to
create a database connection.
"""
        defaults = defaults if defaults is not None else {}
default_host = defaults.get('host', "localhost")
parser.add_argument(
'--host', '-h',
help="MySQL database host to connect to (defaults to {0})".format(default_host),
default=default_host
)
default_database = defaults.get('database', getpass.getuser())
parser.add_argument(
'--database', '-d',
help="MySQL database name to connect to (defaults to {0})".format(default_database),
default=default_database
)
default_defaults_file = defaults.get('defaults-file', os.path.expanduser("~/.my.cnf"))
parser.add_argument(
'--defaults-file',
help="MySQL defaults file (defaults to {0})".format(default_defaults_file),
default=default_defaults_file
)
default_user = defaults.get('user', getpass.getuser())
parser.add_argument(
'--user', '-u',
help="MySQL user (defaults to %s)".format(default_user),
default=default_user
)
return parser
@classmethod
def from_arguments(cls, args):
"""
Constructs a :class:`~mw.database.DB`.
Consumes :class:`argparse.ArgumentParser` arguments given by
:meth:`add_arguments` in order to create a :class:`DB`.
:Parameters:
args : :class:`argparse.Namespace`
A collection of argument values returned by :class:`argparse.ArgumentParser`'s :meth:`parse_args()`
"""
        connection = pymysql.connect(
            host=args.host,
            user=args.user,
            database=args.database,
            read_default_file=args.defaults_file
        )
return cls(connection)
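    # Usage sketch (illustrative, not part of the original file): wiring
    # add_arguments() and from_arguments() together in a script. add_help is
    # disabled because add_arguments() registers '-h' as the short option for
    # --host, which would otherwise collide with argparse's built-in help
    # flag.
    #
    #     import argparse
    #     parser = argparse.ArgumentParser(add_help=False)
    #     DB.add_arguments(parser)
    #     args = parser.parse_args()
    #     db = DB.from_arguments(args)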
@classmethod
def from_params(cls, *args, **kwargs):
"""
        Constructs a :class:`~mw.database.DB`. Passes `*args` and `**kwargs`
        to :func:`pymysql.connect` and configures the connection.
        :Parameters:
            args : `mixed`
                Positional arguments passed through to :func:`pymysql.connect`
            kwargs : `mixed`
                Keyword arguments passed through to :func:`pymysql.connect`
"""
kwargs['cursorclass'] = pymysql.cursors.DictCursor
        if 'db' in kwargs:
            kwargs['database'] = kwargs['db']
            del kwargs['db']
connection = pymysql.connect(*args, **kwargs)
return cls(connection)
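# Usage sketch (illustrative, not part of the original file): the connection
# parameters here are assumptions.
#
#     db = DB.from_params(host="localhost", user="research",
#                         database="enwiki")
#     for rev in db.revisions.query(limit=10):
#         print(rev['rev_id'])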

View File

@@ -0,0 +1,14 @@
"""
A package with utilities for managing persistent word analysis across text
versions of a document. `PersistenceState` is the highest level of the
interface and the part of the system that's most interesting externally.
`Word`s are also very important. The current implementation of `Word` only
accounts for the number of revisions in which a Word is visible. If
persistent word views (or something similar) are intended to be kept,
refactoring will be necessary.
"""
from .state import State
from .tokens import Tokens, Token
from . import defaults
from . import api

View File

@@ -0,0 +1,85 @@
from .. import reverts
from ...util import none_or
from .state import State
def track(session, rev_id, page_id=None, revert_radius=reverts.defaults.RADIUS,
future_revisions=reverts.defaults.RADIUS, properties=None):
"""
Computes a persistence score for a revision by processing the revisions
that took place around it.
:Parameters:
session : :class:`mw.api.Session`
An API session to make use of
rev_id : int
the ID of the revision to check
page_id : int
the ID of the page the revision occupies (slower if not provided)
        revert_radius : int
            a positive integer indicating the maximum number of revisions
            that can be reverted
        future_revisions : int
            the maximum number of future revisions to process (defaults to
            the revert radius)
        properties : set( str )
            additional revision properties to request from the API
if not hasattr(session, "revisions"):
raise TypeError("session is wrong type. Expected a mw.api.Session.")
rev_id = int(rev_id)
page_id = none_or(page_id, int)
revert_radius = int(revert_radius)
if revert_radius < 1:
raise TypeError("invalid radius. Expected a positive integer.")
properties = set(properties) if properties is not None else set()
    # If we don't have the page_id, we're going to need to look it up
if page_id is None:
rev = session.revisions.get(rev_id, properties={'ids'})
page_id = rev['page']['pageid']
# Load history and current rev
current_and_past_revs = list(session.revisions.query(
pageids={page_id},
limit=revert_radius + 1,
start_id=rev_id,
direction="older",
properties={'ids', 'timestamp', 'content', 'sha1'} | properties
))
try:
# Extract current rev and reorder history
current_rev, past_revs = (
current_and_past_revs[0], # Current rev is the first one returned
reversed(current_and_past_revs[1:]) # The rest are past revs, but they are in the wrong order
)
except IndexError:
# Only way to get here is if there isn't enough history. Couldn't be
# reverted. Just return None.
return None
# Load future revisions
future_revs = session.revisions.query(
pageids={page_id},
limit=future_revisions,
start_id=rev_id + 1, # Ensures that we skip the current revision
direction="newer",
properties={'ids', 'timestamp', 'content', 'sha1'} | properties
)
state = State(revert_radius=revert_radius)
# Process old revisions
for rev in past_revs:
state.process(rev.get('*', ""), rev, rev.get('sha1'))
# Process current revision
_, tokens_added, _ = state.process(current_rev.get('*'), current_rev,
current_rev.get('sha1'))
# Process new revisions
future_revs = list(future_revs)
for rev in future_revs:
state.process(rev.get('*', ""), rev, rev.get('sha1'))
return current_rev, tokens_added, future_revs
score = track
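# Usage sketch (illustrative, not part of the original file): the API URL and
# rev_id are hypothetical. track() returns None when there isn't enough
# history; otherwise, tokens added by the revision accumulate one entry in
# their `revisions` list per later revision they survive into.
#
#     from mw import api
#     session = api.Session("https://en.wikipedia.org/w/api.php")
#     result = track(session, rev_id=987654321)
#     if result is not None:
#         current_rev, tokens_added, future_revs = result
#         persisted = [t for t in tokens_added if len(t.revisions) > 1]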

View File

@@ -0,0 +1,11 @@
from . import tokenization, difference
TOKENIZE = tokenization.wikitext_split
"""
The standard tokenizing function.
"""
DIFF = difference.sequence_matcher
"""
The standard diff function
"""

View File

@@ -0,0 +1,49 @@
from difflib import SequenceMatcher
def sequence_matcher(old, new):
"""
Generates a sequence of operations using :class:`difflib.SequenceMatcher`.
:Parameters:
old : list( `hashable` )
Old tokens
new : list( `hashable` )
New tokens
    :Returns:
Minimal operations needed to convert `old` to `new`
"""
sm = SequenceMatcher(None, list(old), list(new))
return sm.get_opcodes()
def apply(ops, old, new):
"""
    Applies operations (a delta) to convert `old` into `new`.
:Parameters:
ops : list((op, a1, a2, b1, b2))
Operations to perform
old : list( `hashable` )
Old tokens
new : list( `hashable` )
New tokens
:Returns:
An iterator over elements matching `new` but copied from `old`
"""
for code, a_start, a_end, b_start, b_end in ops:
if code == "insert":
for t in new[b_start:b_end]:
yield t
elif code == "replace":
for t in new[b_start:b_end]:
yield t
elif code == "equal":
for t in old[a_start:a_end]:
yield t
elif code == "delete":
pass
else:
assert False, \
"encounted an unrecognized operation code: " + repr(code)

View File

@@ -0,0 +1,149 @@
from hashlib import sha1
from . import defaults
from .. import reverts
from .tokens import Token, Tokens
class Version:
    __slots__ = ('tokens',)
def __init__(self):
self.tokens = None
class State:
"""
Represents the state of word persistence in a page.
See `<https://meta.wikimedia.org/wiki/Research:Content_persistence>`_
:Parameters:
tokenize : function( `str` ) --> list( `str` )
A tokenizing function
diff : function(list( `str` ), list( `str` )) --> list( `ops` )
A function to perform a difference between token lists
revert_radius : int
a positive integer indicating the maximum revision distance that a revert can span.
revert_detector : :class:`mw.lib.reverts.Detector`
a revert detector to start process with
:Example:
>>> from pprint import pprint
>>> from mw.lib import persistence
>>>
>>> state = persistence.State()
>>>
>>> pprint(state.process("Apples are red.", revision=1))
([Token(text='Apples', revisions=[1]),
Token(text=' ', revisions=[1]),
Token(text='are', revisions=[1]),
Token(text=' ', revisions=[1]),
Token(text='red', revisions=[1]),
Token(text='.', revisions=[1])],
[Token(text='Apples', revisions=[1]),
Token(text=' ', revisions=[1]),
Token(text='are', revisions=[1]),
Token(text=' ', revisions=[1]),
Token(text='red', revisions=[1]),
Token(text='.', revisions=[1])],
[])
>>> pprint(state.process("Apples are blue.", revision=2))
([Token(text='Apples', revisions=[1, 2]),
Token(text=' ', revisions=[1, 2]),
Token(text='are', revisions=[1, 2]),
Token(text=' ', revisions=[1, 2]),
Token(text='blue', revisions=[2]),
Token(text='.', revisions=[1, 2])],
[Token(text='blue', revisions=[2])],
[Token(text='red', revisions=[1])])
>>> pprint(state.process("Apples are red.", revision=3)) # A revert!
([Token(text='Apples', revisions=[1, 2, 3]),
Token(text=' ', revisions=[1, 2, 3]),
Token(text='are', revisions=[1, 2, 3]),
Token(text=' ', revisions=[1, 2, 3]),
Token(text='red', revisions=[1, 3]),
Token(text='.', revisions=[1, 2, 3])],
[],
[])
"""
def __init__(self, tokenize=defaults.TOKENIZE, diff=defaults.DIFF,
revert_radius=reverts.defaults.RADIUS,
revert_detector=None):
self.tokenize = tokenize
self.diff = diff
# Either pass a detector or the revert radius so I can make one
if revert_detector is None:
self.revert_detector = reverts.Detector(int(revert_radius))
else:
self.revert_detector = revert_detector
# Stores the last tokens
self.last = None
def process(self, text, revision=None, checksum=None):
"""
        Modifies the internal state based on a change to the content and returns
the sets of words added and removed.
:Parameters:
text : str
The text content of a revision
revision : `mixed`
Revision meta data
checksum : str
A checksum hash of the text content (will be generated if not provided)
:Returns:
Three :class:`~mw.lib.persistence.Tokens` lists
current_tokens : :class:`~mw.lib.persistence.Tokens`
A sequence of :class:`~mw.lib.persistence.Token` for the
processed revision
tokens_added : :class:`~mw.lib.persistence.Tokens`
A set of tokens that were inserted by the processed revision
tokens_removed : :class:`~mw.lib.persistence.Tokens`
A sequence of :class:`~mw.lib.persistence.Token` removed by the
processed revision
"""
if checksum is None:
checksum = sha1(bytes(text, 'utf8')).hexdigest()
version = Version()
revert = self.revert_detector.process(checksum, version)
if revert is not None: # Revert
# Empty words.
tokens_added = Tokens()
tokens_removed = Tokens()
# Extract reverted_to revision
_, _, reverted_to = revert
version.tokens = reverted_to.tokens
else:
if self.last is None: # First version of the page!
version.tokens = Tokens(Token(t) for t in self.tokenize(text))
tokens_added = version.tokens
tokens_removed = Tokens()
else:
# NOTICE: HEAVY COMPUTATION HERE!!!
#
# OK. It's not that heavy. It's just performing a diff,
# but you're still going to spend most of your time here.
# Diffs usually run in O(n^2) -- O(n^3) time and most tokenizers
# produce a lot of tokens.
version.tokens, tokens_added, tokens_removed = \
self.last.tokens.compare(self.tokenize(text), self.diff)
version.tokens.persist(revision)
self.last = version
return version.tokens, tokens_added, tokens_removed

View File

@@ -0,0 +1,12 @@
from nose.tools import eq_
from .. import difference
def test_sequence_matcher():
t1 = "foobar derp hepl derpl"
t2 = "fooasldal 3 hepl asl a derpl"
ops = difference.sequence_matcher(t1, t2)
eq_("".join(difference.apply(ops, t1, t2)), t2)

View File

@@ -0,0 +1,25 @@
from nose.tools import eq_
from ..state import State
def test_state():
contents_revisions = [
("Apples are red.", 0),
("Apples are blue.", 1),
("Apples are red.", 2),
("Apples are tasty and red.", 3),
("Apples are tasty and blue.", 4)
]
state = State()
token_sets = [state.process(c, r) for c, r in contents_revisions]
for i, (content, revision) in enumerate(contents_revisions):
eq_("".join(token_sets[i][0].texts()), content)
eq_(token_sets[0][0][0].text, "Apples")
eq_(len(token_sets[0][0][0].revisions), 5)
eq_(token_sets[0][0][4].text, "red")
eq_(len(token_sets[0][0][4].revisions), 3)

View File

@@ -0,0 +1,10 @@
from nose.tools import eq_
from .. import tokenization
def test_wikitext_split():
eq_(
list(tokenization.wikitext_split("foo bar herp {{derp}}")),
["foo", " ", "bar", " ", "herp", " ", "{{", "derp", "}}"]
)

View File

@@ -0,0 +1,16 @@
import re
def wikitext_split(text):
"""
Performs the simplest possible split of latin character-based languages
and wikitext.
:Parameters:
text : str
Text to split.
"""
return re.findall(
r"[\w]+|\[\[|\]\]|\{\{|\}\}|\n+| +|&\w+;|'''|''|=+|\{\||\|\}|\|\-|.",
text
)

View File

@@ -0,0 +1,98 @@
class Token:
"""
Represents a chunk of text and the revisions of a page that it survived.
"""
__slots__ = ('text', 'revisions')
def __init__(self, text, revisions=None):
self.text = text
"""
The text of the token.
"""
self.revisions = revisions if revisions is not None else []
"""
The meta data for the revisions that the token has appeared within.
"""
def persist(self, revision):
self.revisions.append(revision)
def __repr__(self):
return "{0}({1})".format(
self.__class__.__name__,
", ".join([
"text={0}".format(repr(self.text)),
"revisions={0}".format(repr(self.revisions))
])
)
class Tokens(list):
"""
Represents a :class:`list` of :class:`~mw.lib.persistence.Token` with some
useful helper functions.
:Example:
>>> from mw.lib.persistence import Token, Tokens
>>>
>>> tokens = Tokens()
>>> tokens.append(Token("foo"))
>>> tokens.extend([Token(" "), Token("bar")])
>>>
>>> tokens[0]
Token(text='foo', revisions=[])
>>>
>>> "".join(tokens.texts())
'foo bar'
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def persist(self, revision):
for token in self:
token.persist(revision)
def texts(self):
for token in self:
yield token.text
def compare(self, new, diff):
old = self.texts()
return self.apply_diff(diff(old, new), self, new)
@classmethod
def apply_diff(cls, ops, old, new):
tokens = cls()
tokens_added = cls()
tokens_removed = cls()
for code, a_start, a_end, b_start, b_end in ops:
if code == "insert":
for token_text in new[b_start:b_end]:
token = Token(token_text)
tokens.append(token)
tokens_added.append(token)
elif code == "replace":
for token_text in new[b_start:b_end]:
token = Token(token_text)
tokens.append(token)
tokens_added.append(token)
tokens_removed.extend(t for t in old[a_start:a_end])
elif code == "equal":
tokens.extend(old[a_start:a_end])
elif code == "delete":
tokens_removed.extend(old[a_start:a_end])
else:
assert False, \
"encounted an unrecognized operation code: " + repr(code)
return (tokens, tokens_added, tokens_removed)
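# Usage sketch (illustrative, not part of the original file), continuing the
# Tokens docstring example above ("foo", " ", "bar"); the import path is the
# package's public one.
#
#     from mw.lib.persistence import defaults
#     current, added, removed = tokens.compare(["foo", " ", "baz"],
#                                              defaults.DIFF)
#     [t.text for t in added]    # -> ['baz']
#     [t.text for t in removed]  # -> ['bar']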

View File

@@ -0,0 +1,24 @@
"""
This module provides a set of utilities for detecting identity reverts in
revisioned content.
To detect reverts in a stream of revisions to a single page, you can use
:func:`detect`. If you'll be detecting reverts in a collection of pages or
would, for some other reason, prefer to process revisions one at a time,
:class:`Detector` and its :meth:`~Detector.process` method will allow you to
do so.
To detect reverts one at a time and arbitrarily, you can use the `check()`
functions:
* :func:`database.check` and :func:`database.check_row` use a :class:`mw.database.DB`
* :func:`api.check` and :func:`api.check_rev` use a :class:`mw.api.Session`
Note that these functions are less performant than detecting reverts in a
stream of page revisions, but they can be practical when trying to identify
reverted revisions in a user's contribution history.
"""
from .detector import Detector, Revert
from .functions import detect, reverts
from . import database
from . import api
from . import defaults
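# Usage sketch (illustrative, not part of the original file): this assumes
# detect() consumes (checksum, revision) pairs, mirroring how
# Detector.process(checksum, revision) is called elsewhere in this library,
# and that the Revert tuple's first field is the reverting revision (its last
# field is unpacked as reverted_to in lib/persistence/api.py above). The
# revision payloads are hypothetical.
#
#     checksum_revisions = [
#         ("aaa", {'rev_id': 1}),
#         ("bbb", {'rev_id': 2}),
#         ("aaa", {'rev_id': 3}),  # identity revert back to rev 1
#     ]
#     for revert in detect(checksum_revisions):
#         print(revert.reverting['rev_id'])  # -> 3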

Some files were not shown because too many files have changed in this diff.