1
0

Initial commit

p#	new file:   runwikiq.sh
This commit is contained in:
2018-06-02 15:32:19 -07:00
commit 72633c193b
202 changed files with 21929 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
"""
A package with utilities for managing the persistent word analysis across text
versions of a document. `PersistenceState` is the highest level of the
interface and the part of the system that's most interesting externally. `Word`s
are also very important. The current implementation of `Word` only accounts
for the number of revisions in which a Word is visible. If persistent word
views (or something similar) is intended to be kept, refactoring will be
necessary.
"""
from .state import State
from .tokens import Tokens, Token
from . import defaults
from . import api

View File

@@ -0,0 +1,85 @@
from .. import reverts
from ...util import none_or
from .state import State
def track(session, rev_id, page_id=None, revert_radius=reverts.defaults.RADIUS,
          future_revisions=reverts.defaults.RADIUS, properties=None):
    """
    Computes a persistence score for a revision by processing the revisions
    that took place around it.

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        revert_radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        future_revisions : int
            the number of revisions after `rev_id` to process persistence against
        properties : set( str )
            additional revision properties to request from the API

    :Returns:
        A (current_rev, tokens_added, future_revs) triple, or `None` if the
        revision has no available history.
    """
    if not hasattr(session, "revisions"):
        raise TypeError("session is wrong type. Expected a mw.api.Session.")

    rev_id = int(rev_id)
    page_id = none_or(page_id, int)
    revert_radius = int(revert_radius)
    if revert_radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")
    properties = set(properties) if properties is not None else set()

    # If we don't have the page_id, we're going to need to look them up
    if page_id is None:
        rev = session.revisions.get(rev_id, properties={'ids'})
        page_id = rev['page']['pageid']

    # Load history and current rev
    current_and_past_revs = list(session.revisions.query(
        pageids={page_id},
        limit=revert_radius + 1,
        start_id=rev_id,
        direction="older",
        properties={'ids', 'timestamp', 'content', 'sha1'} | properties
    ))

    try:
        # Extract current rev and reorder history
        current_rev, past_revs = (
            current_and_past_revs[0],  # Current rev is the first one returned
            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
        )
    except IndexError:
        # Only way to get here is if there isn't enough history. Couldn't be
        # reverted. Just return None.
        return None

    # Load future revisions
    future_revs = session.revisions.query(
        pageids={page_id},
        limit=future_revisions,
        start_id=rev_id + 1,  # Ensures that we skip the current revision
        direction="newer",
        properties={'ids', 'timestamp', 'content', 'sha1'} | properties
    )

    state = State(revert_radius=revert_radius)

    # Process old revisions
    for rev in past_revs:
        state.process(rev.get('*', ""), rev, rev.get('sha1'))

    # Process current revision.  Bug fix: use a "" default like the other
    # calls -- a hidden/missing content field previously passed None into
    # the tokenizer.
    _, tokens_added, _ = state.process(current_rev.get('*', ""), current_rev,
                                       current_rev.get('sha1'))

    # Process new revisions
    future_revs = list(future_revs)
    for rev in future_revs:
        state.process(rev.get('*', ""), rev, rev.get('sha1'))

    return current_rev, tokens_added, future_revs

score = track

View File

@@ -0,0 +1,11 @@
from . import tokenization, difference
# Default tokenizing function, used when a State is built without an
# explicit `tokenize` argument.
TOKENIZE = tokenization.wikitext_split
"""
The standard tokenizing function.
"""

# Default diff function (difflib-based), used when a State is built without
# an explicit `diff` argument.
DIFF = difference.sequence_matcher
"""
The standard diff function
"""

View File

@@ -0,0 +1,49 @@
from difflib import SequenceMatcher
def sequence_matcher(old, new):
    """
    Generates a sequence of operations using :class:`difflib.SequenceMatcher`.

    :Parameters:
        old : list( `hashable` )
            Old tokens
        new : list( `hashable` )
            New tokens

    Returns:
        Minimal operations needed to convert `old` to `new`
    """
    old_tokens = list(old)
    new_tokens = list(new)
    return SequenceMatcher(None, old_tokens, new_tokens).get_opcodes()
def apply(ops, old, new):
    """
    Applies operations (delta) to copy items from `old` to `new`.

    :Parameters:
        ops : list((op, a1, a2, b1, b2))
            Operations to perform
        old : list( `hashable` )
            Old tokens
        new : list( `hashable` )
            New tokens

    :Returns:
        An iterator over elements matching `new` but copied from `old`
    """
    for code, a_start, a_end, b_start, b_end in ops:
        if code in ("insert", "replace"):
            # Both cases emit the new-side tokens (the original duplicated
            # this loop in two identical branches).
            yield from new[b_start:b_end]
        elif code == "equal":
            # Unchanged content is copied from the old side.
            yield from old[a_start:a_end]
        elif code == "delete":
            # Deleted content contributes nothing to the output.
            pass
        else:
            assert False, \
                "encountered an unrecognized operation code: " + repr(code)

View File

@@ -0,0 +1,149 @@
from hashlib import sha1
from . import defaults
from .. import reverts
from .tokens import Token, Tokens
class Version:
    """
    Lightweight holder for the token state of one processed revision; used
    as the value handed to the revert detector.
    """
    # Fix: __slots__ should be a tuple.  The original bare string
    # ('tokens') only worked because a single str is accepted as one slot
    # name; adding a second slot would have silently broken it.
    __slots__ = ('tokens',)

    def __init__(self):
        # None until the revision's tokens are computed (or copied from a
        # reverted-to version).
        self.tokens = None
class State:
    """
    Represents the state of word persistence in a page.
    See `<https://meta.wikimedia.org/wiki/Research:Content_persistence>`_

    :Parameters:
        tokenize : function( `str` ) --> list( `str` )
            A tokenizing function
        diff : function(list( `str` ), list( `str` )) --> list( `ops` )
            A function to perform a difference between token lists
        revert_radius : int
            a positive integer indicating the maximum revision distance that a revert can span.
        revert_detector : :class:`mw.lib.reverts.Detector`
            a revert detector to start process with

    :Example:
        >>> from pprint import pprint
        >>> from mw.lib import persistence
        >>>
        >>> state = persistence.State()
        >>>
        >>> pprint(state.process("Apples are red.", revision=1))
        ([Token(text='Apples', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='are', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='red', revisions=[1]),
          Token(text='.', revisions=[1])],
         [Token(text='Apples', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='are', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='red', revisions=[1]),
          Token(text='.', revisions=[1])],
         [])
        >>> pprint(state.process("Apples are blue.", revision=2))
        ([Token(text='Apples', revisions=[1, 2]),
          Token(text=' ', revisions=[1, 2]),
          Token(text='are', revisions=[1, 2]),
          Token(text=' ', revisions=[1, 2]),
          Token(text='blue', revisions=[2]),
          Token(text='.', revisions=[1, 2])],
         [Token(text='blue', revisions=[2])],
         [Token(text='red', revisions=[1])])
        >>> pprint(state.process("Apples are red.", revision=3))  # A revert!
        ([Token(text='Apples', revisions=[1, 2, 3]),
          Token(text=' ', revisions=[1, 2, 3]),
          Token(text='are', revisions=[1, 2, 3]),
          Token(text=' ', revisions=[1, 2, 3]),
          Token(text='red', revisions=[1, 3]),
          Token(text='.', revisions=[1, 2, 3])],
         [],
         [])
    """

    def __init__(self, tokenize=defaults.TOKENIZE, diff=defaults.DIFF,
                 revert_radius=reverts.defaults.RADIUS,
                 revert_detector=None):
        self.tokenize = tokenize
        self.diff = diff

        # Either pass a detector or the revert radius so I can make one
        if revert_detector is None:
            self.revert_detector = reverts.Detector(int(revert_radius))
        else:
            self.revert_detector = revert_detector

        # Stores the last tokens (a Version); None until the first call to
        # process().
        self.last = None

    def process(self, text, revision=None, checksum=None):
        """
        Modifies the internal state based a change to the content and returns
        the sets of words added and removed.

        :Parameters:
            text : str
                The text content of a revision
            revision : `mixed`
                Revision meta data
            checksum : str
                A checksum hash of the text content (will be generated if not provided)

        :Returns:
            Three :class:`~mw.lib.persistence.Tokens` lists

            current_tokens : :class:`~mw.lib.persistence.Tokens`
                A sequence of :class:`~mw.lib.persistence.Token` for the
                processed revision
            tokens_added : :class:`~mw.lib.persistence.Tokens`
                A set of tokens that were inserted by the processed revision
            tokens_removed : :class:`~mw.lib.persistence.Tokens`
                A sequence of :class:`~mw.lib.persistence.Token` removed by the
                processed revision
        """
        # Derive the checksum from the text when the caller didn't supply
        # one; the detector matches revisions purely by checksum.
        if checksum is None:
            checksum = sha1(bytes(text, 'utf8')).hexdigest()

        version = Version()

        revert = self.revert_detector.process(checksum, version)
        if revert is not None:  # Revert
            # Empty words.  A revert adds/removes nothing new; it restores
            # the reverted-to version's tokens (shared, so persistence
            # counts keep accumulating on the same Token objects).
            tokens_added = Tokens()
            tokens_removed = Tokens()

            # Extract reverted_to revision
            _, _, reverted_to = revert
            version.tokens = reverted_to.tokens
        else:
            if self.last is None:  # First version of the page!
                # Everything in the first revision counts as "added".
                version.tokens = Tokens(Token(t) for t in self.tokenize(text))
                tokens_added = version.tokens
                tokens_removed = Tokens()
            else:
                # NOTICE: HEAVY COMPUTATION HERE!!!
                #
                # OK.  It's not that heavy.  It's just performing a diff,
                # but you're still going to spend most of your time here.
                # Diffs usually run in O(n^2) -- O(n^3) time and most
                # tokenizers produce a lot of tokens.
                version.tokens, tokens_added, tokens_removed = \
                    self.last.tokens.compare(self.tokenize(text), self.diff)

        # Record this revision against every surviving token.
        version.tokens.persist(revision)

        self.last = version

        return version.tokens, tokens_added, tokens_removed

View File

@@ -0,0 +1,12 @@
from nose.tools import eq_
from .. import difference
def test_sequence_matcher():
    source = "foobar derp hepl derpl"
    target = "fooasldal 3 hepl asl a derpl"

    # Applying the computed ops to `source` must reproduce `target` exactly.
    operations = difference.sequence_matcher(source, target)
    rebuilt = "".join(difference.apply(operations, source, target))
    eq_(rebuilt, target)

View File

@@ -0,0 +1,25 @@
from nose.tools import eq_
from ..state import State
def test_state():
    contents_revisions = [
        ("Apples are red.", 0),
        ("Apples are blue.", 1),
        ("Apples are red.", 2),
        ("Apples are tasty and red.", 3),
        ("Apples are tasty and blue.", 4)
    ]

    state = State()
    token_sets = [state.process(content, revision)
                  for content, revision in contents_revisions]

    # Every processed revision must round-trip back to its original text.
    for token_set, (content, _) in zip(token_sets, contents_revisions):
        eq_("".join(token_set[0].texts()), content)

    first_tokens = token_sets[0][0]
    eq_(first_tokens[0].text, "Apples")
    eq_(len(first_tokens[0].revisions), 5)
    eq_(first_tokens[4].text, "red")
    eq_(len(first_tokens[4].revisions), 3)

View File

@@ -0,0 +1,10 @@
from nose.tools import eq_
from .. import tokenization
def test_wikitext_split():
    expected = ["foo", " ", "bar", " ", "herp", " ", "{{", "derp", "}}"]
    actual = list(tokenization.wikitext_split("foo bar herp {{derp}}"))
    eq_(actual, expected)

View File

@@ -0,0 +1,16 @@
import re
# Compiled once at import time instead of on every call.  Alternatives match
# words, wikitext bracket/brace/quote markup, runs of newlines or spaces,
# HTML entities, and finally any single character.
_TOKEN_RE = re.compile(
    r"[\w]+|\[\[|\]\]|\{\{|\}\}|\n+| +|&\w+;|'''|''|=+|\{\||\|\}|\|\-|."
)


def wikitext_split(text):
    """
    Performs the simplest possible split of latin character-based languages
    and wikitext.

    :Parameters:
        text : str
            Text to split.

    :Returns:
        A list of token strings; concatenating them reproduces `text`.
    """
    return _TOKEN_RE.findall(text)

View File

@@ -0,0 +1,98 @@
class Token:
    """
    Represents a chunk of text and the revisions of a page that it survived.
    """
    __slots__ = ('text', 'revisions')

    def __init__(self, text, revisions=None):
        # The text of the token.
        self.text = text

        # The meta data for the revisions that the token has appeared within.
        self.revisions = [] if revisions is None else revisions

    def persist(self, revision):
        self.revisions.append(revision)

    def __repr__(self):
        fields = ", ".join([
            "text={0}".format(repr(self.text)),
            "revisions={0}".format(repr(self.revisions))
        ])
        return "{0}({1})".format(self.__class__.__name__, fields)
class Tokens(list):
    """
    Represents a :class:`list` of :class:`~mw.lib.persistence.Token` with some
    useful helper functions.

    :Example:
        >>> from mw.lib.persistence import Token, Tokens
        >>>
        >>> tokens = Tokens()
        >>> tokens.append(Token("foo"))
        >>> tokens.extend([Token(" "), Token("bar")])
        >>>
        >>> tokens[0]
        Token(text='foo', revisions=[])
        >>>
        >>> "".join(tokens.texts())
        'foo bar'
    """
    # Note: the original defined an __init__ that only forwarded to
    # super().__init__ -- removed as redundant.

    def persist(self, revision):
        """Records `revision` against every token in this list."""
        for token in self:
            token.persist(revision)

    def texts(self):
        """Yields each token's text, in order."""
        for token in self:
            yield token.text

    def compare(self, new, diff):
        """
        Diffs this token list's texts against the token-text sequence `new`
        using `diff` and returns (tokens, tokens_added, tokens_removed).
        """
        old = self.texts()

        return self.apply_diff(diff(old, new), self, new)

    @classmethod
    def apply_diff(cls, ops, old, new):
        """
        Applies diff operations, copying surviving tokens from `old` and
        minting fresh tokens for content that appears in `new`.

        :Returns:
            A (tokens, tokens_added, tokens_removed) triple of `cls` lists.
        """
        tokens = cls()
        tokens_added = cls()
        tokens_removed = cls()

        for code, a_start, a_end, b_start, b_end in ops:
            if code in ("insert", "replace"):
                # New content gets brand new Token objects.
                for token_text in new[b_start:b_end]:
                    token = Token(token_text)
                    tokens.append(token)
                    tokens_added.append(token)

                if code == "replace":
                    # The replaced old tokens were removed by this change.
                    tokens_removed.extend(old[a_start:a_end])
            elif code == "equal":
                # Unchanged content keeps its original (persisted) tokens.
                tokens.extend(old[a_start:a_end])
            elif code == "delete":
                tokens_removed.extend(old[a_start:a_end])
            else:
                assert False, \
                    "encountered an unrecognized operation code: " + repr(code)

        return (tokens, tokens_added, tokens_removed)

View File

@@ -0,0 +1,24 @@
"""
This module provides a set of utilities for detecting identity reverts in
revisioned content.
To detect reverts in a stream of revisions to a single page, you can use
:func:`detect`. If you'll be detecting reverts in a collection of pages or
would, for some other reason, prefer to process revisions one at a time,
:class:`Detector` and its :meth:`~Detector.process` will allow you to do so.
To detect reverts one-at-a-time and arbitrarily, you can use the `check()`
functions:
* :func:`database.check` and :func:`database.check_row` use a :class:`mw.database.DB`
* :func:`api.check` and :func:`api.check_rev` use a :class:`mw.api.Session`
Note that these functions are less performant than detecting reverts in a
stream of page revisions. This can be practical when trying to identify
reverted revisions in a user's contribution history.
"""
from .detector import Detector, Revert
from .functions import detect, reverts
from . import database
from . import api
from . import defaults

View File

@@ -0,0 +1,134 @@
from itertools import chain
from . import defaults
from ...types import Timestamp
from ...util import none_or
from .dummy_checksum import DummyChecksum
from .functions import detect
def check_rev(session, rev, **kwargs):
    """
    Checks whether a revision was reverted (identity) and returns a named
    tuple of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev : dict
            a revision dict containing 'revid' and 'page.id'
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        before : :class:`mw.Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
        properties : set( str )
            a set of properties to include in revisions (see :class:`mw.api.Revisions`)
    """
    # extract rev_id and page_id
    if 'revid' in rev:
        rev_id = rev['revid']
    else:
        # Bug fix: the message previously named 'rev_id', but the key that
        # is actually checked is 'revid'.
        raise TypeError("rev must have 'revid'")
    if 'page' in rev:
        page_id = rev['page']['id']
    elif 'pageid' in rev:
        page_id = rev['pageid']
    else:
        raise TypeError("rev must have 'page' or 'pageid'")

    # run the regular check
    return check(session, rev_id, page_id=page_id, **kwargs)
def check(session, rev_id, page_id=None, radius=defaults.RADIUS,
          before=None, window=None, properties=None):
    """
    Checks whether a revision was reverted (identity) and returns a named tuple
    of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        radius : int
            a positive integer indicating the maximum number of revisions
            that can be reverted
        before : :class:`mw.Timestamp`
            if set, limits the search for *reverting* revisions to those which
            were saved before this timestamp
        window : int
            if set, limits the search for *reverting* revisions to those which
            were saved within `window` seconds after the reverted edit
        properties : set( str )
            a set of properties to include in revisions (see :class:`mw.api.Revisions`)

    :Returns:
        A :class:`Revert` if one is detected for `rev_id`, otherwise `None`.
    """
    # Duck-typed validation of the session argument.
    if not hasattr(session, "revisions"):
        raise TypeError("session wrong type. Expected a mw.api.Session.")

    rev_id = int(rev_id)
    radius = int(radius)
    if radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")
    page_id = none_or(page_id, int)
    before = none_or(before, Timestamp)
    properties = set(properties) if properties is not None else set()

    # If we don't have the page_id, we're going to need to look them up
    if page_id is None:
        rev = session.revisions.get(rev_id, properties={'ids'})
        page_id = rev['page']['pageid']

    # Load history and current rev (rev_id itself plus up to `radius`
    # revisions before it).
    current_and_past_revs = list(session.revisions.query(
        pageids={page_id},
        limit=radius + 1,
        start_id=rev_id,
        direction="older",
        properties={'ids', 'timestamp', 'sha1'} | properties
    ))

    try:
        # Extract current rev and reorder history
        current_rev, past_revs = (
            current_and_past_revs[0],  # Current rev is the first one returned
            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
        )
    except IndexError:
        # Only way to get here is if there isn't enough history. Couldn't be
        # reverted. Just return None.
        return None

    # Translate a seconds `window` into an absolute `before` timestamp when
    # the caller didn't supply one.
    if window is not None and before is None:
        before = Timestamp(current_rev['timestamp']) + window

    # Load future revisions
    future_revs = session.revisions.query(
        pageids={page_id},
        limit=radius,
        start_id=rev_id + 1,  # Ensures that we skip the current revision
        end=before,
        direction="newer",
        properties={'ids', 'timestamp', 'sha1'} | properties
    )

    # Convert to an iterable of (checksum, rev) pairs for detect() to
    # consume.  Revisions without a sha1 (e.g. hidden) get a DummyChecksum,
    # which never matches anything but itself.
    checksum_revisions = chain(
        ((rev['sha1'] if 'sha1' in rev else DummyChecksum(), rev)
         for rev in past_revs),
        [(current_rev.get('sha1', DummyChecksum()), current_rev)],
        ((rev['sha1'] if 'sha1' in rev else DummyChecksum(), rev)
         for rev in future_revs),
    )

    for revert in detect(checksum_revisions, radius=radius):
        # Check that this is a relevant revert -- i.e. rev_id itself was
        # among the reverted revisions.
        if rev_id in [rev['revid'] for rev in revert.reverteds]:
            return revert

    return None

View File

@@ -0,0 +1,148 @@
import random
from itertools import chain
from . import defaults
from ...types import Timestamp
from ...util import none_or
from .dummy_checksum import DummyChecksum
from .functions import detect
# Hexadecimal digits used to build fake checksums.
HEX = "1234567890abcdef"


def random_sha1():
    """
    Generates a random 40-character hex string.  Used as a stand-in checksum
    in order to not do weird things with a dummy revision.

    (Fix: the original left this text as an orphaned no-op string statement
    after the function body.)
    """
    return ''.join(random.choice(HEX) for _ in range(40))
def check_row(db, rev_row, **kwargs):
    """
    Checks whether a revision (database row) was reverted (identity) and returns
    a named tuple of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        db : :class:`mw.database.DB`
            A database connection to make use of.
        rev_row : dict
            a revision row containing 'rev_id' and 'rev_page' or 'page_id'
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        check_archive : bool
            should the archive table be checked for reverting revisions?
        before : `Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
    """
    # Pull the revision ID out of the row.
    if 'rev_id' not in rev_row:
        raise TypeError("rev_row must have 'rev_id'")
    rev_id = rev_row['rev_id']

    # The page ID may appear under either of two keys.
    if 'page_id' in rev_row:
        page_id = rev_row['page_id']
    elif 'rev_page' in rev_row:
        page_id = rev_row['rev_page']
    else:
        raise TypeError("rev_row must have 'page_id' or 'rev_page'")

    # Delegate to the regular ID-based check.
    return check(db, rev_id, page_id=page_id, **kwargs)
def check(db, rev_id, page_id=None, radius=defaults.RADIUS, check_archive=False,
          before=None, window=None):
    """
    Checks whether a revision was reverted (identity) and returns a named tuple
    of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        db : `mw.database.DB`
            A database connection to make use of.
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        check_archive : bool
            should the archive table be checked for reverting revisions?
        before : `Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
        window : int
            if set, limits the search for *reverting* revisions to those which
            were saved within `window` seconds after the reverted edit

    :Returns:
        A :class:`Revert` if one is detected for `rev_id`, otherwise `None`.
    """
    # Bug fix: the original condition was
    #   `not hasattr(db, "revisions") and hasattr(db, "all_revisions")`
    # which, by precedence, only raised when `revisions` was missing but
    # `all_revisions` was present.  The intent is to require both.
    if not (hasattr(db, "revisions") and hasattr(db, "all_revisions")):
        raise TypeError("db wrong type. Expected a mw.database.DB.")

    rev_id = int(rev_id)
    radius = int(radius)
    if radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")
    page_id = none_or(page_id, int)
    check_archive = bool(check_archive)
    before = none_or(before, Timestamp)

    # If we are searching the archive, we'll need to use `all_revisions`.
    if check_archive:
        dbrevs = db.all_revisions
    else:
        dbrevs = db.revisions

    # If we don't have the page_id, we're going to need to look it up
    if page_id is None:
        row = dbrevs.get(rev_id=rev_id)
        page_id = row['rev_page']

    # Load history and current rev
    current_and_past_revs = list(dbrevs.query(
        page_id=page_id,
        limit=radius + 1,
        before_id=rev_id + 1,  # Ensures that we capture the current revision
        direction="older"
    ))

    try:
        # Extract current rev and reorder history
        current_rev, past_revs = (
            current_and_past_revs[0],  # Current rev is the first one returned
            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
        )
    except IndexError:
        # Only way to get here is if there isn't enough history. Couldn't be
        # reverted. Just return None.
        return None

    # Translate a seconds `window` into an absolute `before` timestamp when
    # the caller didn't supply one.
    if window is not None and before is None:
        before = Timestamp(current_rev['rev_timestamp']) + window

    # Load future revisions
    future_revs = dbrevs.query(
        page_id=page_id,
        limit=radius,
        after_id=rev_id,
        before=before,
        direction="newer"
    )

    # Convert to an iterable of (checksum, rev) pairs for detect() to
    # consume.  Rows with a NULL sha1 get a DummyChecksum, which never
    # matches anything but itself.
    checksum_revisions = chain(
        ((rev['rev_sha1'] if rev['rev_sha1'] is not None
          else DummyChecksum(), rev)
         for rev in past_revs),
        [(current_rev['rev_sha1'] or DummyChecksum(), current_rev)],
        ((rev['rev_sha1'] if rev['rev_sha1'] is not None
          else DummyChecksum(), rev)
         for rev in future_revs)
    )

    for revert in detect(checksum_revisions, radius=radius):
        # Check that this is a relevant revert -- i.e. rev_id itself was
        # among the reverted revisions.
        if rev_id in [rev['rev_id'] for rev in revert.reverteds]:
            return revert

    return None

View File

@@ -0,0 +1,24 @@
RADIUS = 15
"""
TODO: Better documentation here. For the time being, see:
Priedhorsky, R., Chen, J., Lam, S. T. K., Panciera, K., Terveen, L., &
Riedl, J. (2007, November). Creating, destroying, and restoring value in
Wikipedia. In Proceedings of the 2007 international ACM conference on
Supporting group work (pp. 259-268). ACM.
"""
class DUMMY_SHA1: pass
"""
Used in when checking for reverts when the checksum of the revision of interest
is unknown.
>>> DUMMY_SHA1 in {"aaa", "bbb"} # or any 40 character hex
False
>>>
>>> DUMMY_SHA1 == DUMMY_SHA1
True
>>> {DUMMY_SHA1, DUMMY_SHA1}
{<class '__main__.DUMMY_SHA1'>}
"""

View File

@@ -0,0 +1,83 @@
from collections import namedtuple
from ...util import ordered
from . import defaults
Revert = namedtuple("Revert", ['reverting', 'reverteds', 'reverted_to'])
"""
Represents a revert event. This class behaves like
:class:`collections.namedtuple`. Note that the datatypes of `reverting`,
`reverteds` and `reverted_to` is not specified since those types will depend
on the revision data provided during revert detection.
:Members:
**reverting**
The reverting revision data : `mixed`
**reverteds**
The reverted revision data (ordered chronologically) : list( `mixed` )
**reverted_to**
The reverted-to revision data : `mixed`
"""
class Detector(ordered.HistoricalMap):
    """
    Detects revert events in a stream of revisions (to the same page) based on
    matching checksums.  To detect reverts, construct an instance of this class
    and call :meth:`process` in chronological order (``direction == "newer"``).

    See `<https://meta.wikimedia.org/wiki/R:Identity_revert>`_

    :Parameters:
        radius : int
            a positive integer indicating the maximum revision distance that a revert can span.

    :Example:
        >>> from mw.lib import reverts
        >>> detector = reverts.Detector()
        >>>
        >>> detector.process("aaa", {'rev_id': 1})
        >>> detector.process("bbb", {'rev_id': 2})
        >>> detector.process("aaa", {'rev_id': 3})
        Revert(reverting={'rev_id': 3}, reverteds=[{'rev_id': 2}], reverted_to={'rev_id': 1})
        >>> detector.process("ccc", {'rev_id': 4})
    """

    def __init__(self, radius=defaults.RADIUS):
        """
        :Parameters:
            radius : int
                a positive integer indicating the maximum revision distance that a revert can span.
        """
        if radius < 1:
            raise TypeError("invalid radius. Expected a positive integer.")
        super().__init__(maxlen=radius + 1)

    def process(self, checksum, revision=None):
        """
        Process a new revision and detect a revert if it occurred.  Note that
        you can pass whatever you like as `revision` and it will be returned
        in the case that a revert occurs.

        :Parameters:
            checksum : str
                Any identity-matchable string-based hash of revision content
            revision : `mixed`
                Revision meta data.  Note that any data will just be returned
                in the case of a revert.

        :Returns:
            a :class:`~mw.lib.reverts.Revert` if one occured or `None`
        """
        if checksum not in self:
            # Brand new content -- can't possibly be a revert.
            self.insert(checksum, revision)
            return None

        # Potential revert: gather the revisions saved between the previous
        # occurrence of this checksum and now.
        intervening = list(self.up_to(checksum))

        detected = None
        if intervening:  # With nothing in between, this is just a noop edit.
            detected = Revert(revision, intervening, self[checksum])

        self.insert(checksum, revision)
        return detected

View File

@@ -0,0 +1,24 @@
class DummyChecksum():
    """
    Used when checking for reverts when the checksum of the revision of
    interest is unknown.  DummyChecksums won't match each other or anything
    else, but they will match themselves and they are hashable.

    >>> dummy1 = DummyChecksum()
    >>> dummy1
    <#140687347334280>
    >>> dummy1 == dummy1
    True
    >>>
    >>> dummy2 = DummyChecksum()
    >>> dummy2
    <#140687347334504>
    >>> dummy1 == dummy2
    False
    >>>
    >>> {"foo", "bar", dummy1, dummy1, dummy2}
    {<#140687347334280>, 'foo', <#140687347334504>, 'bar'}
    """

    def __str__(self):
        # Bug fix: the original lacked `return`, so str(dummy) returned
        # None and raised "TypeError: __str__ returned non-string".
        return repr(self)

    def __repr__(self):
        return "<#" + str(id(self)) + ">"

View File

@@ -0,0 +1,46 @@
from .detector import Detector
from . import defaults
def detect(checksum_revisions, radius=defaults.RADIUS):
    """
    Detects reverts that occur in a sequence of revisions.  Note that
    `revision` data meta will simply be returned in the case of a revert.

    This function serves as a convenience wrapper around calls to
    :class:`Detector`'s :meth:`~Detector.process` method.

    :Parameters:
        checksum_revisions : iter( ( checksum : str, revision : `mixed` ) )
            an iterable over tuples of checksum and revision meta data
        radius : int
            a positive integer indicating the maximum revision distance that a revert can span.

    :Return:
        a iterator over :class:`Revert`

    :Example:
        >>> from mw.lib import reverts
        >>>
        >>> checksum_revisions = [
        ...     ("aaa", {'rev_id': 1}),
        ...     ("bbb", {'rev_id': 2}),
        ...     ("aaa", {'rev_id': 3}),
        ...     ("ccc", {'rev_id': 4})
        ... ]
        >>>
        >>> list(reverts.detect(checksum_revisions))
        [Revert(reverting={'rev_id': 3}, reverteds=[{'rev_id': 2}], reverted_to={'rev_id': 1})]
    """
    detector = Detector(radius)

    # Feed each (checksum, revision) pair through the detector, emitting
    # any revert it reports.
    for checksum, revision in checksum_revisions:
        detected = detector.process(checksum, revision)
        if detected is not None:
            yield detected

# For backwards compatibility
reverts = detect

View File

@@ -0,0 +1,33 @@
from nose.tools import eq_
from ..detector import Detector
def test_detector():
    detector = Detector(2)

    eq_(detector.process("a", {'id': 1}), None)

    # Re-saving identical content is a noop, not a revert.
    eq_(detector.process("a", {'id': 2}), None)

    # Short revert
    eq_(detector.process("b", {'id': 3}), None)
    expected_short = ({'id': 4}, [{'id': 3}], {'id': 2})
    eq_(detector.process("a", {'id': 4}), expected_short)

    # Medium revert
    eq_(detector.process("c", {'id': 5}), None)
    eq_(detector.process("d", {'id': 6}), None)
    expected_medium = ({'id': 7}, [{'id': 6}, {'id': 5}], {'id': 4})
    eq_(detector.process("a", {'id': 7}), expected_medium)

    # Long (undetected) revert -- spans more than the radius.
    for checksum, rev_id in (("e", 8), ("f", 9), ("g", 10), ("a", 11)):
        eq_(detector.process(checksum, {'id': rev_id}), None)

View File

@@ -0,0 +1,23 @@
from nose.tools import eq_
from ..functions import reverts
def test_reverts():
    checksum_revisions = [
        ("a", {'id': 1}),
        ("b", {'id': 2}),
        ("c", {'id': 3}),
        ("a", {'id': 4}),
        ("d", {'id': 5}),
        ("b", {'id': 6}),
        ("a", {'id': 7})
    ]

    remaining = [
        ({'id': 4}, [{'id': 3}, {'id': 2}], {'id': 1}),
        ({'id': 7}, [{'id': 6}, {'id': 5}], {'id': 4})
    ]

    # Each detected revert should match the next anticipated one, in order.
    for detected in reverts(checksum_revisions, radius=2):
        eq_(detected, remaining.pop(0))

View File

@@ -0,0 +1,4 @@
from .functions import cluster, sessions
from .event import Event
from .cache import Cache, Session
from . import defaults

View File

@@ -0,0 +1,121 @@
import logging
from collections import namedtuple
from ...util import Heap
from ...types import Timestamp
from . import defaults
from .event import Event, unpack_events
logger = logging.getLogger("mw.lib.sessions.cache")
Session = namedtuple("Session", ["user", "events"])
"""
Represents a user session (a cluster over events for a user). This class
behaves like :class:`collections.namedtuple`. Note that the datatypes of
`events`, is not specified since those types will depend on the revision data
provided during revert detection.
:Members:
**user**
A hashable user identifier : `hashable`
**events**
A list of event data : list( `mixed` )
"""
class Cache:
    """
    A cache of recent user sessions.  Since sessions expire once activities
    stop for at least `cutoff` seconds, this class manages a cache of
    *active* sessions.

    :Parameters:
        cutoff : int
            Maximum amount of time in seconds between session events

    :Example:
        >>> from mw.lib import sessions
        >>>
        >>> cache = sessions.Cache(cutoff=3600)
        >>>
        >>> list(cache.process("Willy on wheels", 100000, {'rev_id': 1}))
        []
        >>> list(cache.process("Walter", 100001, {'rev_id': 2}))
        []
        >>> list(cache.process("Willy on wheels", 100001, {'rev_id': 3}))
        []
        >>> list(cache.process("Walter", 100035, {'rev_id': 4}))
        []
        >>> list(cache.process("Willy on wheels", 103602, {'rev_id': 5}))
        [Session(user='Willy on wheels', events=[{'rev_id': 1}, {'rev_id': 3}])]
        >>> list(cache.get_active_sessions())
        [Session(user='Walter', events=[{'rev_id': 2}, {'rev_id': 4}]), Session(user='Willy on wheels', events=[{'rev_id': 5}])]
    """

    def __init__(self, cutoff=defaults.CUTOFF):
        self.cutoff = int(cutoff)

        # Heap of (timestamp, events) pairs ordered by the timestamp under
        # which the session was last pushed.
        self.recently_active = Heap()
        # Maps user -> the mutable event list of that user's active session.
        self.active_users = {}

    def process(self, user, timestamp, data=None):
        """
        Processes a user event.

        :Parameters:
            user : `hashable`
                A hashable value to identify a user (`int` or `str` are OK)
            timestamp : :class:`mw.Timestamp`
                The timestamp of the event
            data : `mixed`
                Event meta data

        :Returns:
            A generator of :class:`~mw.lib.sessions.Session` expired after
            processing the user event.
        """
        event = Event(user, Timestamp(timestamp), data)

        # Emit any sessions that this event's timestamp expires.  (Fix: the
        # loop variable no longer shadows the `user` parameter.)
        for expired_user, expired_events in self._clear_expired(event.timestamp):
            yield Session(expired_user, unpack_events(expired_events))

        # Apply revision: append to the user's active session, creating and
        # registering a new event list if the user has none.
        if event.user in self.active_users:
            events = self.active_users[event.user]
        else:
            events = []
            self.active_users[event.user] = events
            self.recently_active.push((event.timestamp, events))

        events.append(event)

    def get_active_sessions(self):
        """
        Retrieves the active, unexpired sessions.

        :Returns:
            A generator of :class:`~mw.lib.sessions.Session`
        """
        for last_timestamp, events in self.recently_active:
            yield Session(events[-1].user, unpack_events(events))

    def _clear_expired(self, timestamp):
        # Cull old sessions: pop entries whose tracked timestamp is stale;
        # sessions with a newer last event get re-pushed instead of expired.
        while (len(self.recently_active) > 0 and
               timestamp - self.recently_active.peek()[0] >= self.cutoff):

            _, events = self.recently_active.pop()

            if timestamp - events[-1].timestamp >= self.cutoff:
                del self.active_users[events[-1].user]
                yield events[-1].user, events
            else:
                self.recently_active.push((events[-1].timestamp, events))

    def __repr__(self):
        # Bug fix: the original called .format() on "%s(%s)" -- a %-style
        # template with no {} placeholders -- so repr() returned the
        # literal string "%s(%s)".
        return "{0}({1})".format(self.__class__.__name__, repr(self.cutoff))

View File

@@ -0,0 +1,6 @@
# Default session cutoff: one hour, expressed in seconds.
CUTOFF = 60 * 60
"""
TODO: Better documentation here.
For the time being, see
`<https://meta.wikimedia.org/wiki/Research:Edit_session>`_
"""

View File

@@ -0,0 +1,19 @@
import logging
from collections import namedtuple
logger = logging.getLogger("mw.lib.sessions.event")

# A single user action: `user` identifies the actor, `timestamp` orders
# events, and `data` carries arbitrary caller-supplied meta data that is
# handed back when sessions are emitted.  (The commented-out class version
# of this type was removed as dead code.)
Event = namedtuple("Event", ['user', 'timestamp', 'data'])


def unpack_events(events):
    """Extracts the caller-supplied `data` payload from each event."""
    return [event.data for event in events]

View File

@@ -0,0 +1,68 @@
import logging
from .cache import Cache
from . import defaults
logger = logging.getLogger("mw.lib.sessions.functions")
def cluster(user_events, cutoff=defaults.CUTOFF):
    """
    Clusters a sequence of user events into sessions.  Each event's data
    payload is carried through unchanged and returned inside its session.

    This function is a convenience wrapper around
    :class:`~mw.lib.sessions.Cache`'s :meth:`~mw.lib.sessions.Cache.process`
    method.

    :Parameters:
        user_events : iter( (user, timestamp, event) )
            an iterable over tuples of user, timestamp and event data.

            * user : `hashable`
            * timestamp : :class:`mw.Timestamp`
            * event : `mixed`

        cutoff : int
            the maximum time between events within a user session

    :Returns:
        a iterator over :class:`~mw.lib.sessions.Session`

    :Example:
        >>> from mw.lib import sessions
        >>>
        >>> user_events = [
        ...     ("Willy on wheels", 100000, {'rev_id': 1}),
        ...     ("Walter", 100001, {'rev_id': 2}),
        ...     ("Willy on wheels", 100001, {'rev_id': 3}),
        ...     ("Walter", 100035, {'rev_id': 4}),
        ...     ("Willy on wheels", 103602, {'rev_id': 5})
        ... ]
        >>>
        >>> for user, events in sessions.cluster(user_events):
        ...     (user, events)
        ...
        ('Willy on wheels', [{'rev_id': 1}, {'rev_id': 3}])
        ('Walter', [{'rev_id': 2}, {'rev_id': 4}])
        ('Willy on wheels', [{'rev_id': 5}])
    """
    session_cache = Cache(cutoff)

    # Feed the events through the cache; sessions are emitted as soon as
    # they expire.
    for user, timestamp, data in user_events:
        for expired_session in session_cache.process(user, timestamp, data):
            yield expired_session

    # Flush the sessions that never expired.
    for open_session in session_cache.get_active_sessions():
        yield open_session

# For backwards compatibility
sessions = cluster

View File

@@ -0,0 +1,22 @@
from nose.tools import eq_
from ..cache import Cache
def test_session_manager():
    # Sessions expire after `cutoff` seconds without activity.
    cache = Cache(cutoff=2)

    # While events stay within the cutoff, nothing expires.
    eq_(list(cache.process("foo", 1)), [])
    eq_(list(cache.process("bar", 2)), [])
    eq_(list(cache.process("foo", 2)), [])

    # By t=10 both earlier sessions are stale, so two sessions come out.
    expired = list(cache.process("bar", 10))
    eq_(len(expired), 2)

    # Only bar's fresh session remains active.
    eq_(len(list(cache.get_active_sessions())), 1)

View File

@@ -0,0 +1,50 @@
from itertools import chain
from nose.tools import eq_
from .. import defaults
from ..functions import sessions
# Fixture events for two users.  Each user has two groups: events within a
# group are seconds apart, while the second group begins a full CUTOFF
# after the first, so each group should cluster into its own session.
# Tuples are (user, timestamp, event_data).
EVENTS = {
    "foo": [
        [
            ("foo", 1234567890, 1),
            ("foo", 1234567892, 2),
            ("foo", 1234567894, 3)
        ],
        [
            ("foo", 1234567894 + defaults.CUTOFF, 4),
            ("foo", 1234567897 + defaults.CUTOFF, 5)
        ]
    ],
    "bar": [
        [
            ("bar", 1234567891, 6),
            ("bar", 1234567892, 7),
            ("bar", 1234567893, 8)
        ],
        [
            ("bar", 1234567895 + defaults.CUTOFF, 9),
            ("bar", 1234567898 + defaults.CUTOFF, 0)
        ]
    ]
}
def test_group_events():
    # Interleave both users' events into a single time-ordered stream.
    combined = list(chain(*EVENTS['foo'])) + list(chain(*EVENTS['bar']))
    combined.sort()

    # Each emitted session must match the next expected group for its user.
    counts = {'foo': 0, 'bar': 0}
    for user, session in sessions(combined):
        expected = [e[2] for e in EVENTS[user][counts[user]]]
        eq_(expected, list(session))
        counts[user] += 1

View File

@@ -0,0 +1,2 @@
from .functions import normalize
from .parser import Parser

View File

@@ -0,0 +1,25 @@
def normalize(title):
    """
    Normalizes a page title to the database format: spaces are converted
    to underscores and the first character of the title is upper-cased.

    :Parameters:
        title : str
            A page title

    :Returns:
        The normalized title.

    :Example:
        >>> from mw.lib import title
        >>>
        >>> title.normalize("foo bar")
        'Foo_bar'
    """
    # Guard clauses: None passes through untouched, "" maps to "".
    if title is None:
        return None
    if not title:
        return ""
    return (title[0].upper() + title[1:]).replace(" ", "_")

View File

@@ -0,0 +1,171 @@
from ...types import Namespace
from ...util import autovivifying, none_or
from .functions import normalize
class Parser:
    """
    Constructs a page name parser from a set of :class:`mw.Namespace`. Such a
    parser can be used to convert a full page name (namespace included with a
    colon; e.g, ``"Talk:Foo"``) into a namespace ID and
    :func:`mw.lib.title.normalize`'d page title (e.g., ``(1, "Foo")``).

    :Parameters:
        namespaces : set( :class:`mw.Namespace` )

    :Example:
        >>> from mw import Namespace
        >>> from mw.lib import title
        >>>
        >>> parser = title.Parser(
        ...     [
        ...         Namespace(0, "", case="first-letter"),
        ...         Namespace(1, "Discuss\u00e3o", canonical="Talk", case="first-letter"),
        ...         Namespace(2, "Usu\u00e1rio(a)", canonical="User", aliases={"U"}, case="first-letter")
        ...     ]
        ... )
        >>>
        >>> parser.parse("Discuss\u00e3o:Foo") # Using the standard name
        (1, 'Foo')
        >>> parser.parse("Talk:Foo bar") # Using the canonical name
        (1, 'Foo_bar')
        >>> parser.parse("U:Foo bar") # Using an alias
        (2, 'Foo_bar')
        >>> parser.parse("Herpderp:Foo bar") # Pseudo namespace
        (0, 'Herpderp:Foo_bar')
    """

    def __init__(self, namespaces=None):
        namespaces = none_or(namespaces, set)

        # Lookup tables: namespace ID -> Namespace, and normalized name
        # (standard, canonical or alias) -> Namespace.
        self.ids = {}
        self.names = {}

        if namespaces is not None:
            for namespace in namespaces:
                self.add_namespace(namespace)

    def parse(self, page_name):
        """
        Parses a page name to extract the namespace.

        :Parameters:
            page_name : str
                A page name including the namespace prefix and a colon (if not Main)

        :Returns:
            A tuple of (namespace : `int`, title : `str`)
        """
        # Split on the first colon only; titles may contain further colons.
        parts = page_name.split(":", 1)
        if len(parts) == 1:
            # No colon at all: Main namespace (ID 0).
            ns_id = 0
            title = normalize(page_name)
        else:
            ns_name, title = parts
            ns_name, title = normalize(ns_name), normalize(title)

            if self.contains_name(ns_name):
                ns_id = self.get_namespace(name=ns_name).id
            else:
                # Unrecognized prefix (pseudo-namespace): treat the whole
                # page name, colon included, as a Main-namespace title.
                ns_id = 0
                title = normalize(page_name)

        return ns_id, title

    def add_namespace(self, namespace):
        """
        Adds a namespace to the parser.

        :Parameters:
            namespace : :class:`mw.Namespace`
                A namespace
        """
        self.ids[namespace.id] = namespace
        self.names[namespace.name] = namespace

        # Aliases and the canonical name all resolve to the same namespace.
        for alias in namespace.aliases:
            self.names[alias] = namespace

        if namespace.canonical is not None:
            self.names[namespace.canonical] = namespace

    def contains_name(self, name):
        # Names are stored normalized, so normalize before the lookup.
        return normalize(name) in self.names

    def get_namespace(self, id=None, name=None):
        """
        Gets a namespace from the parser.  Throws a :class:`KeyError` if a
        namespace cannot be found.

        :Parameters:
            id : int
                A namespace ID
            name : str
                A namespace name (standard, canonical names and aliases
                will be searched)

        :Returns:
            A :class:`mw.Namespace`.
        """
        # `id` takes precedence when both are given.
        if id is not None:
            return self.ids[int(id)]
        else:
            return self.names[normalize(name)]

    @classmethod
    def from_site_info(cls, si_doc):
        """
        Constructs a parser from the result of a :meth:`mw.api.SiteInfo.query`.

        :Parameters:
            si_doc : dict
                The result of a site_info request.

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        # Maps namespace ID -> list of alias strings; missing keys vivify
        # to an empty list.
        aliases = autovivifying.Dict(vivifier=lambda k: [])
        # get aliases
        if 'namespacealiases' in si_doc:
            for alias_doc in si_doc['namespacealiases']:
                aliases[alias_doc['id']].append(alias_doc['*'])

        namespaces = []
        for ns_doc in si_doc['namespaces'].values():
            namespaces.append(
                Namespace.from_doc(ns_doc, aliases)
            )

        return Parser(namespaces)

    @classmethod
    def from_api(cls, session):
        """
        Constructs a parser from a :class:`mw.api.Session`

        :Parameters:
            session : :class:`mw.api.Session`
                An open API session

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        si_doc = session.site_info.query(
            properties={'namespaces', 'namespacealiases'}
        )

        return cls.from_site_info(si_doc)

    @classmethod
    def from_dump(cls, dump):
        """
        Constructs a parser from a :class:`mw.xml_dump.Iterator`.  Note that
        XML database dumps do not include namespace aliases or canonical names
        so the parser that will be constructed will only work in common cases.

        :Parameters:
            dump : :class:`mw.xml_dump.Iterator`
                An XML dump iterator

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        return cls(dump.namespaces)

View File

@@ -0,0 +1,10 @@
from nose.tools import eq_
from ..functions import normalize
def test_normalize():
    # (input, expected) pairs exercising each normalization rule.
    cases = [
        ("Foobar", "Foobar"),    # already normal
        ("foobar", "Foobar"),    # first letter capitalized
        ("fooBar", "FooBar"),    # later capitals preserved
        ("Foo bar", "Foo_bar"),  # spaces become underscores
    ]
    for raw, expected in cases:
        eq_(expected, normalize(raw))

View File

@@ -0,0 +1,58 @@
from nose.tools import eq_
from ....types import Namespace
from ..parser import Parser
def test_simple():
    # A parser covering Main plus Portuguese Talk/User namespaces.
    parser = Parser([
        Namespace(0, "", case="first-letter"),
        Namespace(1, "Discuss\u00e3o", canonical="Talk", case="first-letter"),
        Namespace(2, "Usu\u00e1rio(a)", canonical="User", case="first-letter")
    ])

    eq_((1, "Foo"), parser.parse("Discuss\u00e3o:Foo"))
    eq_((1, "Foo_bar"), parser.parse("Discuss\u00e3o:Foo bar"))
    # Unknown prefixes fall through to the Main namespace.
    eq_((0, "Herpderp:Foo_bar"), parser.parse("Herpderp:Foo bar"))
def test_from_site_info():
parser = Parser.from_site_info(
{
"namespaces": {
"0": {
"id": 0,
"case": "first-letter",
"*": "",
"content": ""
},
"1": {
"id": 1,
"case": "first-letter",
"*": "Discuss\u00e3o",
"subpages": "",
"canonical": "Talk"
},
"2": {
"id": 2,
"case": "first-letter",
"*": "Usu\u00e1rio(a)",
"subpages": "",
"canonical": "User"
}
},
"namespacealiases": [
{
"id": 1,
"*": "WAFFLES"
}
]
}
)
eq_((1, "Foo"), parser.parse("Discuss\u00e3o:Foo"))
eq_((1, "Foo_bar"), parser.parse("Discuss\u00e3o:Foo bar"))
eq_((0, "Herpderp:Foo_bar"), parser.parse("Herpderp:Foo bar"))
eq_((1, "Foo_bar"), parser.parse("WAFFLES:Foo bar"))