1
0

Initial commit

p#	new file:   runwikiq.sh
This commit is contained in:
2018-06-02 15:32:19 -07:00
commit 72633c193b
202 changed files with 21929 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
"""
A package with utilities for managing the persistent word analysis across text
versions of a document. `PersistenceState` is the highest level of the
interface and the part of the system that's most interesting externally. `Word`s
are also very important. The current implementation of `Word` only accounts
for the number of revisions in which a Word is visible. If persistent word
views (or something similar) is intended to be kept, refactoring will be
necessary.
"""
from .state import State
from .tokens import Tokens, Token
from . import defaults
from . import api

View File

@@ -0,0 +1,85 @@
from .. import reverts
from ...util import none_or
from .state import State
def track(session, rev_id, page_id=None, revert_radius=reverts.defaults.RADIUS,
          future_revisions=reverts.defaults.RADIUS, properties=None):
    """
    Computes a persistence score for a revision by processing the revisions
    that took place around it.

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        revert_radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        future_revisions : int
            the number of revisions after `rev_id` to process persistence against
        properties : set( str )
            additional revision properties to request from the API

    :Returns:
        A (current_rev, tokens_added, future_revs) triple, or `None` if the
        revision has no available history.
    """
    if not hasattr(session, "revisions"):
        raise TypeError("session is wrong type. Expected a mw.api.Session.")

    rev_id = int(rev_id)
    page_id = none_or(page_id, int)
    revert_radius = int(revert_radius)
    if revert_radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")
    properties = set(properties) if properties is not None else set()

    # If we don't have the page_id, we're going to need to look them up
    if page_id is None:
        rev = session.revisions.get(rev_id, properties={'ids'})
        page_id = rev['page']['pageid']

    # Load history and current rev
    current_and_past_revs = list(session.revisions.query(
        pageids={page_id},
        limit=revert_radius + 1,
        start_id=rev_id,
        direction="older",
        properties={'ids', 'timestamp', 'content', 'sha1'} | properties
    ))

    try:
        # Extract current rev and reorder history
        current_rev, past_revs = (
            current_and_past_revs[0],  # Current rev is the first one returned
            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
        )
    except IndexError:
        # Only way to get here is if there isn't enough history. Couldn't be
        # reverted. Just return None.
        return None

    # Load future revisions
    future_revs = session.revisions.query(
        pageids={page_id},
        limit=future_revisions,
        start_id=rev_id + 1,  # Ensures that we skip the current revision
        direction="newer",
        properties={'ids', 'timestamp', 'content', 'sha1'} | properties
    )

    state = State(revert_radius=revert_radius)

    # Process old revisions
    for rev in past_revs:
        state.process(rev.get('*', ""), rev, rev.get('sha1'))

    # Process current revision.  Bug fix: use a "" default like the other
    # calls -- a hidden/missing content field previously passed None into
    # the tokenizer.
    _, tokens_added, _ = state.process(current_rev.get('*', ""), current_rev,
                                       current_rev.get('sha1'))

    # Process new revisions
    future_revs = list(future_revs)
    for rev in future_revs:
        state.process(rev.get('*', ""), rev, rev.get('sha1'))

    return current_rev, tokens_added, future_revs

score = track

View File

@@ -0,0 +1,11 @@
from . import tokenization, difference
# Default tokenizing function, used when a State is built without an
# explicit `tokenize` argument.
TOKENIZE = tokenization.wikitext_split
"""
The standard tokenizing function.
"""

# Default diff function (difflib-based), used when a State is built without
# an explicit `diff` argument.
DIFF = difference.sequence_matcher
"""
The standard diff function
"""

View File

@@ -0,0 +1,49 @@
from difflib import SequenceMatcher
def sequence_matcher(old, new):
    """
    Generates a sequence of operations using :class:`difflib.SequenceMatcher`.

    :Parameters:
        old : list( `hashable` )
            Old tokens
        new : list( `hashable` )
            New tokens

    Returns:
        Minimal operations needed to convert `old` to `new`
    """
    old_tokens = list(old)
    new_tokens = list(new)
    return SequenceMatcher(None, old_tokens, new_tokens).get_opcodes()
def apply(ops, old, new):
    """
    Applies operations (delta) to copy items from `old` to `new`.

    :Parameters:
        ops : list((op, a1, a2, b1, b2))
            Operations to perform
        old : list( `hashable` )
            Old tokens
        new : list( `hashable` )
            New tokens

    :Returns:
        An iterator over elements matching `new` but copied from `old`
    """
    for code, a_start, a_end, b_start, b_end in ops:
        if code in ("insert", "replace"):
            # Both cases emit the new-side tokens (the original duplicated
            # this loop in two identical branches).
            yield from new[b_start:b_end]
        elif code == "equal":
            # Unchanged content is copied from the old side.
            yield from old[a_start:a_end]
        elif code == "delete":
            # Deleted content contributes nothing to the output.
            pass
        else:
            assert False, \
                "encountered an unrecognized operation code: " + repr(code)

View File

@@ -0,0 +1,149 @@
from hashlib import sha1
from . import defaults
from .. import reverts
from .tokens import Token, Tokens
class Version:
    """
    Lightweight holder for the token state of one processed revision; used
    as the value handed to the revert detector.
    """
    # Fix: __slots__ should be a tuple.  The original bare string
    # ('tokens') only worked because a single str is accepted as one slot
    # name; adding a second slot would have silently broken it.
    __slots__ = ('tokens',)

    def __init__(self):
        # None until the revision's tokens are computed (or copied from a
        # reverted-to version).
        self.tokens = None
class State:
    """
    Represents the state of word persistence in a page.
    See `<https://meta.wikimedia.org/wiki/Research:Content_persistence>`_

    :Parameters:
        tokenize : function( `str` ) --> list( `str` )
            A tokenizing function
        diff : function(list( `str` ), list( `str` )) --> list( `ops` )
            A function to perform a difference between token lists
        revert_radius : int
            a positive integer indicating the maximum revision distance that a revert can span.
        revert_detector : :class:`mw.lib.reverts.Detector`
            a revert detector to start process with

    :Example:
        >>> from pprint import pprint
        >>> from mw.lib import persistence
        >>>
        >>> state = persistence.State()
        >>>
        >>> pprint(state.process("Apples are red.", revision=1))
        ([Token(text='Apples', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='are', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='red', revisions=[1]),
          Token(text='.', revisions=[1])],
         [Token(text='Apples', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='are', revisions=[1]),
          Token(text=' ', revisions=[1]),
          Token(text='red', revisions=[1]),
          Token(text='.', revisions=[1])],
         [])
        >>> pprint(state.process("Apples are blue.", revision=2))
        ([Token(text='Apples', revisions=[1, 2]),
          Token(text=' ', revisions=[1, 2]),
          Token(text='are', revisions=[1, 2]),
          Token(text=' ', revisions=[1, 2]),
          Token(text='blue', revisions=[2]),
          Token(text='.', revisions=[1, 2])],
         [Token(text='blue', revisions=[2])],
         [Token(text='red', revisions=[1])])
        >>> pprint(state.process("Apples are red.", revision=3))  # A revert!
        ([Token(text='Apples', revisions=[1, 2, 3]),
          Token(text=' ', revisions=[1, 2, 3]),
          Token(text='are', revisions=[1, 2, 3]),
          Token(text=' ', revisions=[1, 2, 3]),
          Token(text='red', revisions=[1, 3]),
          Token(text='.', revisions=[1, 2, 3])],
         [],
         [])
    """

    def __init__(self, tokenize=defaults.TOKENIZE, diff=defaults.DIFF,
                 revert_radius=reverts.defaults.RADIUS,
                 revert_detector=None):
        self.tokenize = tokenize
        self.diff = diff

        # Either pass a detector or the revert radius so I can make one
        if revert_detector is None:
            self.revert_detector = reverts.Detector(int(revert_radius))
        else:
            self.revert_detector = revert_detector

        # Stores the last tokens (a Version); None until the first call to
        # process().
        self.last = None

    def process(self, text, revision=None, checksum=None):
        """
        Modifies the internal state based a change to the content and returns
        the sets of words added and removed.

        :Parameters:
            text : str
                The text content of a revision
            revision : `mixed`
                Revision meta data
            checksum : str
                A checksum hash of the text content (will be generated if not provided)

        :Returns:
            Three :class:`~mw.lib.persistence.Tokens` lists

            current_tokens : :class:`~mw.lib.persistence.Tokens`
                A sequence of :class:`~mw.lib.persistence.Token` for the
                processed revision
            tokens_added : :class:`~mw.lib.persistence.Tokens`
                A set of tokens that were inserted by the processed revision
            tokens_removed : :class:`~mw.lib.persistence.Tokens`
                A sequence of :class:`~mw.lib.persistence.Token` removed by the
                processed revision
        """
        # Derive the checksum from the text when the caller didn't supply
        # one; the detector matches revisions purely by checksum.
        if checksum is None:
            checksum = sha1(bytes(text, 'utf8')).hexdigest()

        version = Version()

        revert = self.revert_detector.process(checksum, version)
        if revert is not None:  # Revert
            # Empty words.  A revert adds/removes nothing new; it restores
            # the reverted-to version's tokens (shared, so persistence
            # counts keep accumulating on the same Token objects).
            tokens_added = Tokens()
            tokens_removed = Tokens()

            # Extract reverted_to revision
            _, _, reverted_to = revert
            version.tokens = reverted_to.tokens
        else:
            if self.last is None:  # First version of the page!
                # Everything in the first revision counts as "added".
                version.tokens = Tokens(Token(t) for t in self.tokenize(text))
                tokens_added = version.tokens
                tokens_removed = Tokens()
            else:
                # NOTICE: HEAVY COMPUTATION HERE!!!
                #
                # OK.  It's not that heavy.  It's just performing a diff,
                # but you're still going to spend most of your time here.
                # Diffs usually run in O(n^2) -- O(n^3) time and most
                # tokenizers produce a lot of tokens.
                version.tokens, tokens_added, tokens_removed = \
                    self.last.tokens.compare(self.tokenize(text), self.diff)

        # Record this revision against every surviving token.
        version.tokens.persist(revision)

        self.last = version

        return version.tokens, tokens_added, tokens_removed

View File

@@ -0,0 +1,12 @@
from nose.tools import eq_
from .. import difference
def test_sequence_matcher():
    source = "foobar derp hepl derpl"
    target = "fooasldal 3 hepl asl a derpl"

    # Applying the computed ops to `source` must reproduce `target` exactly.
    operations = difference.sequence_matcher(source, target)
    rebuilt = "".join(difference.apply(operations, source, target))
    eq_(rebuilt, target)

View File

@@ -0,0 +1,25 @@
from nose.tools import eq_
from ..state import State
def test_state():
    contents_revisions = [
        ("Apples are red.", 0),
        ("Apples are blue.", 1),
        ("Apples are red.", 2),
        ("Apples are tasty and red.", 3),
        ("Apples are tasty and blue.", 4)
    ]

    state = State()
    token_sets = [state.process(content, revision)
                  for content, revision in contents_revisions]

    # Every processed revision must round-trip back to its original text.
    for token_set, (content, _) in zip(token_sets, contents_revisions):
        eq_("".join(token_set[0].texts()), content)

    first_tokens = token_sets[0][0]
    eq_(first_tokens[0].text, "Apples")
    eq_(len(first_tokens[0].revisions), 5)
    eq_(first_tokens[4].text, "red")
    eq_(len(first_tokens[4].revisions), 3)

View File

@@ -0,0 +1,10 @@
from nose.tools import eq_
from .. import tokenization
def test_wikitext_split():
    expected = ["foo", " ", "bar", " ", "herp", " ", "{{", "derp", "}}"]
    actual = list(tokenization.wikitext_split("foo bar herp {{derp}}"))
    eq_(actual, expected)

View File

@@ -0,0 +1,16 @@
import re
# Compiled once at import time instead of on every call.  Alternatives match
# words, wikitext bracket/brace/quote markup, runs of newlines or spaces,
# HTML entities, and finally any single character.
_TOKEN_RE = re.compile(
    r"[\w]+|\[\[|\]\]|\{\{|\}\}|\n+| +|&\w+;|'''|''|=+|\{\||\|\}|\|\-|."
)


def wikitext_split(text):
    """
    Performs the simplest possible split of latin character-based languages
    and wikitext.

    :Parameters:
        text : str
            Text to split.

    :Returns:
        A list of token strings; concatenating them reproduces `text`.
    """
    return _TOKEN_RE.findall(text)

View File

@@ -0,0 +1,98 @@
class Token:
    """
    Represents a chunk of text and the revisions of a page that it survived.
    """
    __slots__ = ('text', 'revisions')

    def __init__(self, text, revisions=None):
        # The text of the token.
        self.text = text

        # The meta data for the revisions that the token has appeared within.
        self.revisions = [] if revisions is None else revisions

    def persist(self, revision):
        self.revisions.append(revision)

    def __repr__(self):
        fields = ", ".join([
            "text={0}".format(repr(self.text)),
            "revisions={0}".format(repr(self.revisions))
        ])
        return "{0}({1})".format(self.__class__.__name__, fields)
class Tokens(list):
    """
    Represents a :class:`list` of :class:`~mw.lib.persistence.Token` with some
    useful helper functions.

    :Example:
        >>> from mw.lib.persistence import Token, Tokens
        >>>
        >>> tokens = Tokens()
        >>> tokens.append(Token("foo"))
        >>> tokens.extend([Token(" "), Token("bar")])
        >>>
        >>> tokens[0]
        Token(text='foo', revisions=[])
        >>>
        >>> "".join(tokens.texts())
        'foo bar'
    """
    # Note: the original defined an __init__ that only forwarded to
    # super().__init__ -- removed as redundant.

    def persist(self, revision):
        """Records `revision` against every token in this list."""
        for token in self:
            token.persist(revision)

    def texts(self):
        """Yields each token's text, in order."""
        for token in self:
            yield token.text

    def compare(self, new, diff):
        """
        Diffs this token list's texts against the token-text sequence `new`
        using `diff` and returns (tokens, tokens_added, tokens_removed).
        """
        old = self.texts()

        return self.apply_diff(diff(old, new), self, new)

    @classmethod
    def apply_diff(cls, ops, old, new):
        """
        Applies diff operations, copying surviving tokens from `old` and
        minting fresh tokens for content that appears in `new`.

        :Returns:
            A (tokens, tokens_added, tokens_removed) triple of `cls` lists.
        """
        tokens = cls()
        tokens_added = cls()
        tokens_removed = cls()

        for code, a_start, a_end, b_start, b_end in ops:
            if code in ("insert", "replace"):
                # New content gets brand new Token objects.
                for token_text in new[b_start:b_end]:
                    token = Token(token_text)
                    tokens.append(token)
                    tokens_added.append(token)

                if code == "replace":
                    # The replaced old tokens were removed by this change.
                    tokens_removed.extend(old[a_start:a_end])
            elif code == "equal":
                # Unchanged content keeps its original (persisted) tokens.
                tokens.extend(old[a_start:a_end])
            elif code == "delete":
                tokens_removed.extend(old[a_start:a_end])
            else:
                assert False, \
                    "encountered an unrecognized operation code: " + repr(code)

        return (tokens, tokens_added, tokens_removed)

View File

@@ -0,0 +1,24 @@
"""
This module provides a set of utilities for detecting identity reverts in
revisioned content.
To detect reverts in a stream of revisions to a single page, you can use
:func:`detect`. If you'll be detecting reverts in a collection of pages or
would, for some other reason, prefer to process revisions one at a time,
:class:`Detector` and its :meth:`~Detector.process` will allow you to do so.
To detect reverts one-at-a-time and arbitrarily, you can use the `check()`
functions:
* :func:`database.check` and :func:`database.check_row` use a :class:`mw.database.DB`
* :func:`api.check` and :func:`api.check_rev` use a :class:`mw.api.Session`
Note that these functions are less performant than detecting reverts in a
stream of page revisions. This can be practical when trying to identify
reverted revisions in a user's contribution history.
"""
from .detector import Detector, Revert
from .functions import detect, reverts
from . import database
from . import api
from . import defaults

View File

@@ -0,0 +1,134 @@
from itertools import chain
from . import defaults
from ...types import Timestamp
from ...util import none_or
from .dummy_checksum import DummyChecksum
from .functions import detect
def check_rev(session, rev, **kwargs):
    """
    Checks whether a revision was reverted (identity) and returns a named
    tuple of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev : dict
            a revision dict containing 'revid' and 'page.id'
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        before : :class:`mw.Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
        properties : set( str )
            a set of properties to include in revisions (see :class:`mw.api.Revisions`)
    """
    # extract rev_id and page_id
    if 'revid' in rev:
        rev_id = rev['revid']
    else:
        # Bug fix: the message previously named 'rev_id', but the key that
        # is actually checked is 'revid'.
        raise TypeError("rev must have 'revid'")
    if 'page' in rev:
        page_id = rev['page']['id']
    elif 'pageid' in rev:
        page_id = rev['pageid']
    else:
        raise TypeError("rev must have 'page' or 'pageid'")

    # run the regular check
    return check(session, rev_id, page_id=page_id, **kwargs)
def check(session, rev_id, page_id=None, radius=defaults.RADIUS,
          before=None, window=None, properties=None):
    """
    Checks whether a revision was reverted (identity) and returns a named tuple
    of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        radius : int
            a positive integer indicating the maximum number of revisions
            that can be reverted
        before : :class:`mw.Timestamp`
            if set, limits the search for *reverting* revisions to those which
            were saved before this timestamp
        window : int
            if set, limits the search for *reverting* revisions to those which
            were saved within `window` seconds after the reverted edit
        properties : set( str )
            a set of properties to include in revisions (see :class:`mw.api.Revisions`)

    :Returns:
        A :class:`Revert` if one is detected for `rev_id`, otherwise `None`.
    """
    # Duck-typed validation of the session argument.
    if not hasattr(session, "revisions"):
        raise TypeError("session wrong type. Expected a mw.api.Session.")

    rev_id = int(rev_id)
    radius = int(radius)
    if radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")
    page_id = none_or(page_id, int)
    before = none_or(before, Timestamp)
    properties = set(properties) if properties is not None else set()

    # If we don't have the page_id, we're going to need to look them up
    if page_id is None:
        rev = session.revisions.get(rev_id, properties={'ids'})
        page_id = rev['page']['pageid']

    # Load history and current rev (rev_id itself plus up to `radius`
    # revisions before it).
    current_and_past_revs = list(session.revisions.query(
        pageids={page_id},
        limit=radius + 1,
        start_id=rev_id,
        direction="older",
        properties={'ids', 'timestamp', 'sha1'} | properties
    ))

    try:
        # Extract current rev and reorder history
        current_rev, past_revs = (
            current_and_past_revs[0],  # Current rev is the first one returned
            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
        )
    except IndexError:
        # Only way to get here is if there isn't enough history. Couldn't be
        # reverted. Just return None.
        return None

    # Translate a seconds `window` into an absolute `before` timestamp when
    # the caller didn't supply one.
    if window is not None and before is None:
        before = Timestamp(current_rev['timestamp']) + window

    # Load future revisions
    future_revs = session.revisions.query(
        pageids={page_id},
        limit=radius,
        start_id=rev_id + 1,  # Ensures that we skip the current revision
        end=before,
        direction="newer",
        properties={'ids', 'timestamp', 'sha1'} | properties
    )

    # Convert to an iterable of (checksum, rev) pairs for detect() to
    # consume.  Revisions without a sha1 (e.g. hidden) get a DummyChecksum,
    # which never matches anything but itself.
    checksum_revisions = chain(
        ((rev['sha1'] if 'sha1' in rev else DummyChecksum(), rev)
         for rev in past_revs),
        [(current_rev.get('sha1', DummyChecksum()), current_rev)],
        ((rev['sha1'] if 'sha1' in rev else DummyChecksum(), rev)
         for rev in future_revs),
    )

    for revert in detect(checksum_revisions, radius=radius):
        # Check that this is a relevant revert -- i.e. rev_id itself was
        # among the reverted revisions.
        if rev_id in [rev['revid'] for rev in revert.reverteds]:
            return revert

    return None

View File

@@ -0,0 +1,148 @@
import random
from itertools import chain
from . import defaults
from ...types import Timestamp
from ...util import none_or
from .dummy_checksum import DummyChecksum
from .functions import detect
# Hexadecimal digits used to build fake checksums.
HEX = "1234567890abcdef"


def random_sha1():
    """
    Generates a random 40-character hex string.  Used as a stand-in checksum
    in order to not do weird things with a dummy revision.

    (Fix: the original left this text as an orphaned no-op string statement
    after the function body.)
    """
    return ''.join(random.choice(HEX) for _ in range(40))
def check_row(db, rev_row, **kwargs):
    """
    Checks whether a revision (database row) was reverted (identity) and returns
    a named tuple of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        db : :class:`mw.database.DB`
            A database connection to make use of.
        rev_row : dict
            a revision row containing 'rev_id' and 'rev_page' or 'page_id'
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        check_archive : bool
            should the archive table be checked for reverting revisions?
        before : `Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
    """
    # Pull the revision ID out of the row.
    if 'rev_id' not in rev_row:
        raise TypeError("rev_row must have 'rev_id'")
    rev_id = rev_row['rev_id']

    # The page ID may appear under either of two keys.
    if 'page_id' in rev_row:
        page_id = rev_row['page_id']
    elif 'rev_page' in rev_row:
        page_id = rev_row['rev_page']
    else:
        raise TypeError("rev_row must have 'page_id' or 'rev_page'")

    # Delegate to the regular ID-based check.
    return check(db, rev_id, page_id=page_id, **kwargs)
def check(db, rev_id, page_id=None, radius=defaults.RADIUS, check_archive=False,
          before=None, window=None):
    """
    Checks whether a revision was reverted (identity) and returns a named tuple
    of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        db : `mw.database.DB`
            A database connection to make use of.
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        check_archive : bool
            should the archive table be checked for reverting revisions?
        before : `Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
        window : int
            if set, limits the search for *reverting* revisions to those which
            were saved within `window` seconds after the reverted edit

    :Returns:
        A :class:`Revert` if one is detected for `rev_id`, otherwise `None`.
    """
    # Bug fix: the original condition was
    #   `not hasattr(db, "revisions") and hasattr(db, "all_revisions")`
    # which, by precedence, only raised when `revisions` was missing but
    # `all_revisions` was present.  The intent is to require both.
    if not (hasattr(db, "revisions") and hasattr(db, "all_revisions")):
        raise TypeError("db wrong type. Expected a mw.database.DB.")

    rev_id = int(rev_id)
    radius = int(radius)
    if radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")
    page_id = none_or(page_id, int)
    check_archive = bool(check_archive)
    before = none_or(before, Timestamp)

    # If we are searching the archive, we'll need to use `all_revisions`.
    if check_archive:
        dbrevs = db.all_revisions
    else:
        dbrevs = db.revisions

    # If we don't have the page_id, we're going to need to look it up
    if page_id is None:
        row = dbrevs.get(rev_id=rev_id)
        page_id = row['rev_page']

    # Load history and current rev
    current_and_past_revs = list(dbrevs.query(
        page_id=page_id,
        limit=radius + 1,
        before_id=rev_id + 1,  # Ensures that we capture the current revision
        direction="older"
    ))

    try:
        # Extract current rev and reorder history
        current_rev, past_revs = (
            current_and_past_revs[0],  # Current rev is the first one returned
            reversed(current_and_past_revs[1:])  # The rest are past revs, but they are in the wrong order
        )
    except IndexError:
        # Only way to get here is if there isn't enough history. Couldn't be
        # reverted. Just return None.
        return None

    # Translate a seconds `window` into an absolute `before` timestamp when
    # the caller didn't supply one.
    if window is not None and before is None:
        before = Timestamp(current_rev['rev_timestamp']) + window

    # Load future revisions
    future_revs = dbrevs.query(
        page_id=page_id,
        limit=radius,
        after_id=rev_id,
        before=before,
        direction="newer"
    )

    # Convert to an iterable of (checksum, rev) pairs for detect() to
    # consume.  Rows with a NULL sha1 get a DummyChecksum, which never
    # matches anything but itself.
    checksum_revisions = chain(
        ((rev['rev_sha1'] if rev['rev_sha1'] is not None
          else DummyChecksum(), rev)
         for rev in past_revs),
        [(current_rev['rev_sha1'] or DummyChecksum(), current_rev)],
        ((rev['rev_sha1'] if rev['rev_sha1'] is not None
          else DummyChecksum(), rev)
         for rev in future_revs)
    )

    for revert in detect(checksum_revisions, radius=radius):
        # Check that this is a relevant revert -- i.e. rev_id itself was
        # among the reverted revisions.
        if rev_id in [rev['rev_id'] for rev in revert.reverteds]:
            return revert

    return None

View File

@@ -0,0 +1,24 @@
RADIUS = 15
"""
TODO: Better documentation here. For the time being, see:
Priedhorsky, R., Chen, J., Lam, S. T. K., Panciera, K., Terveen, L., &
Riedl, J. (2007, November). Creating, destroying, and restoring value in
Wikipedia. In Proceedings of the 2007 international ACM conference on
Supporting group work (pp. 259-268). ACM.
"""
class DUMMY_SHA1: pass
"""
Used in when checking for reverts when the checksum of the revision of interest
is unknown.
>>> DUMMY_SHA1 in {"aaa", "bbb"} # or any 40 character hex
False
>>>
>>> DUMMY_SHA1 == DUMMY_SHA1
True
>>> {DUMMY_SHA1, DUMMY_SHA1}
{<class '__main__.DUMMY_SHA1'>}
"""

View File

@@ -0,0 +1,83 @@
from collections import namedtuple
from ...util import ordered
from . import defaults
Revert = namedtuple("Revert", ['reverting', 'reverteds', 'reverted_to'])
"""
Represents a revert event. This class behaves like
:class:`collections.namedtuple`. Note that the datatypes of `reverting`,
`reverteds` and `reverted_to` is not specified since those types will depend
on the revision data provided during revert detection.
:Members:
**reverting**
The reverting revision data : `mixed`
**reverteds**
The reverted revision data (ordered chronologically) : list( `mixed` )
**reverted_to**
The reverted-to revision data : `mixed`
"""
class Detector(ordered.HistoricalMap):
    """
    Detects revert events in a stream of revisions (to the same page) based on
    matching checksums.  To detect reverts, construct an instance of this class
    and call :meth:`process` in chronological order (``direction == "newer"``).

    See `<https://meta.wikimedia.org/wiki/R:Identity_revert>`_

    :Parameters:
        radius : int
            a positive integer indicating the maximum revision distance that a revert can span.

    :Example:
        >>> from mw.lib import reverts
        >>> detector = reverts.Detector()
        >>>
        >>> detector.process("aaa", {'rev_id': 1})
        >>> detector.process("bbb", {'rev_id': 2})
        >>> detector.process("aaa", {'rev_id': 3})
        Revert(reverting={'rev_id': 3}, reverteds=[{'rev_id': 2}], reverted_to={'rev_id': 1})
        >>> detector.process("ccc", {'rev_id': 4})
    """

    def __init__(self, radius=defaults.RADIUS):
        """
        :Parameters:
            radius : int
                a positive integer indicating the maximum revision distance that a revert can span.
        """
        if radius < 1:
            raise TypeError("invalid radius. Expected a positive integer.")
        super().__init__(maxlen=radius + 1)

    def process(self, checksum, revision=None):
        """
        Process a new revision and detect a revert if it occurred.  Note that
        you can pass whatever you like as `revision` and it will be returned
        in the case that a revert occurs.

        :Parameters:
            checksum : str
                Any identity-matchable string-based hash of revision content
            revision : `mixed`
                Revision meta data.  Note that any data will just be returned
                in the case of a revert.

        :Returns:
            a :class:`~mw.lib.reverts.Revert` if one occured or `None`
        """
        if checksum not in self:
            # Brand new content -- can't possibly be a revert.
            self.insert(checksum, revision)
            return None

        # Potential revert: gather the revisions saved between the previous
        # occurrence of this checksum and now.
        intervening = list(self.up_to(checksum))

        detected = None
        if intervening:  # With nothing in between, this is just a noop edit.
            detected = Revert(revision, intervening, self[checksum])

        self.insert(checksum, revision)
        return detected

View File

@@ -0,0 +1,24 @@
class DummyChecksum():
    """
    Used when checking for reverts when the checksum of the revision of
    interest is unknown.  DummyChecksums won't match each other or anything
    else, but they will match themselves and they are hashable.

    >>> dummy1 = DummyChecksum()
    >>> dummy1
    <#140687347334280>
    >>> dummy1 == dummy1
    True
    >>>
    >>> dummy2 = DummyChecksum()
    >>> dummy2
    <#140687347334504>
    >>> dummy1 == dummy2
    False
    >>>
    >>> {"foo", "bar", dummy1, dummy1, dummy2}
    {<#140687347334280>, 'foo', <#140687347334504>, 'bar'}
    """

    def __str__(self):
        # Bug fix: the original lacked `return`, so str(dummy) returned
        # None and raised "TypeError: __str__ returned non-string".
        return repr(self)

    def __repr__(self):
        return "<#" + str(id(self)) + ">"

View File

@@ -0,0 +1,46 @@
from .detector import Detector
from . import defaults
def detect(checksum_revisions, radius=defaults.RADIUS):
    """
    Detects reverts that occur in a sequence of revisions.  Note that
    `revision` data meta will simply be returned in the case of a revert.

    This function serves as a convenience wrapper around calls to
    :class:`Detector`'s :meth:`~Detector.process` method.

    :Parameters:
        checksum_revisions : iter( ( checksum : str, revision : `mixed` ) )
            an iterable over tuples of checksum and revision meta data
        radius : int
            a positive integer indicating the maximum revision distance that a revert can span.

    :Return:
        a iterator over :class:`Revert`

    :Example:
        >>> from mw.lib import reverts
        >>>
        >>> checksum_revisions = [
        ...     ("aaa", {'rev_id': 1}),
        ...     ("bbb", {'rev_id': 2}),
        ...     ("aaa", {'rev_id': 3}),
        ...     ("ccc", {'rev_id': 4})
        ... ]
        >>>
        >>> list(reverts.detect(checksum_revisions))
        [Revert(reverting={'rev_id': 3}, reverteds=[{'rev_id': 2}], reverted_to={'rev_id': 1})]
    """
    detector = Detector(radius)

    # Feed each (checksum, revision) pair through the detector, emitting
    # any revert it reports.
    for checksum, revision in checksum_revisions:
        detected = detector.process(checksum, revision)
        if detected is not None:
            yield detected

# For backwards compatibility
reverts = detect

View File

@@ -0,0 +1,33 @@
from nose.tools import eq_
from ..detector import Detector
def test_detector():
    detector = Detector(2)

    eq_(detector.process("a", {'id': 1}), None)

    # Re-saving identical content is a noop, not a revert.
    eq_(detector.process("a", {'id': 2}), None)

    # Short revert
    eq_(detector.process("b", {'id': 3}), None)
    expected_short = ({'id': 4}, [{'id': 3}], {'id': 2})
    eq_(detector.process("a", {'id': 4}), expected_short)

    # Medium revert
    eq_(detector.process("c", {'id': 5}), None)
    eq_(detector.process("d", {'id': 6}), None)
    expected_medium = ({'id': 7}, [{'id': 6}, {'id': 5}], {'id': 4})
    eq_(detector.process("a", {'id': 7}), expected_medium)

    # Long (undetected) revert -- spans more than the radius.
    for checksum, rev_id in (("e", 8), ("f", 9), ("g", 10), ("a", 11)):
        eq_(detector.process(checksum, {'id': rev_id}), None)

View File

@@ -0,0 +1,23 @@
from nose.tools import eq_
from ..functions import reverts
def test_reverts():
    checksum_revisions = [
        ("a", {'id': 1}),
        ("b", {'id': 2}),
        ("c", {'id': 3}),
        ("a", {'id': 4}),
        ("d", {'id': 5}),
        ("b", {'id': 6}),
        ("a", {'id': 7})
    ]

    remaining = [
        ({'id': 4}, [{'id': 3}, {'id': 2}], {'id': 1}),
        ({'id': 7}, [{'id': 6}, {'id': 5}], {'id': 4})
    ]

    # Each detected revert should match the next anticipated one, in order.
    for detected in reverts(checksum_revisions, radius=2):
        eq_(detected, remaining.pop(0))

View File

@@ -0,0 +1,4 @@
from .functions import cluster, sessions
from .event import Event
from .cache import Cache, Session
from . import defaults

View File

@@ -0,0 +1,121 @@
import logging
from collections import namedtuple
from ...util import Heap
from ...types import Timestamp
from . import defaults
from .event import Event, unpack_events
logger = logging.getLogger("mw.lib.sessions.cache")
Session = namedtuple("Session", ["user", "events"])
"""
Represents a user session (a cluster over events for a user). This class
behaves like :class:`collections.namedtuple`. Note that the datatypes of
`events`, is not specified since those types will depend on the revision data
provided during revert detection.
:Members:
**user**
A hashable user identifier : `hashable`
**events**
A list of event data : list( `mixed` )
"""
class Cache:
    """
    A cache of recent user sessions.  Since sessions expire once activities
    stop for at least `cutoff` seconds, this class manages a cache of
    *active* sessions.

    :Parameters:
        cutoff : int
            Maximum amount of time in seconds between session events

    :Example:
        >>> from mw.lib import sessions
        >>>
        >>> cache = sessions.Cache(cutoff=3600)
        >>>
        >>> list(cache.process("Willy on wheels", 100000, {'rev_id': 1}))
        []
        >>> list(cache.process("Walter", 100001, {'rev_id': 2}))
        []
        >>> list(cache.process("Willy on wheels", 100001, {'rev_id': 3}))
        []
        >>> list(cache.process("Walter", 100035, {'rev_id': 4}))
        []
        >>> list(cache.process("Willy on wheels", 103602, {'rev_id': 5}))
        [Session(user='Willy on wheels', events=[{'rev_id': 1}, {'rev_id': 3}])]
        >>> list(cache.get_active_sessions())
        [Session(user='Walter', events=[{'rev_id': 2}, {'rev_id': 4}]), Session(user='Willy on wheels', events=[{'rev_id': 5}])]
    """

    def __init__(self, cutoff=defaults.CUTOFF):
        self.cutoff = int(cutoff)

        # Heap of (timestamp, events) pairs ordered by the timestamp under
        # which the session was last pushed.
        self.recently_active = Heap()
        # Maps user -> the mutable event list of that user's active session.
        self.active_users = {}

    def process(self, user, timestamp, data=None):
        """
        Processes a user event.

        :Parameters:
            user : `hashable`
                A hashable value to identify a user (`int` or `str` are OK)
            timestamp : :class:`mw.Timestamp`
                The timestamp of the event
            data : `mixed`
                Event meta data

        :Returns:
            A generator of :class:`~mw.lib.sessions.Session` expired after
            processing the user event.
        """
        event = Event(user, Timestamp(timestamp), data)

        # Emit any sessions that this event's timestamp expires.  (Fix: the
        # loop variable no longer shadows the `user` parameter.)
        for expired_user, expired_events in self._clear_expired(event.timestamp):
            yield Session(expired_user, unpack_events(expired_events))

        # Apply revision: append to the user's active session, creating and
        # registering a new event list if the user has none.
        if event.user in self.active_users:
            events = self.active_users[event.user]
        else:
            events = []
            self.active_users[event.user] = events
            self.recently_active.push((event.timestamp, events))

        events.append(event)

    def get_active_sessions(self):
        """
        Retrieves the active, unexpired sessions.

        :Returns:
            A generator of :class:`~mw.lib.sessions.Session`
        """
        for last_timestamp, events in self.recently_active:
            yield Session(events[-1].user, unpack_events(events))

    def _clear_expired(self, timestamp):
        # Cull old sessions: pop entries whose tracked timestamp is stale;
        # sessions with a newer last event get re-pushed instead of expired.
        while (len(self.recently_active) > 0 and
               timestamp - self.recently_active.peek()[0] >= self.cutoff):

            _, events = self.recently_active.pop()

            if timestamp - events[-1].timestamp >= self.cutoff:
                del self.active_users[events[-1].user]
                yield events[-1].user, events
            else:
                self.recently_active.push((events[-1].timestamp, events))

    def __repr__(self):
        # Bug fix: the original called .format() on "%s(%s)" -- a %-style
        # template with no {} placeholders -- so repr() returned the
        # literal string "%s(%s)".
        return "{0}({1})".format(self.__class__.__name__, repr(self.cutoff))

View File

@@ -0,0 +1,6 @@
# Default session cutoff: one hour, expressed in seconds.
CUTOFF = 60 * 60
"""
TODO: Better documentation here.
For the time being, see
`<https://meta.wikimedia.org/wiki/Research:Edit_session>`_
"""

View File

@@ -0,0 +1,19 @@
import logging
from collections import namedtuple
logger = logging.getLogger("mw.lib.sessions.event")

# A single user action: `user` identifies the actor, `timestamp` orders
# events, and `data` carries arbitrary caller-supplied meta data that is
# handed back when sessions are emitted.  (The commented-out class version
# of this type was removed as dead code.)
Event = namedtuple("Event", ['user', 'timestamp', 'data'])


def unpack_events(events):
    """Extracts the caller-supplied `data` payload from each event."""
    return [event.data for event in events]

View File

@@ -0,0 +1,68 @@
import logging
from .cache import Cache
from . import defaults
logger = logging.getLogger("mw.lib.sessions.functions")
def cluster(user_events, cutoff=defaults.CUTOFF):
    """
    Clusters a sequence of user events into sessions.  Each event's data
    payload is carried through unchanged and returned inside its session.

    This function is a convenience wrapper around
    :class:`~mw.lib.sessions.Cache`'s :meth:`~mw.lib.sessions.Cache.process`
    method.

    :Parameters:
        user_events : iter( (user, timestamp, event) )
            an iterable over tuples of user, timestamp and event data.

            * user : `hashable`
            * timestamp : :class:`mw.Timestamp`
            * event : `mixed`

        cutoff : int
            the maximum time between events within a user session

    :Returns:
        a iterator over :class:`~mw.lib.sessions.Session`

    :Example:
        >>> from mw.lib import sessions
        >>>
        >>> user_events = [
        ...     ("Willy on wheels", 100000, {'rev_id': 1}),
        ...     ("Walter", 100001, {'rev_id': 2}),
        ...     ("Willy on wheels", 100001, {'rev_id': 3}),
        ...     ("Walter", 100035, {'rev_id': 4}),
        ...     ("Willy on wheels", 103602, {'rev_id': 5})
        ... ]
        >>>
        >>> for user, events in sessions.cluster(user_events):
        ...     (user, events)
        ...
        ('Willy on wheels', [{'rev_id': 1}, {'rev_id': 3}])
        ('Walter', [{'rev_id': 2}, {'rev_id': 4}])
        ('Willy on wheels', [{'rev_id': 5}])
    """
    session_cache = Cache(cutoff)

    # Feed the events through the cache; sessions are emitted as soon as
    # they expire.
    for user, timestamp, data in user_events:
        for expired_session in session_cache.process(user, timestamp, data):
            yield expired_session

    # Flush the sessions that never expired.
    for open_session in session_cache.get_active_sessions():
        yield open_session

# For backwards compatibility
sessions = cluster

View File

@@ -0,0 +1,22 @@
from nose.tools import eq_
from ..cache import Cache
def test_session_manager():
    # Sessions expire after `cutoff` seconds without activity.
    cache = Cache(cutoff=2)

    # While events stay within the cutoff, nothing expires.
    eq_(list(cache.process("foo", 1)), [])
    eq_(list(cache.process("bar", 2)), [])
    eq_(list(cache.process("foo", 2)), [])

    # By t=10 both earlier sessions are stale, so two sessions come out.
    expired = list(cache.process("bar", 10))
    eq_(len(expired), 2)

    # Only bar's fresh session remains active.
    eq_(len(list(cache.get_active_sessions())), 1)

View File

@@ -0,0 +1,50 @@
from itertools import chain
from nose.tools import eq_
from .. import defaults
from ..functions import sessions
# Fixture events for two users.  Each user has two groups: events within a
# group are seconds apart, while the second group begins a full CUTOFF
# after the first, so each group should cluster into its own session.
# Tuples are (user, timestamp, event_data).
EVENTS = {
    "foo": [
        [
            ("foo", 1234567890, 1),
            ("foo", 1234567892, 2),
            ("foo", 1234567894, 3)
        ],
        [
            ("foo", 1234567894 + defaults.CUTOFF, 4),
            ("foo", 1234567897 + defaults.CUTOFF, 5)
        ]
    ],
    "bar": [
        [
            ("bar", 1234567891, 6),
            ("bar", 1234567892, 7),
            ("bar", 1234567893, 8)
        ],
        [
            ("bar", 1234567895 + defaults.CUTOFF, 9),
            ("bar", 1234567898 + defaults.CUTOFF, 0)
        ]
    ]
}
def test_group_events():
    # Interleave both users' events into a single time-ordered stream.
    combined = list(chain(*EVENTS['foo'])) + list(chain(*EVENTS['bar']))
    combined.sort()

    # Each emitted session must match the next expected group for its user.
    counts = {'foo': 0, 'bar': 0}
    for user, session in sessions(combined):
        expected = [e[2] for e in EVENTS[user][counts[user]]]
        eq_(expected, list(session))
        counts[user] += 1

View File

@@ -0,0 +1,2 @@
from .functions import normalize
from .parser import Parser

View File

@@ -0,0 +1,25 @@
def normalize(title):
    """
    Normalizes a page title to the database format: spaces are converted
    to underscores and the first character of the title is upper-cased.

    :Parameters:
        title : str
            A page title

    :Returns:
        The normalized title.

    :Example:
        >>> from mw.lib import title
        >>>
        >>> title.normalize("foo bar")
        'Foo_bar'
    """
    # Guard clauses: None passes through untouched, "" maps to "".
    if title is None:
        return None
    if not title:
        return ""
    return (title[0].upper() + title[1:]).replace(" ", "_")

View File

@@ -0,0 +1,171 @@
from ...types import Namespace
from ...util import autovivifying, none_or
from .functions import normalize
class Parser:
    """
    Constructs a page name parser from a set of :class:`mw.Namespace`. Such a
    parser can be used to convert a full page name (namespace included with a
    colon; e.g, ``"Talk:Foo"``) into a namespace ID and
    :func:`mw.lib.title.normalize`'d page title (e.g., ``(1, "Foo")``).

    :Parameters:
        namespaces : set( :class:`mw.Namespace` )

    :Example:
        >>> from mw import Namespace
        >>> from mw.lib import title
        >>>
        >>> parser = title.Parser(
        ...     [
        ...         Namespace(0, "", case="first-letter"),
        ...         Namespace(1, "Discuss\u00e3o", canonical="Talk", case="first-letter"),
        ...         Namespace(2, "Usu\u00e1rio(a)", canonical="User", aliases={"U"}, case="first-letter")
        ...     ]
        ... )
        >>>
        >>> parser.parse("Discuss\u00e3o:Foo") # Using the standard name
        (1, 'Foo')
        >>> parser.parse("Talk:Foo bar") # Using the canonical name
        (1, 'Foo_bar')
        >>> parser.parse("U:Foo bar") # Using an alias
        (2, 'Foo_bar')
        >>> parser.parse("Herpderp:Foo bar") # Pseudo namespace
        (0, 'Herpderp:Foo_bar')
    """

    def __init__(self, namespaces=None):
        namespaces = none_or(namespaces, set)

        # Lookup tables: namespace ID -> Namespace, and normalized name
        # (standard, canonical or alias) -> Namespace.
        self.ids = {}
        self.names = {}

        if namespaces is not None:
            for namespace in namespaces:
                self.add_namespace(namespace)

    def parse(self, page_name):
        """
        Parses a page name to extract the namespace.

        :Parameters:
            page_name : str
                A page name including the namespace prefix and a colon (if not Main)

        :Returns:
            A tuple of (namespace : `int`, title : `str`)
        """
        # Split on the first colon only; titles may contain further colons.
        parts = page_name.split(":", 1)
        if len(parts) == 1:
            # No colon at all: Main namespace (ID 0).
            ns_id = 0
            title = normalize(page_name)
        else:
            ns_name, title = parts
            ns_name, title = normalize(ns_name), normalize(title)

            if self.contains_name(ns_name):
                ns_id = self.get_namespace(name=ns_name).id
            else:
                # Unrecognized prefix (pseudo-namespace): treat the whole
                # page name, colon included, as a Main-namespace title.
                ns_id = 0
                title = normalize(page_name)

        return ns_id, title

    def add_namespace(self, namespace):
        """
        Adds a namespace to the parser.

        :Parameters:
            namespace : :class:`mw.Namespace`
                A namespace
        """
        self.ids[namespace.id] = namespace
        self.names[namespace.name] = namespace

        # Aliases and the canonical name all resolve to the same namespace.
        for alias in namespace.aliases:
            self.names[alias] = namespace

        if namespace.canonical is not None:
            self.names[namespace.canonical] = namespace

    def contains_name(self, name):
        # Names are stored normalized, so normalize before the lookup.
        return normalize(name) in self.names

    def get_namespace(self, id=None, name=None):
        """
        Gets a namespace from the parser.  Throws a :class:`KeyError` if a
        namespace cannot be found.

        :Parameters:
            id : int
                A namespace ID
            name : str
                A namespace name (standard, canonical names and aliases
                will be searched)

        :Returns:
            A :class:`mw.Namespace`.
        """
        # `id` takes precedence when both are given.
        if id is not None:
            return self.ids[int(id)]
        else:
            return self.names[normalize(name)]

    @classmethod
    def from_site_info(cls, si_doc):
        """
        Constructs a parser from the result of a :meth:`mw.api.SiteInfo.query`.

        :Parameters:
            si_doc : dict
                The result of a site_info request.

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        # Maps namespace ID -> list of alias strings; missing keys vivify
        # to an empty list.
        aliases = autovivifying.Dict(vivifier=lambda k: [])
        # get aliases
        if 'namespacealiases' in si_doc:
            for alias_doc in si_doc['namespacealiases']:
                aliases[alias_doc['id']].append(alias_doc['*'])

        namespaces = []
        for ns_doc in si_doc['namespaces'].values():
            namespaces.append(
                Namespace.from_doc(ns_doc, aliases)
            )

        return Parser(namespaces)

    @classmethod
    def from_api(cls, session):
        """
        Constructs a parser from a :class:`mw.api.Session`

        :Parameters:
            session : :class:`mw.api.Session`
                An open API session

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        si_doc = session.site_info.query(
            properties={'namespaces', 'namespacealiases'}
        )

        return cls.from_site_info(si_doc)

    @classmethod
    def from_dump(cls, dump):
        """
        Constructs a parser from a :class:`mw.xml_dump.Iterator`.  Note that
        XML database dumps do not include namespace aliases or canonical names
        so the parser that will be constructed will only work in common cases.

        :Parameters:
            dump : :class:`mw.xml_dump.Iterator`
                An XML dump iterator

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        return cls(dump.namespaces)

View File

@@ -0,0 +1,10 @@
from nose.tools import eq_
from ..functions import normalize
def test_normalize():
    # (input, expected) pairs exercising each normalization rule.
    cases = [
        ("Foobar", "Foobar"),    # already normal
        ("foobar", "Foobar"),    # first letter capitalized
        ("fooBar", "FooBar"),    # later capitals preserved
        ("Foo bar", "Foo_bar"),  # spaces become underscores
    ]
    for raw, expected in cases:
        eq_(expected, normalize(raw))

View File

@@ -0,0 +1,58 @@
from nose.tools import eq_
from ....types import Namespace
from ..parser import Parser
def test_simple():
    # A parser covering Main plus Portuguese Talk/User namespaces.
    parser = Parser([
        Namespace(0, "", case="first-letter"),
        Namespace(1, "Discuss\u00e3o", canonical="Talk", case="first-letter"),
        Namespace(2, "Usu\u00e1rio(a)", canonical="User", case="first-letter")
    ])

    eq_((1, "Foo"), parser.parse("Discuss\u00e3o:Foo"))
    eq_((1, "Foo_bar"), parser.parse("Discuss\u00e3o:Foo bar"))
    # Unknown prefixes fall through to the Main namespace.
    eq_((0, "Herpderp:Foo_bar"), parser.parse("Herpderp:Foo bar"))
def test_from_site_info():
parser = Parser.from_site_info(
{
"namespaces": {
"0": {
"id": 0,
"case": "first-letter",
"*": "",
"content": ""
},
"1": {
"id": 1,
"case": "first-letter",
"*": "Discuss\u00e3o",
"subpages": "",
"canonical": "Talk"
},
"2": {
"id": 2,
"case": "first-letter",
"*": "Usu\u00e1rio(a)",
"subpages": "",
"canonical": "User"
}
},
"namespacealiases": [
{
"id": 1,
"*": "WAFFLES"
}
]
}
)
eq_((1, "Foo"), parser.parse("Discuss\u00e3o:Foo"))
eq_((1, "Foo_bar"), parser.parse("Discuss\u00e3o:Foo bar"))
eq_((0, "Herpderp:Foo_bar"), parser.parse("Herpderp:Foo bar"))
eq_((1, "Foo_bar"), parser.parse("WAFFLES:Foo bar"))