Initial commit
p# new file: runwikiq.sh
This commit is contained in:
@@ -0,0 +1,14 @@
|
||||
"""
|
||||
A package with utilities for managing the persistent word analysis across text
|
||||
versions of a document. `PersistenceState` is the highest level of the
|
||||
interface and the part of the system that's most interesting externally. `Word`s
|
||||
are also very important. The current implementation of `Word` only accounts for
|
||||
how the number of revisions in which a Word is visible. If persistent word
|
||||
views (or something similar) is intended to be kept, refactoring will be
|
||||
necessary.
|
||||
"""
|
||||
|
||||
from .state import State
|
||||
from .tokens import Tokens, Token
|
||||
from . import defaults
|
||||
from . import api
|
||||
@@ -0,0 +1,85 @@
|
||||
from .. import reverts
|
||||
from ...util import none_or
|
||||
from .state import State
|
||||
|
||||
|
||||
def track(session, rev_id, page_id=None, revert_radius=reverts.defaults.RADIUS,
          future_revisions=reverts.defaults.RADIUS, properties=None):
    """
    Computes a persistence score for a revision by processing the revisions
    that took place around it.

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        revert_radius : int
            a positive integer indicating the maximum number of revisions that
            can be reverted
        future_revisions : int
            the number of revisions *after* `rev_id` to process persistence
            across
        properties : set( str )
            extra revision properties to request from the API

    :Returns:
        A (current_rev, tokens_added, future_revs) triple, or `None` if the
        revision has no available history.
    """

    if not hasattr(session, "revisions"):
        raise TypeError("session is wrong type. Expected a mw.api.Session.")

    rev_id = int(rev_id)
    page_id = none_or(page_id, int)
    revert_radius = int(revert_radius)
    if revert_radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")
    properties = set(properties) if properties is not None else set()

    # If we don't have the page_id, we're going to need to look it up from
    # the revision itself.
    if page_id is None:
        rev = session.revisions.get(rev_id, properties={'ids'})
        page_id = rev['page']['pageid']

    # Load the current rev and its history (newest first).
    current_and_past_revs = list(session.revisions.query(
        pageids={page_id},
        limit=revert_radius + 1,
        start_id=rev_id,
        direction="older",
        properties={'ids', 'timestamp', 'content', 'sha1'} | properties
    ))

    try:
        # Extract current rev and reorder history chronologically.
        current_rev, past_revs = (
            current_and_past_revs[0],  # Current rev is the first one returned
            reversed(current_and_past_revs[1:])  # Past revs, oldest first
        )
    except IndexError:
        # Only way to get here is if there isn't enough history. Couldn't be
        # reverted. Just return None.
        return None

    # Load future revisions.
    future_revs = session.revisions.query(
        pageids={page_id},
        limit=future_revisions,
        start_id=rev_id + 1,  # Ensures that we skip the current revision
        direction="newer",
        properties={'ids', 'timestamp', 'content', 'sha1'} | properties
    )

    state = State(revert_radius=revert_radius)

    # Warm up the state with the past revisions.
    for rev in past_revs:
        state.process(rev.get('*', ""), rev, rev.get('sha1'))

    # Process the revision of interest, capturing the tokens it added.
    # BUGFIX: use the same "" default as the other process() calls so a
    # hidden/missing text field doesn't crash the tokenizer with None.
    _, tokens_added, _ = state.process(current_rev.get('*', ""), current_rev,
                                       current_rev.get('sha1'))

    # Process the future revisions so tokens_added accumulates persistence.
    future_revs = list(future_revs)
    for rev in future_revs:
        state.process(rev.get('*', ""), rev, rev.get('sha1'))

    return current_rev, tokens_added, future_revs

score = track  # Backwards-compatible alias
|
||||
@@ -0,0 +1,11 @@
|
||||
from . import tokenization, difference
|
||||
|
||||
TOKENIZE = tokenization.wikitext_split
|
||||
"""
|
||||
The standard tokenizing function.
|
||||
"""
|
||||
|
||||
DIFF = difference.sequence_matcher
|
||||
"""
|
||||
The standard diff function
|
||||
"""
|
||||
@@ -0,0 +1,49 @@
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
|
||||
def sequence_matcher(old, new):
    """
    Generates a sequence of operations using
    :class:`difflib.SequenceMatcher`.

    :Parameters:
        old : list( `hashable` )
            Old tokens
        new : list( `hashable` )
            New tokens

    :Returns:
        Minimal operations needed to convert `old` to `new`
    """
    # Materialize both sides so arbitrary iterables are accepted.
    old_tokens = list(old)
    new_tokens = list(new)
    return SequenceMatcher(None, old_tokens, new_tokens).get_opcodes()
|
||||
|
||||
|
||||
def apply(ops, old, new):
    """
    Applies operations (delta) to copy items from `old` to `new`.

    :Parameters:
        ops : list((op, a1, a2, b1, b2))
            Operations to perform
        old : list( `hashable` )
            Old tokens
        new : list( `hashable` )
            New tokens

    :Returns:
        An iterator over elements matching `new` but copied from `old`
    """
    for code, a_start, a_end, b_start, b_end in ops:
        if code == "insert" or code == "replace":
            # New content: emit the items from `new`.
            # (The two branches were duplicates in the original.)
            yield from new[b_start:b_end]
        elif code == "equal":
            # Unchanged content: emit the original items from `old`.
            yield from old[a_start:a_end]
        elif code == "delete":
            # Deleted content contributes nothing.
            pass
        else:
            # BUGFIX: corrected "encounted" typo in the failure message.
            assert False, \
                "encountered an unrecognized operation code: " + repr(code)
|
||||
@@ -0,0 +1,149 @@
|
||||
from hashlib import sha1
|
||||
|
||||
from . import defaults
|
||||
from .. import reverts
|
||||
from .tokens import Token, Tokens
|
||||
|
||||
|
||||
class Version:
    """
    Lightweight holder for the token list of one processed revision.
    Instances are handed to the revert detector so that a revert can
    recover the token list of the version it reverted to.
    """
    # BUGFIX: the original had __slots__ = ('tokens') -- a plain string,
    # not a tuple. It only worked because a string is accepted as a single
    # slot name. Spell it as a proper one-element tuple.
    __slots__ = ('tokens',)

    def __init__(self):
        # Populated by State.process() after diffing/revert resolution.
        self.tokens = None
|
||||
|
||||
|
||||
class State:
    """
    Represents the state of word persistence in a page.
    See `<https://meta.wikimedia.org/wiki/Research:Content_persistence>`_

    :Parameters:
        tokenize : function( `str` ) --> list( `str` )
            A tokenizing function
        diff : function(list( `str` ), list( `str` )) --> list( `ops` )
            A function to perform a difference between token lists
        revert_radius : int
            a positive integer indicating the maximum revision distance that
            a revert can span.
        revert_detector : :class:`mw.lib.reverts.Detector`
            a revert detector to start process with

    :Example:
        >>> from mw.lib import persistence
        >>> state = persistence.State()
        >>> tokens, added, removed = state.process("Apples are red.", revision=1)
    """

    def __init__(self, tokenize=defaults.TOKENIZE, diff=defaults.DIFF,
                 revert_radius=reverts.defaults.RADIUS,
                 revert_detector=None):
        self.tokenize = tokenize
        self.diff = diff

        # Either accept the caller's detector or build one from the radius.
        self.revert_detector = (
            reverts.Detector(int(revert_radius))
            if revert_detector is None
            else revert_detector
        )

        # The most recently processed version (None until first process()).
        self.last = None

    def process(self, text, revision=None, checksum=None):
        """
        Modifies the internal state based on a change to the content and
        returns the sets of words added and removed.

        :Parameters:
            text : str
                The text content of a revision
            revision : `mixed`
                Revision meta data
            checksum : str
                A checksum hash of the text content (generated if absent)

        :Returns:
            Three :class:`~mw.lib.persistence.Tokens` lists

            current_tokens : :class:`~mw.lib.persistence.Tokens`
                The token sequence for the processed revision
            tokens_added : :class:`~mw.lib.persistence.Tokens`
                Tokens inserted by the processed revision
            tokens_removed : :class:`~mw.lib.persistence.Tokens`
                Tokens removed by the processed revision
        """
        if checksum is None:
            checksum = sha1(bytes(text, 'utf8')).hexdigest()

        version = Version()
        revert = self.revert_detector.process(checksum, version)

        if revert is not None:
            # Identity revert: nothing added or removed; adopt the token
            # list of the version we reverted back to.
            tokens_added = Tokens()
            tokens_removed = Tokens()
            _, _, reverted_to = revert
            version.tokens = reverted_to.tokens
        elif self.last is None:
            # First version of the page -- every token is freshly added.
            version.tokens = Tokens(Token(t) for t in self.tokenize(text))
            tokens_added = version.tokens
            tokens_removed = Tokens()
        else:
            # NOTICE: HEAVY COMPUTATION HERE!!!
            #
            # It's "just" a diff, but diffs usually run in O(n^2)--O(n^3)
            # time and tokenizers produce a lot of tokens, so most of the
            # runtime lands on this line.
            version.tokens, tokens_added, tokens_removed = \
                self.last.tokens.compare(self.tokenize(text), self.diff)

        version.tokens.persist(revision)
        self.last = version

        return version.tokens, tokens_added, tokens_removed
|
||||
@@ -0,0 +1,12 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from .. import difference
|
||||
|
||||
|
||||
def test_sequence_matcher():
    # Round-trip: applying the ops produced by sequence_matcher to the old
    # string must reconstruct the new string exactly.
    old = "foobar derp hepl derpl"
    new = "fooasldal 3 hepl asl a derpl"

    ops = difference.sequence_matcher(old, new)

    eq_("".join(difference.apply(ops, old, new)), new)
|
||||
@@ -0,0 +1,25 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from ..state import State
|
||||
|
||||
|
||||
def test_state():
    contents_revisions = [
        ("Apples are red.", 0),
        ("Apples are blue.", 1),
        ("Apples are red.", 2),
        ("Apples are tasty and red.", 3),
        ("Apples are tasty and blue.", 4)
    ]

    state = State()

    token_sets = [state.process(content, revision)
                  for content, revision in contents_revisions]

    # Every processed version must round-trip back to its source text.
    for (content, _revision), tokens in zip(contents_revisions, token_sets):
        eq_("".join(tokens[0].texts()), content)

    # "Apples" persists across all 5 revisions; "red" across only 3
    # (it is reverted back in once and replaced at the end).
    eq_(token_sets[0][0][0].text, "Apples")
    eq_(len(token_sets[0][0][0].revisions), 5)
    eq_(token_sets[0][0][4].text, "red")
    eq_(len(token_sets[0][0][4].revisions), 3)
|
||||
@@ -0,0 +1,10 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from .. import tokenization
|
||||
|
||||
|
||||
def test_wikitext_split():
    # Words, single spaces, and template braces all come out as separate
    # tokens.
    expected = ["foo", " ", "bar", " ", "herp", " ", "{{", "derp", "}}"]
    eq_(list(tokenization.wikitext_split("foo bar herp {{derp}}")), expected)
|
||||
@@ -0,0 +1,16 @@
|
||||
import re
|
||||
|
||||
|
||||
def wikitext_split(text):
    """
    Performs the simplest possible split of latin character-based languages
    and wikitext.

    :Parameters:
        text : str
            Text to split.
    """
    pattern = (r"[\w]+"             # runs of word characters
               r"|\[\[|\]\]"        # link brackets
               r"|\{\{|\}\}"        # template braces
               r"|\n+| +"           # runs of newlines / spaces
               r"|&\w+;"            # HTML entities
               r"|'''|''"           # bold / italic markers
               r"|=+"               # heading markers
               r"|\{\||\|\}|\|\-"   # table markup
               r"|.")               # anything else, one char at a time
    return re.findall(pattern, text)
|
||||
@@ -0,0 +1,98 @@
|
||||
class Token:
    """
    A chunk of text together with the meta data of the page revisions in
    which it survived.
    """
    __slots__ = ('text', 'revisions')

    def __init__(self, text, revisions=None):
        # The text of the token.
        self.text = text
        # Meta data for each revision the token has appeared within.
        self.revisions = [] if revisions is None else revisions

    def persist(self, revision):
        """Record that this token survived into `revision`."""
        self.revisions.append(revision)

    def __repr__(self):
        fields = ", ".join((
            "text={0!r}".format(self.text),
            "revisions={0!r}".format(self.revisions),
        ))
        return "{0}({1})".format(self.__class__.__name__, fields)
|
||||
|
||||
|
||||
class Tokens(list):
    """
    Represents a :class:`list` of :class:`~mw.lib.persistence.Token` with
    some useful helper functions.

    :Example:

        >>> from mw.lib.persistence import Token, Tokens
        >>>
        >>> tokens = Tokens()
        >>> tokens.append(Token("foo"))
        >>> tokens.extend([Token(" "), Token("bar")])
        >>>
        >>> tokens[0]
        Token(text='foo', revisions=[])
        >>>
        >>> "".join(tokens.texts())
        'foo bar'
    """
    # NOTE: the original defined an __init__ that only called
    # super().__init__ -- it was redundant and has been removed.

    def persist(self, revision):
        """Record `revision` against every token in the list."""
        for token in self:
            token.persist(revision)

    def texts(self):
        """Yield the text of each token, in order."""
        for token in self:
            yield token.text

    def compare(self, new, diff):
        """
        Diff this token list against a new sequence of token texts.

        :Parameters:
            new : list( `str` )
                the token texts of the new version
            diff : function(old, new) --> list( `ops` )
                a difference function

        :Returns:
            (tokens, tokens_added, tokens_removed) -- see :meth:`apply_diff`
        """
        old = self.texts()
        return self.apply_diff(diff(old, new), self, new)

    @classmethod
    def apply_diff(cls, ops, old, new):
        """
        Apply diff operations, carrying tokens over from `old` and minting
        fresh :class:`Token` for newly inserted text.

        :Returns:
            (tokens, tokens_added, tokens_removed) triple of `cls` instances
        """
        tokens = cls()
        tokens_added = cls()
        tokens_removed = cls()

        for code, a_start, a_end, b_start, b_end in ops:
            if code == "insert" or code == "replace":
                # New text appears: mint fresh tokens for it.
                # (The two branches had duplicate bodies in the original.)
                for token_text in new[b_start:b_end]:
                    token = Token(token_text)
                    tokens.append(token)
                    tokens_added.append(token)

                if code == "replace":
                    # The replaced old tokens are removed.
                    tokens_removed.extend(old[a_start:a_end])

            elif code == "equal":
                # Unchanged tokens carry over, preserving their history.
                tokens.extend(old[a_start:a_end])
            elif code == "delete":
                tokens_removed.extend(old[a_start:a_end])
            else:
                # BUGFIX: corrected "encounted" typo in the failure message.
                assert False, \
                    "encountered an unrecognized operation code: " + \
                    repr(code)

        return (tokens, tokens_added, tokens_removed)
|
||||
@@ -0,0 +1,24 @@
|
||||
"""
|
||||
This module provides a set of utilities for detecting identity reverts in
|
||||
revisioned content.
|
||||
|
||||
To detect reverts in a stream of revisions to a single page, you can use
|
||||
:func:`detect`. If you'll be detecting reverts in a collection of pages or
|
||||
would, for some other reason, prefer to process revisions one at a time,
|
||||
:class:`Detector` and it's :meth:`~Detector.process` will allow you to do so.
|
||||
|
||||
To detect reverts one-at-time and arbitrarily, you can user the `check()`
|
||||
functions:
|
||||
|
||||
* :func:`database.check` and :func:`database.check_row` use a :class:`mw.database.DB`
|
||||
* :func:`api.check` and :func:`api.check_rev` use a :class:`mw.api.Session`
|
||||
|
||||
Note that these functions are less performant than detecting reverts in a
|
||||
stream of page revisions. This can be practical when trying to identify
|
||||
reverted revisions in a user's contribution history.
|
||||
"""
|
||||
from .detector import Detector, Revert
|
||||
from .functions import detect, reverts
|
||||
from . import database
|
||||
from . import api
|
||||
from . import defaults
|
||||
134
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/api.py
Normal file
134
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/reverts/api.py
Normal file
@@ -0,0 +1,134 @@
|
||||
from itertools import chain
|
||||
|
||||
from . import defaults
|
||||
from ...types import Timestamp
|
||||
from ...util import none_or
|
||||
from .dummy_checksum import DummyChecksum
|
||||
from .functions import detect
|
||||
|
||||
|
||||
def check_rev(session, rev, **kwargs):
    """
    Checks whether a revision (API rev dict) was reverted (identity) and
    returns a named tuple of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev : dict
            a revision dict containing 'revid' and 'page.id'
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        before : :class:`mw.Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
        properties : set( str )
            a set of properties to include in revisions (see :class:`mw.api.Revisions`)
    """

    # Extract rev_id and page_id from the API revision dict.
    if 'revid' in rev:
        rev_id = rev['revid']
    else:
        # BUGFIX: the message claimed the missing key was 'rev_id', but the
        # key this function actually checks is 'revid'.
        raise TypeError("rev must have 'revid'")
    if 'page' in rev:
        page_id = rev['page']['id']
    elif 'pageid' in rev:
        page_id = rev['pageid']
    else:
        raise TypeError("rev must have 'page' or 'pageid'")

    # Run the regular check.
    return check(session, rev_id, page_id=page_id, **kwargs)
|
||||
|
||||
|
||||
def check(session, rev_id, page_id=None, radius=defaults.RADIUS,
          before=None, window=None, properties=None):
    """
    Checks whether a revision was reverted (identity) and returns a named
    tuple of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        session : :class:`mw.api.Session`
            An API session to make use of
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        radius : int
            a positive integer indicating the maximum number of revisions
            that can be reverted
        before : :class:`mw.Timestamp`
            if set, limits the search for *reverting* revisions to those
            which were saved before this timestamp
        window : int
            if set, limits the search for *reverting* revisions to those
            which were saved within `window` seconds after the reverted edit
        properties : set( str )
            a set of properties to include in revisions (see :class:`mw.api.Revisions`)
    """

    if not hasattr(session, "revisions"):
        raise TypeError("session wrong type. Expected a mw.api.Session.")

    rev_id = int(rev_id)
    radius = int(radius)
    if radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")

    page_id = none_or(page_id, int)
    before = none_or(before, Timestamp)
    properties = set(properties) if properties is not None else set()

    # Look up the page_id from the revision if the caller didn't supply it.
    if page_id is None:
        rev = session.revisions.get(rev_id, properties={'ids'})
        page_id = rev['page']['pageid']

    rev_props = {'ids', 'timestamp', 'sha1'} | properties

    # Load the current revision plus up to `radius` revisions before it.
    history = list(session.revisions.query(
        pageids={page_id},
        limit=radius + 1,
        start_id=rev_id,
        direction="older",
        properties=rev_props
    ))

    try:
        current_rev = history[0]  # Current rev is the first one returned
        past_revs = reversed(history[1:])  # Reorder chronologically
    except IndexError:
        # Not enough history -- this revision couldn't have been reverted.
        return None

    if window is not None and before is None:
        before = Timestamp(current_rev['timestamp']) + window

    # Load up to `radius` revisions after the revision of interest.
    future_revs = session.revisions.query(
        pageids={page_id},
        limit=radius,
        start_id=rev_id + 1,  # Ensures that we skip the current revision
        end=before,
        direction="newer",
        properties=rev_props
    )

    def pair(rev):
        # (checksum, rev) pair; substitute a DummyChecksum when sha1 is
        # unavailable (e.g. suppressed revisions).
        return (rev['sha1'] if 'sha1' in rev else DummyChecksum(), rev)

    # Build the stream that detect() consumes.
    checksum_revisions = chain(
        (pair(rev) for rev in past_revs),
        [(current_rev.get('sha1', DummyChecksum()), current_rev)],
        (pair(rev) for rev in future_revs),
    )

    for revert in detect(checksum_revisions, radius=radius):
        # Only report the revert if it actually reverted `rev_id`.
        if rev_id in [rev['revid'] for rev in revert.reverteds]:
            return revert

    return None
|
||||
@@ -0,0 +1,148 @@
|
||||
import random
|
||||
from itertools import chain
|
||||
|
||||
from . import defaults
|
||||
from ...types import Timestamp
|
||||
from ...util import none_or
|
||||
from .dummy_checksum import DummyChecksum
|
||||
from .functions import detect
|
||||
|
||||
# Alphabet used to build fake sha1 checksums.
HEX = "1234567890abcdef"


def random_sha1():
    """Return a random 40-character hex string shaped like a sha1 digest."""
    return ''.join(random.choice(HEX) for _ in range(40))
|
||||
|
||||
"""
|
||||
Simple constant used in order to not do weird things with a dummy revision.
|
||||
"""
|
||||
|
||||
|
||||
def check_row(db, rev_row, **kwargs):
    """
    Checks whether a revision (database row) was reverted (identity) and
    returns a named tuple of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        db : :class:`mw.database.DB`
            A database connection to make use of.
        rev_row : dict
            a revision row containing 'rev_id' and 'rev_page' or 'page_id'
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        check_archive : bool
            should the archive table be checked for reverting revisions?
        before : `Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
    """

    # The revision ID is required.
    if 'rev_id' not in rev_row:
        raise TypeError("rev_row must have 'rev_id'")
    rev_id = rev_row['rev_id']

    # The page ID may appear under either of two keys.
    if 'page_id' in rev_row:
        page_id = rev_row['page_id']
    elif 'rev_page' in rev_row:
        page_id = rev_row['rev_page']
    else:
        raise TypeError("rev_row must have 'page_id' or 'rev_page'")

    # Delegate to the regular check.
    return check(db, rev_id, page_id=page_id, **kwargs)
|
||||
|
||||
|
||||
def check(db, rev_id, page_id=None, radius=defaults.RADIUS, check_archive=False,
          before=None, window=None):
    """
    Checks whether a revision was reverted (identity) and returns a named
    tuple of Revert(reverting, reverteds, reverted_to).

    :Parameters:
        db : `mw.database.DB`
            A database connection to make use of.
        rev_id : int
            the ID of the revision to check
        page_id : int
            the ID of the page the revision occupies (slower if not provided)
        radius : int
            a positive integer indicating the maximum number of revisions that can be reverted
        check_archive : bool
            should the archive table be checked for reverting revisions?
        before : `Timestamp`
            if set, limits the search for *reverting* revisions to those which were saved before this timestamp
        window : int
            if set, limits the search for *reverting* revisions to those which
            were saved within `window` seconds after the reverted edit
    """

    # BUGFIX: the original condition was
    #   `not hasattr(db, "revisions") and hasattr(db, "all_revisions")`
    # which, due to precedence, almost never fired. The intent is to require
    # *both* attributes before proceeding.
    if not (hasattr(db, "revisions") and hasattr(db, "all_revisions")):
        raise TypeError("db wrong type. Expected a mw.database.DB.")

    rev_id = int(rev_id)
    radius = int(radius)
    if radius < 1:
        raise TypeError("invalid radius. Expected a positive integer.")
    page_id = none_or(page_id, int)
    check_archive = bool(check_archive)
    before = none_or(before, Timestamp)

    # If we are searching the archive, we'll need to use `all_revisions`.
    dbrevs = db.all_revisions if check_archive else db.revisions

    # If we don't have the page_id, we're going to need to look it up.
    if page_id is None:
        row = dbrevs.get(rev_id=rev_id)
        page_id = row['rev_page']

    # Load history and current rev.
    current_and_past_revs = list(dbrevs.query(
        page_id=page_id,
        limit=radius + 1,
        before_id=rev_id + 1,  # Ensures that we capture the current revision
        direction="older"
    ))

    try:
        # Extract current rev and reorder history chronologically.
        current_rev, past_revs = (
            current_and_past_revs[0],  # Current rev is the first one returned
            reversed(current_and_past_revs[1:])  # Past revs, oldest first
        )
    except IndexError:
        # Only way to get here is if there isn't enough history. Couldn't be
        # reverted. Just return None.
        return None

    if window is not None and before is None:
        before = Timestamp(current_rev['rev_timestamp']) + window

    # Load future revisions.
    future_revs = dbrevs.query(
        page_id=page_id,
        limit=radius,
        after_id=rev_id,
        before=before,
        direction="newer"
    )

    def _pair(rev):
        # (checksum, rev) pair; substitute a DummyChecksum when the sha1
        # column is NULL (e.g. suppressed revisions).
        rev_sha1 = rev['rev_sha1']
        return (rev_sha1 if rev_sha1 is not None else DummyChecksum(), rev)

    # Convert to an iterable of (checksum, rev) pairs for detect() to consume.
    checksum_revisions = chain(
        (_pair(rev) for rev in past_revs),
        [(current_rev['rev_sha1'] or DummyChecksum(), current_rev)],
        (_pair(rev) for rev in future_revs)
    )

    for revert in detect(checksum_revisions, radius=radius):
        # Check that this is a relevant revert.
        if rev_id in [rev['rev_id'] for rev in revert.reverteds]:
            return revert

    return None
|
||||
@@ -0,0 +1,24 @@
|
||||
RADIUS = 15
|
||||
"""
|
||||
TODO: Better documentation here. For the time being, see:
|
||||
|
||||
Priedhorsky, R., Chen, J., Lam, S. T. K., Panciera, K., Terveen, L., &
|
||||
Riedl, J. (2007, November). Creating, destroying, and restoring value in
|
||||
Wikipedia. In Proceedings of the 2007 international ACM conference on
|
||||
Supporting group work (pp. 259-268). ACM.
|
||||
"""
|
||||
|
||||
|
||||
class DUMMY_SHA1:
    """
    Used when checking for reverts when the checksum of the revision of
    interest is unknown.  The class object itself acts as the sentinel:
    it never equals a real 40-character hex digest, always equals itself,
    and is hashable.

    >>> DUMMY_SHA1 in {"aaa", "bbb"}  # or any 40 character hex
    False
    >>>
    >>> DUMMY_SHA1 == DUMMY_SHA1
    True
    """
    pass
|
||||
@@ -0,0 +1,83 @@
|
||||
from collections import namedtuple
|
||||
|
||||
from ...util import ordered
|
||||
from . import defaults
|
||||
|
||||
Revert = namedtuple("Revert", ["reverting", "reverteds", "reverted_to"])
"""
Represents a revert event.  This class behaves like
:class:`collections.namedtuple`.  The datatypes of `reverting`, `reverteds`
and `reverted_to` are deliberately unspecified since they depend on the
revision data supplied during revert detection.

:Members:
    **reverting**
        The reverting revision data : `mixed`
    **reverteds**
        The reverted revision data (ordered chronologically) : list( `mixed` )
    **reverted_to**
        The reverted-to revision data : `mixed`
"""
|
||||
|
||||
|
||||
class Detector(ordered.HistoricalMap):
    """
    Detects revert events in a stream of revisions (to the same page) based
    on matching checksums.  Construct an instance and feed it revisions via
    :meth:`process` in chronological order (``direction == "newer"``).

    See `<https://meta.wikimedia.org/wiki/R:Identity_revert>`_

    :Parameters:
        radius : int
            a positive integer indicating the maximum revision distance that a revert can span.

    :Example:
        >>> from mw.lib import reverts
        >>> detector = reverts.Detector()
        >>>
        >>> detector.process("aaa", {'rev_id': 1})
        >>> detector.process("bbb", {'rev_id': 2})
        >>> detector.process("aaa", {'rev_id': 3})
        Revert(reverting={'rev_id': 3}, reverteds=[{'rev_id': 2}], reverted_to={'rev_id': 1})
        >>> detector.process("ccc", {'rev_id': 4})

    """

    def __init__(self, radius=defaults.RADIUS):
        """
        :Parameters:
            radius : int
                a positive integer indicating the maximum revision distance that a revert can span.
        """
        if radius < 1:
            raise TypeError("invalid radius. Expected a positive integer.")
        # Keep radius+1 checksums around: the current one plus `radius` back.
        super().__init__(maxlen=radius + 1)

    def process(self, checksum, revision=None):
        """
        Process a new revision and detect a revert if it occurred.  Note that
        you can pass whatever you like as `revision` and it will be returned
        in the case that a revert occurs.

        :Parameters:
            checksum : str
                Any identity-matchable string-based hash of revision content
            revision : `mixed`
                Revision meta data, returned as-is inside a detected revert.

        :Returns:
            a :class:`~mw.lib.reverts.Revert` if one occurred or `None`
        """
        revert = None

        if checksum in self:  # Seen this content before: potential revert
            intervening = list(self.up_to(checksum))

            # With no intervening revisions this is just a no-op edit.
            if intervening:
                revert = Revert(revision, intervening, self[checksum])

        self.insert(checksum, revision)
        return revert
|
||||
@@ -0,0 +1,24 @@
|
||||
class DummyChecksum():
    """
    Stands in for a checksum when the real one is unknown.  DummyChecksums
    won't match each other or anything else, but they will match themselves
    and they are hashable (identity-based).

    >>> dummy1 = DummyChecksum()
    >>> dummy1 == dummy1
    True
    >>>
    >>> dummy2 = DummyChecksum()
    >>> dummy1 == dummy2
    False
    >>>
    >>> len({"foo", "bar", dummy1, dummy1, dummy2})
    4
    """

    def __str__(self):
        # BUGFIX: the original body was `repr(self)` with no `return`, so
        # __str__ returned None and str() raised TypeError.
        return repr(self)

    def __repr__(self):
        return "<#" + str(id(self)) + ">"
|
||||
@@ -0,0 +1,46 @@
|
||||
from .detector import Detector
|
||||
from . import defaults
|
||||
|
||||
|
||||
def detect(checksum_revisions, radius=defaults.RADIUS):
    """
    Detects reverts that occur in a sequence of revisions.  Note that
    `revision` meta data will simply be returned in the case of a revert.

    This function serves as a convenience wrapper around calls to
    :class:`Detector`'s :meth:`~Detector.process`
    method.

    :Parameters:
        checksum_revisions : iter( ( checksum : str, revision : `mixed` ) )
            an iterable over tuples of checksum and revision meta data
        radius : int
            a positive integer indicating the maximum revision distance that a revert can span.

    :Return:
        a iterator over :class:`Revert`

    :Example:
        >>> from mw.lib import reverts
        >>>
        >>> checksum_revisions = [
        ...     ("aaa", {'rev_id': 1}),
        ...     ("bbb", {'rev_id': 2}),
        ...     ("aaa", {'rev_id': 3}),
        ...     ("ccc", {'rev_id': 4})
        ... ]
        >>>
        >>> list(reverts.detect(checksum_revisions))
        [Revert(reverting={'rev_id': 3}, reverteds=[{'rev_id': 2}], reverted_to={'rev_id': 1})]

    """
    detector = Detector(radius)

    # Push every (checksum, revision) pair through the detector and pass
    # along only those events where a revert was actually detected.
    for checksum, revision in checksum_revisions:
        outcome = detector.process(checksum, revision)
        if outcome is not None:
            yield outcome


# For backwards compatibility
reverts = detect
|
||||
@@ -0,0 +1,33 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from ..detector import Detector
|
||||
|
||||
|
||||
def test_detector():
    d = Detector(2)

    # First sighting of "a" -- nothing to revert to yet.
    eq_(d.process("a", {'id': 1}), None)

    # Identity save (noop): same checksum with no intervening revisions.
    eq_(d.process("a", {'id': 2}), None)

    # Revert spanning a single intervening revision.
    eq_(d.process("b", {'id': 3}), None)
    expected_short = ({'id': 4}, [{'id': 3}], {'id': 2})
    eq_(d.process("a", {'id': 4}), expected_short)

    # Revert spanning two intervening revisions (the full radius).
    eq_(d.process("c", {'id': 5}), None)
    eq_(d.process("d", {'id': 6}), None)
    expected_medium = ({'id': 7}, [{'id': 6}, {'id': 5}], {'id': 4})
    eq_(d.process("a", {'id': 7}), expected_medium)

    # Three intervening revisions exceed radius=2 -- no revert detected.
    eq_(d.process("e", {'id': 8}), None)
    eq_(d.process("f", {'id': 9}), None)
    eq_(d.process("g", {'id': 10}), None)
    eq_(d.process("a", {'id': 11}), None)
|
||||
@@ -0,0 +1,23 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from ..functions import reverts
|
||||
|
||||
|
||||
def test_reverts():
    # Two reverts hide inside this history: rev 4 reverts back to rev 1,
    # and rev 7 reverts back to rev 4.
    checksum_revisions = [
        ("a", {'id': 1}),
        ("b", {'id': 2}),
        ("c", {'id': 3}),
        ("a", {'id': 4}),
        ("d", {'id': 5}),
        ("b", {'id': 6}),
        ("a", {'id': 7})
    ]

    expected = [
        ({'id': 4}, [{'id': 3}, {'id': 2}], {'id': 1}),
        ({'id': 7}, [{'id': 6}, {'id': 5}], {'id': 4})
    ]

    detected = list(reverts(checksum_revisions, radius=2))

    # Bug fix: the original only popped `expected` inside the loop, so the
    # test passed vacuously when no reverts were detected at all.  Assert
    # the count first so a silent zero-detection run fails.
    eq_(len(detected), len(expected))
    for revert, expectation in zip(detected, expected):
        eq_(revert, expectation)
|
||||
@@ -0,0 +1,4 @@
|
||||
from .functions import cluster, sessions
|
||||
from .event import Event
|
||||
from .cache import Cache, Session
|
||||
from . import defaults
|
||||
@@ -0,0 +1,121 @@
|
||||
import logging
|
||||
from collections import namedtuple
|
||||
|
||||
from ...util import Heap
|
||||
from ...types import Timestamp
|
||||
from . import defaults
|
||||
from .event import Event, unpack_events
|
||||
|
||||
|
||||
logger = logging.getLogger("mw.lib.sessions.cache")
|
||||
|
||||
# The bare string below is a module-level "docstring" that Sphinx attaches
# to the `Session` namedtuple.
Session = namedtuple("Session", ["user", "events"])
"""
Represents a user session (a cluster over events for a user).  This class
behaves like :class:`collections.namedtuple`.  Note that the datatype of
`events` is not specified, since those types will depend on the event data
provided during clustering.

:Members:
    **user**
        A hashable user identifier : `hashable`
    **events**
        A list of event data : list( `mixed` )
"""
|
||||
|
||||
|
||||
class Cache:
    """
    A cache of recent user sessions.  Since sessions expire once activities
    stop for at least `cutoff` seconds, this class manages a cache of
    *active* sessions.

    :Parameters:
        cutoff : int
            Maximum amount of time in seconds between session events

    :Example:
        >>> from mw.lib import sessions
        >>>
        >>> cache = sessions.Cache(cutoff=3600)
        >>>
        >>> list(cache.process("Willy on wheels", 100000, {'rev_id': 1}))
        []
        >>> list(cache.process("Walter", 100001, {'rev_id': 2}))
        []
        >>> list(cache.process("Willy on wheels", 100001, {'rev_id': 3}))
        []
        >>> list(cache.process("Walter", 100035, {'rev_id': 4}))
        []
        >>> list(cache.process("Willy on wheels", 103602, {'rev_id': 5}))
        [Session(user='Willy on wheels', events=[{'rev_id': 1}, {'rev_id': 3}])]
        >>> list(cache.get_active_sessions())
        [Session(user='Walter', events=[{'rev_id': 2}, {'rev_id': 4}]), Session(user='Willy on wheels', events=[{'rev_id': 5}])]

    """

    def __init__(self, cutoff=defaults.CUTOFF):
        self.cutoff = int(cutoff)

        # recently_active: heap of (timestamp-at-insertion, events) used to
        # find expiration candidates cheaply.  active_users: user -> the
        # (shared, mutable) event list of that user's current session.
        self.recently_active = Heap()
        self.active_users = {}

    def process(self, user, timestamp, data=None):
        """
        Processes a user event.

        :Parameters:
            user : `hashable`
                A hashable value to identify a user (`int` or `str` are OK)
            timestamp : :class:`mw.Timestamp`
                The timestamp of the event
            data : `mixed`
                Event meta data

        :Returns:
            A generator of :class:`~mw.lib.sessions.Session` expired after
            processing the user event.
        """
        event = Event(user, Timestamp(timestamp), data)

        # First, emit any sessions that this event's timestamp expires.
        # (Renamed from `user` to avoid shadowing the parameter above.)
        for expired_user, events in self._clear_expired(event.timestamp):
            yield Session(expired_user, unpack_events(events))

        # Apply the event: append to the user's current session, starting
        # (and registering) a new one if the user has no active session.
        if event.user in self.active_users:
            events = self.active_users[event.user]
        else:
            events = []
            self.active_users[event.user] = events
            self.recently_active.push((event.timestamp, events))

        events.append(event)

    def get_active_sessions(self):
        """
        Retrieves the active, unexpired sessions.

        :Returns:
            A generator of :class:`~mw.lib.sessions.Session`

        """
        for last_timestamp, events in self.recently_active:
            yield Session(events[-1].user, unpack_events(events))

    def _clear_expired(self, timestamp):
        # Cull old sessions.  The heap key is the timestamp at which the
        # session was (re)inserted, which can be stale, so each candidate's
        # true last event must be re-checked before expiring it.
        while (len(self.recently_active) > 0 and
               timestamp - self.recently_active.peek()[0] >= self.cutoff):

            _, events = self.recently_active.pop()

            if timestamp - events[-1].timestamp >= self.cutoff:
                # Truly expired: drop the user and report (user, events).
                del self.active_users[events[-1].user]
                yield events[-1].user, events
            else:
                # Still active: re-insert keyed by its real last activity.
                self.recently_active.push((events[-1].timestamp, events))

    def __repr__(self):
        # Bug fix: the original mixed %-style placeholders with .format()
        # ("%s(%s)".format(...)), which returned the literal "%s(%s)".
        return "{0}({1})".format(self.__class__.__name__, repr(self.cutoff))
|
||||
@@ -0,0 +1,6 @@
|
||||
# Default inactivity cutoff between session events: one hour, in seconds.
CUTOFF = 60 * 60
"""
TODO: Better documentation here.
For the time being, see
`<https://meta.wikimedia.org/wiki/Research:Edit_session>`_
"""
|
||||
@@ -0,0 +1,19 @@
|
||||
import logging
|
||||
from collections import namedtuple
|
||||
|
||||
logger = logging.getLogger("mw.lib.sessions.event")


# A single user action.  `user` is a hashable identifier, `timestamp` a
# :class:`mw.Timestamp` and `data` is opaque per-event meta data that is
# carried along and handed back by `unpack_events()`.
# (A commented-out class-based implementation was removed as dead code.)
Event = namedtuple("Event", ['user', 'timestamp', 'data'])


def unpack_events(events):
    """Return the `data` payload of each event, in order, as a list."""
    # Idiom: list comprehension instead of list(generator expression).
    return [e.data for e in events]
|
||||
@@ -0,0 +1,68 @@
|
||||
import logging
|
||||
|
||||
from .cache import Cache
|
||||
from . import defaults
|
||||
|
||||
logger = logging.getLogger("mw.lib.sessions.functions")
|
||||
|
||||
|
||||
def cluster(user_events, cutoff=defaults.CUTOFF):
    """
    Clusters user sessions from a sequence of user events.  Note that
    `event` data will simply be carried along and returned inside the
    resulting sessions.

    This function serves as a convenience wrapper around calls to
    :class:`~mw.lib.sessions.Cache`'s :meth:`~mw.lib.sessions.Cache.process`
    method.

    :Parameters:
        user_events : iter( (user, timestamp, event) )
            an iterable over tuples of user, timestamp and event data.

            * user : `hashable`
            * timestamp : :class:`mw.Timestamp`
            * event : `mixed`

        cutoff : int
            the maximum time between events within a user session

    :Returns:
        a iterator over :class:`~mw.lib.sessions.Session`

    :Example:
        >>> from mw.lib import sessions
        >>>
        >>> user_events = [
        ...     ("Willy on wheels", 100000, {'rev_id': 1}),
        ...     ("Walter", 100001, {'rev_id': 2}),
        ...     ("Willy on wheels", 100001, {'rev_id': 3}),
        ...     ("Walter", 100035, {'rev_id': 4}),
        ...     ("Willy on wheels", 103602, {'rev_id': 5})
        ... ]
        >>>
        >>> for user, events in sessions.cluster(user_events):
        ...     (user, events)
        ...
        ('Willy on wheels', [{'rev_id': 1}, {'rev_id': 3}])
        ('Walter', [{'rev_id': 2}, {'rev_id': 4}])
        ('Willy on wheels', [{'rev_id': 5}])


    """
    cache = Cache(cutoff)

    # Feed each event through the cache; sessions are yielded as soon as
    # the cache decides they have expired.
    for user, timestamp, event in user_events:
        yield from cache.process(user, timestamp, event)

    # Whatever is still active once the events run out is a session too.
    yield from cache.get_active_sessions()


# For backwards compatibility
sessions = cluster
|
||||
@@ -0,0 +1,22 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from ..cache import Cache
|
||||
|
||||
|
||||
def test_session_manager():
    cache = Cache(cutoff=2)

    # Three events inside the cutoff window -- nothing expires yet.
    for user, timestamp in [("foo", 1), ("bar", 2), ("foo", 2)]:
        eq_(list(cache.process(user, timestamp)), [])

    # A large jump in time expires both users' sessions at once.
    eq_(len(list(cache.process("bar", 10))), 2)

    # Only the session started by the event above is still active.
    eq_(len(list(cache.get_active_sessions())), 1)
|
||||
@@ -0,0 +1,50 @@
|
||||
from itertools import chain
|
||||
|
||||
from nose.tools import eq_
|
||||
from .. import defaults
|
||||
from ..functions import sessions
|
||||
|
||||
|
||||
# Expected session fixture, keyed by user.  Each inner list is one
# session's events as (user, timestamp, data) tuples; each user's second
# session starts a full `defaults.CUTOFF` after the first one, so the
# clustering code must split them.
EVENTS = {
    "foo": [
        [
            ("foo", 1234567890, 1),
            ("foo", 1234567892, 2),
            ("foo", 1234567894, 3)
        ],
        [
            ("foo", 1234567894 + defaults.CUTOFF, 4),
            ("foo", 1234567897 + defaults.CUTOFF, 5)
        ]
    ],
    "bar": [
        [
            ("bar", 1234567891, 6),
            ("bar", 1234567892, 7),
            ("bar", 1234567893, 8)
        ],
        [
            ("bar", 1234567895 + defaults.CUTOFF, 9),
            ("bar", 1234567898 + defaults.CUTOFF, 0)
        ]
    ]
}
|
||||
|
||||
|
||||
def test_group_events():
    # Interleave both users' events into a single timestamp-ordered stream.
    events = sorted(
        chain(chain.from_iterable(EVENTS['foo']),
              chain.from_iterable(EVENTS['bar']))
    )

    # Track how many sessions we have seen per user so each detected
    # session can be compared against the matching fixture session.
    counts = {'foo': 0, 'bar': 0}

    for user, session in sessions(events):
        expected_data = list(e[2] for e in EVENTS[user][counts[user]])
        eq_(expected_data, list(session))
        counts[user] += 1
|
||||
@@ -0,0 +1,2 @@
|
||||
from .functions import normalize
|
||||
from .parser import Parser
|
||||
@@ -0,0 +1,25 @@
|
||||
def normalize(title):
    """
    Normalizes a page title to the database format.  E.g. spaces are
    converted to underscores and the first character in the title is
    converted to upper-case.

    :Parameters:
        title : str
            A page title
    :Returns:
        The normalized title.
    :Example:
        >>> from mw.lib import title
        >>>
        >>> title.normalize("foo bar")
        'Foo_bar'

    """
    # `None` passes straight through so callers may normalize optional
    # titles without checking first.
    if title is None:
        return None
    if not title:
        return ""
    # Only the first character is upper-cased; the rest of the title keeps
    # its case (unlike str.capitalize(), which lower-cases the remainder).
    return (title[0].upper() + title[1:]).replace(" ", "_")
|
||||
171
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/parser.py
Normal file
171
mediawiki_dump_tools/Mediawiki-Utilities/mw/lib/title/parser.py
Normal file
@@ -0,0 +1,171 @@
|
||||
from ...types import Namespace
|
||||
from ...util import autovivifying, none_or
|
||||
from .functions import normalize
|
||||
|
||||
|
||||
class Parser:
    """
    Constructs a page name parser from a set of :class:`mw.Namespace`.  Such a
    parser can be used to convert a full page name (namespace included with a
    colon; e.g, ``"Talk:Foo"``) into a namespace ID and
    :func:`mw.lib.title.normalize`'d page title (e.g., ``(1, "Foo")``).

    :Parameters:
        namespaces : set( :class:`mw.Namespace` )
    :Example:
        >>> from mw import Namespace
        >>> from mw.lib import title
        >>>
        >>> parser = title.Parser(
        ...     [
        ...         Namespace(0, "", case="first-letter"),
        ...         Namespace(1, "Discuss\u00e3o", canonical="Talk", case="first-letter"),
        ...         Namespace(2, "Usu\u00e1rio(a)", canonical="User", aliases={"U"}, case="first-letter")
        ...     ]
        ... )
        >>>
        >>> parser.parse("Discuss\u00e3o:Foo")  # Using the standard name
        (1, 'Foo')
        >>> parser.parse("Talk:Foo bar")  # Using the canonical name
        (1, 'Foo_bar')
        >>> parser.parse("U:Foo bar")  # Using an alias
        (2, 'Foo_bar')
        >>> parser.parse("Herpderp:Foo bar")  # Pseudo namespace
        (0, 'Herpderp:Foo_bar')
    """

    def __init__(self, namespaces=None):
        # Coerce to a set when provided; leaves None as None.
        namespaces = none_or(namespaces, set)

        # Lookup tables: namespace ID -> Namespace, and normalized
        # name/alias/canonical-name -> Namespace.
        self.ids = {}
        self.names = {}

        if namespaces is not None:
            for namespace in namespaces:
                self.add_namespace(namespace)

    def parse(self, page_name):
        """
        Parses a page name to extract the namespace.

        :Parameters:
            page_name : str
                A page name including the namespace prefix and a colon (if not Main)

        :Returns:
            A tuple of (namespace : `int`, title : `str`)
        """
        # Split off at most one prefix; titles may legally contain colons.
        parts = page_name.split(":", 1)
        if len(parts) == 1:
            # No colon at all -- Main namespace (ID 0).
            ns_id = 0
            title = normalize(page_name)
        else:
            ns_name, title = parts
            ns_name, title = normalize(ns_name), normalize(title)

            if self.contains_name(ns_name):
                ns_id = self.get_namespace(name=ns_name).id
            else:
                # Unknown prefix (pseudo namespace): treat the whole page
                # name, colon included, as a Main-namespace title.
                ns_id = 0
                title = normalize(page_name)

        return ns_id, title

    def add_namespace(self, namespace):
        """
        Adds a namespace to the parser.

        :Parameters:
            namespace : :class:`mw.Namespace`
                A namespace
        """
        self.ids[namespace.id] = namespace
        self.names[namespace.name] = namespace

        # Aliases and the canonical (English) name resolve to the same
        # namespace as the standard name.
        for alias in namespace.aliases:
            self.names[alias] = namespace

        if namespace.canonical is not None:
            self.names[namespace.canonical] = namespace

    def contains_name(self, name):
        # True if `name` (after normalization) is a known namespace
        # name, alias or canonical name.
        return normalize(name) in self.names

    def get_namespace(self, id=None, name=None):
        """
        Gets a namespace from the parser.  Throws a :class:`KeyError` if a
        namespace cannot be found.

        :Parameters:
            id : int
                A namespace ID
            name : str
                A namespace name (standard, canonical names and aliases
                will be searched)
        :Returns:
            A :class:`mw.Namespace`.
        """
        # `id` takes precedence when both are given.
        if id is not None:
            return self.ids[int(id)]
        else:
            return self.names[normalize(name)]

    @classmethod
    def from_site_info(cls, si_doc):
        """
        Constructs a parser from the result of a :meth:`mw.api.SiteInfo.query`.

        :Parameters:
            si_doc : dict
                The result of a site_info request.

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        # Collect alias names per namespace ID; missing keys vivify to [].
        aliases = autovivifying.Dict(vivifier=lambda k: [])
        # get aliases
        if 'namespacealiases' in si_doc:
            for alias_doc in si_doc['namespacealiases']:
                aliases[alias_doc['id']].append(alias_doc['*'])

        namespaces = []
        for ns_doc in si_doc['namespaces'].values():
            namespaces.append(
                Namespace.from_doc(ns_doc, aliases)
            )

        # NOTE(review): returns Parser(...) rather than cls(...), so a
        # subclass calling from_site_info gets a plain Parser back --
        # confirm whether this is intentional.
        return Parser(namespaces)

    @classmethod
    def from_api(cls, session):
        """
        Constructs a parser from a :class:`mw.api.Session`

        :Parameters:
            session : :class:`mw.api.Session`
                An open API session

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        si_doc = session.site_info.query(
            properties={'namespaces', 'namespacealiases'}
        )

        return cls.from_site_info(si_doc)

    @classmethod
    def from_dump(cls, dump):
        """
        Constructs a parser from a :class:`mw.xml_dump.Iterator`.  Note that
        XML database dumps do not include namespace aliases or canonical
        names, so the parser that will be constructed will only work in
        common cases.

        :Parameters:
            dump : :class:`mw.xml_dump.Iterator`
                An XML dump iterator

        :Returns:
            An initialized :class:`mw.lib.title.Parser`
        """
        return cls(dump.namespaces)
|
||||
@@ -0,0 +1,10 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from ..functions import normalize
|
||||
|
||||
|
||||
def test_normalize():
    # (expected, raw) pairs exercising each normalization rule.
    cases = [
        ("Foobar", "Foobar"),   # already normalized
        ("Foobar", "foobar"),   # first letter capitalized
        ("FooBar", "fooBar"),   # interior capitals preserved
        ("Foo_bar", "Foo bar")  # spaces become underscores
    ]
    for expected, raw in cases:
        eq_(expected, normalize(raw))
||||
@@ -0,0 +1,58 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from ....types import Namespace
|
||||
from ..parser import Parser
|
||||
|
||||
|
||||
def test_simple():
    # Parser built directly from Namespace objects (no aliases here).
    namespaces = [
        Namespace(0, "", case="first-letter"),
        Namespace(1, "Discuss\u00e3o", canonical="Talk", case="first-letter"),
        Namespace(2, "Usu\u00e1rio(a)", canonical="User", case="first-letter")
    ]
    parser = Parser(namespaces)

    # Standard name, name with a space, and an unknown (pseudo) namespace.
    eq_((1, "Foo"), parser.parse("Discuss\u00e3o:Foo"))
    eq_((1, "Foo_bar"), parser.parse("Discuss\u00e3o:Foo bar"))
    eq_((0, "Herpderp:Foo_bar"), parser.parse("Herpderp:Foo bar"))
|
||||
|
||||
|
||||
def test_from_site_info():
    # Build a parser from a trimmed-down API siteinfo result: three
    # namespaces plus one alias ("WAFFLES" -> namespace 1).
    parser = Parser.from_site_info(
        {
            "namespaces": {
                "0": {
                    "id": 0,
                    "case": "first-letter",
                    "*": "",
                    "content": ""
                },
                "1": {
                    "id": 1,
                    "case": "first-letter",
                    "*": "Discuss\u00e3o",
                    "subpages": "",
                    "canonical": "Talk"
                },
                "2": {
                    "id": 2,
                    "case": "first-letter",
                    "*": "Usu\u00e1rio(a)",
                    "subpages": "",
                    "canonical": "User"
                }
            },
            "namespacealiases": [
                {
                    "id": 1,
                    "*": "WAFFLES"
                }
            ]
        }
    )

    # Standard name, spaced title, pseudo-namespace fallback, and alias.
    eq_((1, "Foo"), parser.parse("Discuss\u00e3o:Foo"))
    eq_((1, "Foo_bar"), parser.parse("Discuss\u00e3o:Foo bar"))
    eq_((0, "Herpderp:Foo_bar"), parser.parse("Herpderp:Foo bar"))
    eq_((1, "Foo_bar"), parser.parse("WAFFLES:Foo bar"))
||||
Reference in New Issue
Block a user