Initial commit
# new file: runwikiq.sh
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
from .functions import none_or
|
||||
from .heap import Heap
|
||||
53
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/api.py
Normal file
53
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/api.py
Normal file
@@ -0,0 +1,53 @@
|
||||
import logging
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from .functions import none_or
|
||||
|
||||
logger = logging.getLogger("mw.util.api.session")

# Default number of consecutive failures tolerated before the error is
# re-raised to the caller.
FAILURE_THRESHOLD = 5
# Default per-request timeout handed to ``requests`` (seconds).
TIMEOUT = 20


class Session:
    """
    Thin wrapper around ``requests.Session`` bound to a single URI that
    retries failed requests with an increasing delay.

    :Parameters:
        uri : str
            Address every request is sent to.  Must not be None.
        headers : dict
            Headers applied to the underlying ``requests`` session.
        timeout : float
            Per-request timeout; falls back to ``TIMEOUT`` when None.
        failure_threshold : int
            Number of consecutive failures tolerated before the exception
            is re-raised; falls back to ``FAILURE_THRESHOLD`` when falsy.
        wait_step : float
            Base of the back-off delay (see ``__sleep``).
    """

    def __init__(self, uri, headers=None, timeout=None,
                 failure_threshold=None, wait_step=2):
        if uri is None:
            raise TypeError("uri must not be None")

        self.uri = str(uri)
        self.headers = headers if headers is not None else {}
        self.session = requests.Session()
        # The headers used to be stored but never sent; attach them to the
        # underlying session so every request carries them.
        self.session.headers.update(self.headers)

        self.failure_threshold = int(failure_threshold or FAILURE_THRESHOLD)
        # Honour the caller's timeout (it used to be silently ignored).
        self.timeout = float(timeout if timeout is not None else TIMEOUT)
        self.wait_step = float(wait_step)

        # Count of consecutive failed requests.
        self.failed = 0

    def __sleep(self):
        """Pause for ``failed * wait_step ** failed`` seconds."""
        time.sleep(self.failed * (self.wait_step ** self.failed))

    def get(self, params, **kwargs):
        """Send a GET request; see :meth:`request`."""
        return self.request('GET', params, **kwargs)

    def post(self, params, **kwargs):
        """Send a POST request; see :meth:`request`."""
        return self.request('POST', params, **kwargs)

    def request(self, type, params, **kwargs):
        """
        Send a request of the given HTTP method to ``self.uri``.

        :Parameters:
            type : str
                HTTP method name, e.g. ``'GET'``.
            params : dict
                Passed as ``params`` to ``requests.Session.request``.
            kwargs
                Extra keyword arguments forwarded to
                ``requests.Session.request``.  ``timeout`` defaults to
                ``self.timeout`` unless given explicitly.

        :Returns:
            Whatever the underlying session returns for the request.

        :Raises:
            ``requests.HTTPError`` or ``requests.ConnectionError`` once more
            than ``failure_threshold`` consecutive attempts have failed.
        """
        kwargs.setdefault('timeout', self.timeout)

        # Loop rather than recurse so the response of a successful retry is
        # actually returned to the caller.
        while True:
            try:
                result = self.session.request(type, self.uri, params=params,
                                              **kwargs)
            except (requests.HTTPError, requests.ConnectionError):
                self.failed += 1

                if self.failed > self.failure_threshold:
                    self.failed = 0
                    raise

                logger.warning("Request to %s failed (%d consecutive "
                               "failures); retrying", self.uri, self.failed)
                self.__sleep()
            else:
                self.failed = 0
                return result
|
||||
@@ -0,0 +1,11 @@
|
||||
class Dict(dict):
    """
    A ``dict`` that creates missing entries on read.

    Looking up a key that is not present calls ``vivifier(key)``, stores the
    result under that key and returns it.  The default vivifier produces
    ``None``.
    """

    def __init__(self, *args, vivifier=lambda k: None, **kwargs):
        self.vivifier = vivifier

        dict.__init__(self, *args, **kwargs)

    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            # First access: build the value, remember it, hand it back.
            fresh = self.vivifier(key)
            dict.__setitem__(self, key, fresh)
            return fresh
|
||||
@@ -0,0 +1,21 @@
|
||||
def none_or(val, func=None, levels=None):
    """
    Convert or validate a value while letting ``None`` pass through.

    :Parameters:
        val
            The value to process.
        func : callable
            Applied to ``val`` when ``levels`` is not given.  When omitted,
            ``val`` is returned unchanged.
        levels : iterable
            When given, ``val`` must be one of these (hashable) values and is
            returned as-is; ``func`` is not used in that case.

    :Returns:
        ``None`` if ``val`` is None, otherwise the validated or converted
        value.

    :Raises:
        KeyError if ``levels`` is given and ``val`` is not among them.
    """
    if val is None:
        return None

    if levels is not None:
        if val not in set(levels):
            raise KeyError(val)
        return val

    # No converter supplied: previously this crashed calling ``None(val)``.
    if func is None:
        return val

    return func(val)
|
||||
|
||||
|
||||
def try_keys(dictionary, keys):
    """
    Return the value of the first key in ``keys`` found in ``dictionary``.

    :Raises:
        KeyError, with every attempted key joined by ``|``, when none of the
        keys is present.
    """
    misses = []
    for candidate in keys:
        if candidate not in dictionary:
            misses.append(candidate)
            continue
        return dictionary[candidate]

    raise KeyError("|".join(map(str, misses)))
|
||||
22
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/heap.py
Normal file
22
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/heap.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import heapq
|
||||
|
||||
|
||||
class Heap(list):
    """
    A ``list`` kept in heap order with :mod:`heapq`, so the smallest item
    is always at index 0.

    Constructor arguments are passed to ``list``; the contents are heapified
    immediately.
    """

    def __init__(self, *args, **kwargs):
        list.__init__(self, *args, **kwargs)
        heapq.heapify(self)

    def pop(self):
        """Remove and return the smallest item."""
        return heapq.heappop(self)

    def push(self, item):
        """Add ``item`` while keeping heap order."""
        heapq.heappush(self, item)

    def peek(self):
        """Return the smallest item without removing it."""
        return self[0]

    def pushpop(self, item):
        """Push ``item``, then remove and return the smallest item."""
        return heapq.heappushpop(self, item)

    def poppush(self, item):
        """
        Remove and return the smallest item, then push ``item``.

        The previous implementation referenced an undefined name and a
        non-existent ``heapq.replace`` function, so it could never succeed;
        ``heapq.heapreplace`` is the pop-then-push operation.
        """
        return heapq.heapreplace(self, item)
|
||||
@@ -0,0 +1,3 @@
|
||||
from .aggregate import aggregate, group
|
||||
from .peekable import Peekable
|
||||
from .sequence import sequence
|
||||
@@ -0,0 +1,20 @@
|
||||
from .peekable import Peekable
|
||||
|
||||
|
||||
def group(it, by=lambda i: i):
    """Alias of :func:`aggregate`: same arguments, same result."""
    grouped = aggregate(it, by)
    return grouped
|
||||
|
||||
|
||||
def aggregate(it, by=lambda i: i):
    """
    Split an iterable into runs of consecutive items that share a key.

    :Parameters:
        it : iterable
            The items to group.
        by : callable
            Computes the grouping key of an item (identity by default).

    :Returns:
        A generator of ``(key, members)`` pairs, where ``members`` is a
        generator over the consecutive items whose key equals ``key``.

    All ``members`` generators read from the same underlying iterator.  As
    with ``itertools.groupby``, a group is only valid until the next one is
    requested: whatever the caller left unread is skipped at that point.
    Previously an unread group was yielded again and again, so something
    like ``list(aggregate(...))`` never terminated.
    """
    it = Peekable(it)

    def chunk(it, by):
        # Guard the first peek(): a StopIteration escaping a generator body
        # is turned into RuntimeError (PEP 479).
        if it.empty():
            return

        identifier = by(it.peek())
        while not it.empty():
            if identifier == by(it.peek()):
                yield next(it)
            else:
                break

    while not it.empty():
        members = chunk(it, by)
        yield (by(it.peek()), members)

        # Skip whatever the caller did not read so the next group starts at
        # the first item with a different key.
        for _ in members:
            pass
|
||||
@@ -0,0 +1,8 @@
|
||||
def count(iterable):
    """
    Exhaust ``iterable`` and report how many items it produced.
    """
    return sum(1 for _ in iterable)
|
||||
@@ -0,0 +1,37 @@
|
||||
def Peekable(it):
    """
    Return ``it`` as a :class:`PeekableType`.

    An object that already is a ``PeekableType`` is returned unchanged;
    anything else is wrapped in a new one.
    """
    already_wrapped = isinstance(it, PeekableType)
    return it if already_wrapped else PeekableType(it)
|
||||
|
||||
|
||||
class PeekableType:
    """
    Iterator wrapper that keeps one item of look-ahead, so the next item can
    be inspected with :meth:`peek` without consuming it.
    """

    class EMPTY:
        """Sentinel stored as the look-ahead once the source is exhausted."""
        pass

    def __init__(self, it):
        self.it = iter(it)
        self.__cycle()

    def __iter__(self):
        return self

    def __cycle(self):
        """Pull the next item into ``lookahead`` (``EMPTY`` when exhausted)."""
        try:
            self.lookahead = next(self.it)
        except StopIteration:
            self.lookahead = self.EMPTY

    def __next__(self):
        item = self.peek()  # raises StopIteration when exhausted
        self.__cycle()
        return item

    def peek(self):
        """
        Return the next item without consuming it.

        :Raises:
            StopIteration if there are no more items.
        """
        if self.empty():
            raise StopIteration()
        else:
            return self.lookahead

    def empty(self):
        """Return True when no more items are available."""
        # Identity, not equality: an item with a custom ``__eq__`` must not
        # be mistaken for the end-of-stream sentinel.
        return self.lookahead is self.EMPTY
|
||||
@@ -0,0 +1,27 @@
|
||||
from .peekable import Peekable
|
||||
|
||||
|
||||
def sequence(*iterables, by=None, compare=None):
    """
    Interleave several iterables into a single generator, always emitting
    the head item that wins the comparison against the other heads.

    :Parameters:
        iterables
            The iterables to interleave.
        by : callable
            Key function; items are compared as ``by(a) <= by(b)``.  Ignored
            when ``compare`` is given.
        compare : callable
            ``compare(a, b)`` returns true when ``a`` should be emitted in
            preference to ``b``.  Defaults to ``a <= b``.
    """
    if compare is None:
        if by is not None:
            compare = lambda i1, i2: by(i1) <= by(i2)
        else:
            compare = lambda i1, i2: i1 <= i2

    iterables = [Peekable(source) for source in iterables]

    while True:
        winner = None

        for position, candidate in enumerate(iterables):
            if candidate.empty():
                continue
            if winner is None or compare(candidate.peek(),
                                         iterables[winner].peek()):
                winner = position

        # Every source is exhausted.
        if winner is None:
            return

        yield next(iterables[winner])
|
||||
@@ -0,0 +1,13 @@
|
||||
from nose.tools import eq_
|
||||
from ..aggregate import aggregate
|
||||
|
||||
|
||||
def test_group():
    """Consecutive integers are grouped by their tens digit."""
    values = [0, 1, 2, 3, 4, 5, 10, 11, 12, 13, 14]
    expected = [[0, 1, 2, 3, 4, 5], [10, 11, 12, 13, 14]]

    # Each group is fully read before the next one is requested.
    result = [
        list(members)
        for _, members in aggregate(values, lambda item: int(item / 10))
    ]

    eq_(result, expected)
|
||||
@@ -0,0 +1,20 @@
|
||||
from nose.tools import eq_
|
||||
from ..peekable import Peekable
|
||||
|
||||
|
||||
def test_peekable():
    """peek() shows the upcoming item and iteration yields every item."""
    expected = list(range(0, 100))
    iterable = Peekable(range(0, 100))
    result = []

    assert not iterable.empty()

    # Peek at, then consume, the first two items.
    for position in (0, 1):
        eq_(iterable.peek(), expected[position])
        result.append(next(iterable))

    # Drain the remainder.
    remainder = list(iterable)
    result.extend(remainder)
    eq_(result, expected)
|
||||
@@ -0,0 +1,11 @@
|
||||
from nose.tools import eq_
|
||||
from ..sequence import sequence
|
||||
|
||||
|
||||
def test_sequence():
    """Two sorted lists of dicts are interleaved in 'val' order."""
    foo = [{'val': 3}, {'val': 5}]
    bar = [{'val': 1}, {'val': 10}, {'val': 15}]
    expected = [{'val': 1}, {'val': 3}, {'val': 5}, {'val': 10}, {'val': 15}]

    merged = sequence(foo, bar,
                      compare=lambda i1, i2: i1['val'] < i2['val'])
    result = list(merged)
    eq_(expected, result)
|
||||
116
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/ordered.py
Normal file
116
mediawiki_dump_tools/Mediawiki-Utilities/mw/util/ordered.py
Normal file
@@ -0,0 +1,116 @@
|
||||
from . import autovivifying
|
||||
|
||||
|
||||
class Circle(list):
    """
    Fixed-capacity circular buffer built on ``list``.

    Once ``maxsize`` items are stored, each new item overwrites the oldest
    one.  Iteration, ``len()`` and indexing expose only the stored items,
    oldest first.

    :Parameters:
        maxsize : int
            Capacity of the buffer.
        iterable : iterable
            Optional initial items, appended in order.
    """

    def __init__(self, maxsize, iterable=None):
        self._maxsize = int(maxsize)
        # Pre-allocate the backing storage; unused slots hold None.
        list.__init__(self, [None] * self._maxsize)
        self._size = 0      # number of stored items
        self._pointer = 0   # backing-list slot the next append writes to

        if iterable is not None:
            # extend() is a generator, so it has to be consumed for the
            # appends to happen (previously the items were silently dropped).
            for _ in self.extend(iterable):
                pass

    def state(self):
        """Return a copy of the raw backing list, unused slots included."""
        return list(list.__iter__(self))

    def _internalize(self, index):
        """Translate a logical index (0 = oldest) to a backing-list index."""
        if self._size < self._maxsize:
            return index
        else:
            return (self._pointer + index) % self._maxsize

    def __iter__(self):
        for i in range(0, self._size):
            yield list.__getitem__(self, self._internalize(i))

    def __reversed__(self):
        for i in range(self._size - 1, -1, -1):
            yield list.__getitem__(self, self._internalize(i))

    def pop(self, index=None):
        """Not supported."""
        raise NotImplementedError()

    def __len__(self):
        return self._size

    def __getitem__(self, index):
        """
        Return the item at logical position ``index`` (0 = oldest,
        -1 = newest).

        :Raises:
            IndexError if an integer ``index`` is outside the stored items.
        """
        if isinstance(index, int):
            # Resolve against the stored items, not the padded backing list,
            # so negative and out-of-range indexes never hit a padding slot.
            if index < 0:
                index += self._size
            if not 0 <= index < self._size:
                raise IndexError("Circle index out of range")

        return list.__getitem__(self, self._internalize(index))

    def append(self, value):
        """
        Store ``value`` and return the slot's previous content: the
        overwritten item once the buffer is full, otherwise ``None``.
        """
        # Get the old value
        old_value = list.__getitem__(self, self._pointer)

        # Update internal list
        list.__setitem__(self, self._pointer, value)

        # Update state
        self._pointer = (self._pointer + 1) % self._maxsize
        self._size = min(self._maxsize, self._size + 1)

        # If we overwrote a value, return it.
        return old_value

    def extend(self, values):
        """
        Lazily append every item of ``values``.

        This is a generator: nothing is appended until it is iterated.  It
        yields the return value of each ``append`` that either displaced a
        non-None value or left the buffer full.
        """
        for value in values:
            expectorate = self.append(value)
            if expectorate is not None or self._size == self._maxsize:
                yield expectorate
|
||||
|
||||
|
||||
class HistoricalMap(autovivifying.Dict):
    '''
    A datastructure for efficiently storing and retrieving a
    limited number of historical records.

    Each key maps to the list of values inserted for it that are still
    within the history window; a Circle of (key, value) pairs records
    insertion order.

    TODO: Rename this to FIFOCache
    '''

    def __init__(self, *args, maxlen, **kwargs):
        '''Maxlen specifies the maximum amount of history to keep'''
        # super() already binds self; passing it again handed the dict
        # constructor an extra positional argument, which raised TypeError
        # whenever any positional argument was supplied.
        super().__init__(*args, vivifier=lambda k: [], **kwargs)

        self._circle = Circle(maxlen)  # List to preserve order for history

    def __iter__(self):
        '''Iterates over the (key, value) pairs in the history, oldest first.'''
        return iter(self._circle)

    def __setitem__(self, key, value):
        '''Adds a new key-value pair. Returns any discarded values.'''

        # Add to history circle and catch expectorate
        expectorate = self._circle.append((key, value))

        autovivifying.Dict.__getitem__(self, key).append(value)

        if expectorate is not None:
            old_key, old_value = expectorate

            # Drop the oldest value of the displaced key, and the key
            # itself once it has no values left.
            remaining = autovivifying.Dict.__getitem__(self, old_key)
            remaining.pop(0)
            if len(remaining) == 0:
                autovivifying.Dict.__delitem__(self, old_key)

            return (old_key, old_value)

    def insert(self, key, value):
        '''
        Adds a new key-value pair. Returns the discarded (key, value) pair,
        or None when nothing was discarded.
        '''
        return self.__setitem__(key, value)

    def __getitem__(self, key):
        '''Gets the most recently added value for a key, or raises KeyError.'''
        if key in self:
            return autovivifying.Dict.__getitem__(self, key)[-1]
        else:
            raise KeyError(key)

    def get(self, key):
        '''Gets the most recently added value for a key'''
        return self.__getitem__(key)

    def up_to(self, key):
        '''Gets the recently inserted values up to a key'''
        for okey, ovalue in reversed(self._circle):
            if okey == key:
                break
            else:
                yield ovalue

    def last(self):
        '''
        Gets the most recently inserted (key, value) pair.

        Raises IndexError when nothing has been inserted yet.
        '''
        # Previously read the non-existent attribute ``self.circle``.
        size = len(self._circle)
        if size == 0:
            raise IndexError("HistoricalMap is empty")
        return self._circle[size - 1]
|
||||
@@ -0,0 +1,28 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from .. import autovivifying
|
||||
|
||||
|
||||
def test_word_count():
    """An autovivifying Dict counts words like a hand-initialised dict."""
    words = """
    I am a little teapot short and stout. Here is my handle and here is my
    spout. The red fox jumps over the lazy brown dog. She sells sea shells
    by the sea shore.
    """.replace(".", " ").split()

    # Reference counts, initialising each entry by hand
    lame_counts = {}
    for word in words:
        lame_counts[word] = lame_counts.get(word, 0) + 1

    # Same counts, letting the Dict create zeroed entries on first access
    zero = lambda k: 0
    awesome_counts = autovivifying.Dict(vivifier=zero)
    for word in words:
        awesome_counts[word] += 1

    eq_(lame_counts, awesome_counts)
|
||||
@@ -0,0 +1,14 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from ..functions import none_or
|
||||
|
||||
|
||||
def test_none_or():
    """none_or converts values and only maps None to None."""
    eq_(10, none_or("10", int))
    eq_(10.75, none_or("10.75", float))
    eq_(None, none_or(None, int))

    # Falsy-but-not-None inputs must still be converted.
    falsy_cases = [("", str), ([], list), ({}, dict), (0, int), (-1, int)]
    for value, converter in falsy_cases:
        assert none_or(value, converter) is not None
|
||||
@@ -0,0 +1,28 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from ..heap import Heap
|
||||
|
||||
|
||||
def test_heap():
    """Heap pops in ascending order and supports push and peek."""
    h = Heap([5, 4, 7, 8, 2])
    for smallest in (2, 4, 5, 7, 8):
        eq_(h.pop(), smallest)
    eq_(len(h), 0)

    h = Heap([10, 20, 100])
    eq_(h.pop(), 10)
    h.push(30)
    eq_(len(h), 3)
    for smallest in (20, 30, 100):
        eq_(h.pop(), smallest)
    eq_(len(h), 0)

    # Tuples are ordered element-wise.
    h = Heap([(1, 7), (2, 4), (10, -100)])
    eq_(h.peek(), (1, 7))
    h.pop()
    for smallest in ((2, 4), (10, -100)):
        eq_(h.pop(), smallest)
|
||||
@@ -0,0 +1,41 @@
|
||||
from nose.tools import eq_
|
||||
|
||||
from .. import ordered
|
||||
|
||||
|
||||
def test_circle():
    """A Circle of size 3 fills up, then overwrites its oldest item."""
    circle = ordered.Circle(3)

    eq_(0, len(circle))
    print(circle.state())

    # (appended value, expected return value, expected length afterwards)
    steps = [(5, None, 1), (6, None, 2), (7, None, 3), (8, 5, 3)]
    for value, displaced, size in steps:
        eq_(displaced, circle.append(value))
        eq_(size, len(circle))
        print(circle.state())

    eq_([6, 7, 8], list(circle))

    print(circle.state())
    eq_([8, 7, 6], list(reversed(circle)))
|
||||
|
||||
|
||||
def test_historical_map():
    """A HistoricalMap with maxlen=2 discards its oldest record on the third insert."""
    hist = ordered.HistoricalMap(maxlen=2)

    assert "foo" not in hist

    first_discard = hist.insert('foo', "bar1")
    eq_(None, first_discard)

    assert "foo" in hist

    second_discard = hist.insert('foo', "bar2")
    eq_(None, second_discard)

    # The window is full, so the oldest pair is returned.
    third_discard = hist.insert('not_foo', "not_bar")
    eq_(('foo', "bar1"), third_discard)
|
||||
Reference in New Issue
Block a user