1
0

Initial commit

p#	new file:   runwikiq.sh
This commit is contained in:
2018-06-02 15:32:19 -07:00
commit 72633c193b
202 changed files with 21929 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
# from . import errors
from .db import DB
from .collections import Pages, RecentChanges, Revisions, Archives, \
AllRevisions, Users

View File

@@ -0,0 +1,4 @@
from .pages import Pages
from .recent_changes import RecentChanges
from .revisions import Revisions, Archives, AllRevisions
from .users import Users

View File

@@ -0,0 +1,11 @@
class Collection:
    """
    Common base for the query helper classes.

    Keeps the database handle passed to the constructor on ``self.db`` and
    exposes ``DIRECTIONS``, the set of accepted sort direction names.
    """

    DIRECTIONS = {'newer', 'older'}

    def __init__(self, db):
        self.db = db

    def __repr__(self):
        # Rendered as ``<ClassName>(<repr of the db handle>)``.
        owner = self.__class__.__name__
        return "{0}({1!r})".format(owner, self.db)

    def __str__(self):
        # The string form is deliberately the same as the repr.
        return self.__repr__()

View File

@@ -0,0 +1,65 @@
import logging
from ...util import none_or
from .collection import Collection
logger = logging.getLogger("mw.database.collections.pages")
class Pages(Collection):
    """Read access to the ``page`` table."""

    def get(self, page_id=None, namespace_title=None, rev_id=None):
        """
        Gets a single page based on a legitimate identifier of the page. Note
        that namespace_title expects a tuple of namespace ID and title.

        Only one identifier is used.  When several are given, the first of
        ``page_id``, ``namespace_title`` and ``rev_id`` takes precedence.

        :Parameters:
            page_id : int
                Page ID
            namespace_title : ( int, str )
                the page's namespace ID and title
            rev_id : int
                a revision ID included in the page's history

        :Returns:
            The first matching ``page`` row, or ``None`` if no row matches.

        :Raises:
            TypeError : if no identifier is given
        """
        page_id = none_or(page_id, int)
        namespace_title = none_or(namespace_title, tuple)
        rev_id = none_or(rev_id, int)

        query = """
            SELECT page.*
            FROM page
        """
        values = []

        # The branches must be mutually exclusive: exactly one WHERE clause
        # may be appended, and a lone ``page_id`` must not fall through to
        # the "no identifier" error.
        if page_id is not None:
            query += """
                WHERE page_id = %s
            """
            values.append(page_id)
        elif namespace_title is not None:
            namespace, title = namespace_title
            query += " WHERE page_namespace = %s and page_title = %s "
            values.extend([int(namespace), str(title)])
        elif rev_id is not None:
            query += """
                WHERE page_id = (SELECT rev_page FROM revision WHERE rev_id = %s)
            """
            values.append(rev_id)
        else:
            raise TypeError("Must specify a page identifier.")

        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, values)

        # Return the first row only; no match yields None.
        for row in cursor:
            return row
        return None

View File

@@ -0,0 +1,128 @@
import logging
import time
from ...types import Timestamp
from ...util import none_or
from .collection import Collection
logger = logging.getLogger("mw.database.collections.pages")
class RecentChanges(Collection):
    """Read access to the ``recentchanges`` table."""

    # (https://www.mediawiki.org/wiki/Manual:Recentchanges_table)
    TYPES = {
        'edit': 0,  # edit of existing page
        'new': 1,  # new page
        'move': 2,  # Marked as obsolete
        'log': 3,  # log action (introduced in MediaWiki 1.2)
        'move_over_redirect': 4,  # Marked as obsolete
        'external': 5  # An external recent change. Primarily used by Wikidata
    }

    def listen(self, last=None, types=None, max_wait=5):
        """
        Listens to the recent changes table.  Given no parameters, this
        function will return an iterator over the entire recentchanges table
        and then continue to "listen" for new changes, re-querying at most
        once every ``max_wait`` seconds.

        :Parameters:
            last : dict
                a recentchanges row to pick up after
            types : set ( str )
                a set of recentchanges types to filter for
            max_wait : float
                the maximum number of seconds to wait between repeated queries

        :Returns:
            A never-ending iterator over change rows.
        """
        while True:
            if last is not None:
                after = last['rc_timestamp']
                after_id = last['rc_id']
            else:
                after = None
                after_id = None

            start = time.time()
            # NOTE(review): the follow-up query requires both a later
            # rc_timestamp and a larger rc_id, so changes sharing the last
            # row's timestamp are not picked up -- confirm this is acceptable.
            rcs = self.query(after=after, after_id=after_id, types=types,
                             direction="newer")

            for rc in rcs:
                yield rc
                # Remember where we are so the next pass resumes after this
                # row instead of replaying the whole table.
                last = rc

            # Only sleep for what is left of the wait window; a slow query
            # may already have used it up (time.sleep rejects negatives).
            remaining = max_wait - (time.time() - start)
            if remaining > 0:
                time.sleep(remaining)

    def query(self, before=None, after=None, before_id=None, after_id=None,
              types=None, direction=None, limit=None):
        """
        Queries the ``recentchanges`` table.  See
        `<https://www.mediawiki.org/wiki/Manual:Recentchanges_table>`_

        :Parameters:
            before : :class:`mw.Timestamp`
                Only rows with an earlier ``rc_timestamp`` (exclusive)
            after : :class:`mw.Timestamp`
                Only rows with a later ``rc_timestamp`` (exclusive)
            before_id : int
                Only rows with a smaller ``rc_id`` (exclusive)
            after_id : int
                Only rows with a larger ``rc_id`` (exclusive)
            types : set ( str )
                Which types of changes to return?

                * ``edit`` -- Edits to existing pages
                * ``new`` -- Edits that create new pages
                * ``move`` -- (obsolete)
                * ``log`` -- Log actions (introduced in MediaWiki 1.2)
                * ``move_over_redirect`` -- (obsolete)
                * ``external`` -- An external recent change.  Primarily used by Wikidata

                An empty set matches nothing.
            direction : str
                "older" or "newer"
            limit : int
                limit the number of records returned

        :Returns:
            An iterator over ``recentchanges`` rows.

        :Raises:
            KeyError : if ``types`` contains a name that is not in ``TYPES``
        """
        before = none_or(before, Timestamp)
        after = none_or(after, Timestamp)
        before_id = none_or(before_id, int)
        after_id = none_or(after_id, int)
        direction = none_or(direction, levels=self.DIRECTIONS)
        limit = none_or(limit, int)

        # Translate type names into their numeric rc_type codes.  The lookup
        # doubles as validation of every name in the collection.
        type_codes = None
        if types is not None:
            type_codes = sorted(self.TYPES[t] for t in set(types))
            if not type_codes:
                # "IN ()" is not valid SQL and an empty filter matches nothing.
                return

        query = """
            SELECT * FROM recentchanges
            WHERE 1
        """
        values = []

        if before is not None:
            query += " AND rc_timestamp < %s "
            values.append(before.short_format())
        if after is not None:
            query += " AND rc_timestamp > %s "
            values.append(after.short_format())
        if before_id is not None:
            query += " AND rc_id < %s "
            values.append(before_id)
        if after_id is not None:
            query += " AND rc_id > %s "
            values.append(after_id)
        if type_codes is not None:
            placeholders = ",".join(["%s"] * len(type_codes))
            query += " AND rc_type IN ({0}) ".format(placeholders)
            values.extend(type_codes)

        if direction is not None:
            order = "ASC" if direction == "newer" else "DESC"
            query += " ORDER BY rc_timestamp {0}, rc_id {0} ".format(order)

        if limit is not None:
            query += " LIMIT %s "
            values.append(limit)

        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, values)
        for row in cursor:
            yield row

View File

@@ -0,0 +1,410 @@
import logging
import time
from itertools import chain
from ...types import Timestamp
from ...util import iteration, none_or
from .collection import Collection
logger = logging.getLogger("mw.database.collections.revisions")
class AllRevisions(Collection):
    """Combined view over ``self.db.revisions`` and ``self.db.archives``."""

    def get(self, rev_id, include_page=False):
        """
        Gets a single revisions by ID.  Checks both the ``revision`` and
        ``archive`` tables.  This method throws a :class:`KeyError` if a
        revision cannot be found.

        :Parameters:
            rev_id : int
                Revision ID
            include_page : bool
                Join revision returned against ``page``.  Only forwarded to
                the ``revision`` lookup; the archive lookup does not take it.

        :Returns:
            A revision row
        """
        rev_id = int(rev_id)
        try:
            return self.db.revisions.get(rev_id, include_page=include_page)
        except KeyError:
            # Not a live revision -- fall back to the archive, which raises
            # KeyError itself when the ID is unknown there too.
            return self.db.archives.get(rev_id)

    def query(self, *args, **kwargs):
        """
        Queries both live and archived revisions.  All arguments are passed
        unchanged to ``self.db.revisions.query`` and
        ``self.db.archives.query``; ``direction`` and ``limit`` are only
        recognised here when given as keyword arguments.

        :Parameters:
            page_id : int
                Page identifier.  Filter revisions to this page.
            user_id : int
                User identifier.  Filter revisions to those made by this user.
            user_text : str
                User text (user_name or IP address).  Filter revisions to those
                made by this user.
            before : :class:`mw.Timestamp`
                Filter revisions to those made before this timestamp.
            after : :class:`mw.Timestamp`
                Filter revisions to those made after this timestamp.
            before_id : int
                Filter revisions to those with an ID before this ID
            after_id : int
                Filter revisions to those with an ID after this ID
            direction : str
                "newer" or "older".  When given, the two result streams are
                merged in that order; otherwise live revisions are followed
                by archived ones.
            limit : int
                Limit the number of results
            include_page : bool
                Join revisions returned against ``page``

        :Returns:
            An iterator over revision rows.

        :Raises:
            TypeError : if ``direction`` is not one of ``DIRECTIONS``
        """
        revisions = self.db.revisions.query(*args, **kwargs)
        archives = self.db.archives.query(*args, **kwargs)

        # An explicit ``direction=None`` means the same as leaving it out.
        direction = kwargs.get('direction')
        if direction is None:
            collated_revisions = chain(revisions, archives)
        elif direction not in self.DIRECTIONS:
            raise TypeError("direction must be in {0}".format(self.DIRECTIONS))
        elif direction == "newer":
            collated_revisions = iteration.sequence(
                revisions,
                archives,
                compare=lambda r1, r2:
                    (r1['rev_timestamp'], r1['rev_id']) <=
                    (r2['rev_timestamp'], r2['rev_id'])
            )
        else:  # direction == "older"
            collated_revisions = iteration.sequence(
                revisions,
                archives,
                compare=lambda r1, r2:
                    (r1['rev_timestamp'], r1['rev_id']) >=
                    (r2['rev_timestamp'], r2['rev_id'])
            )

        limit = kwargs.get('limit')
        if limit is None:
            for rev in collated_revisions:
                yield rev
            return

        # Each underlying query is limited on its own, so the merged stream
        # can hold up to twice ``limit`` rows and has to be cut again here.
        limit = int(limit)
        if limit <= 0:
            return
        for count, rev in enumerate(collated_revisions, 1):
            yield rev
            if count >= limit:
                break
class Revisions(Collection):
    """Read access to the ``revision`` table."""

    def get(self, rev_id, include_page=False):
        """
        Gets a single revisions by ID.  Checks the ``revision`` table.  This
        method throws a :class:`KeyError` if a revision cannot be found.

        :Parameters:
            rev_id : int
                Revision ID
            include_page : bool
                Join revision returned against ``page``

        :Returns:
            A revision row

        :Raises:
            KeyError : if no revision has this ID
        """
        rev_id = int(rev_id)

        query = """
            SELECT *, FALSE AS archived FROM revision
        """
        if include_page:
            query += """
                INNER JOIN page ON page_id = rev_page
            """
        query += " WHERE rev_id = %s"

        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, [rev_id])
        for row in cursor:
            return row

        raise KeyError(rev_id)

    def query(self, page_id=None, user_id=None, user_text=None,
              before=None, after=None, before_id=None, after_id=None,
              direction=None, limit=None, include_page=False):
        """
        Queries revisions (excludes revisions to deleted pages)

        :Parameters:
            page_id : int
                Page identifier.  Filter revisions to this page.
            user_id : int
                User identifier.  Filter revisions to those made by this user.
            user_text : str
                User text (user_name or IP address).  Filter revisions to those
                made by this user.
            before : :class:`mw.Timestamp`
                Filter revisions to those made before this timestamp.
            after : :class:`mw.Timestamp`
                Filter revisions to those made after this timestamp.
            before_id : int
                Filter revisions to those with an ID before this ID
            after_id : int
                Filter revisions to those with an ID after this ID
            direction : str
                "newer" or "older"
            limit : int
                Limit the number of results
            include_page : bool
                Join revisions returned against ``page``

        :Returns:
            An iterator over revision rows.
        """
        start_time = time.time()

        page_id = none_or(page_id, int)
        user_id = none_or(user_id, int)
        user_text = none_or(user_text, str)
        before = none_or(before, Timestamp)
        after = none_or(after, Timestamp)
        before_id = none_or(before_id, int)
        after_id = none_or(after_id, int)
        direction = none_or(direction, levels=self.DIRECTIONS)
        limit = none_or(limit, int)
        include_page = bool(include_page)

        query = """
            SELECT *, FALSE AS archived FROM revision
        """
        if include_page:
            query += """
                INNER JOIN page ON page_id = rev_page
            """
        # "WHERE 1" lets every filter below be appended as " AND ...".
        query += """
            WHERE 1
        """
        values = []

        if page_id is not None:
            query += " AND rev_page = %s "
            values.append(page_id)
        if user_id is not None:
            query += " AND rev_user = %s "
            values.append(user_id)
        if user_text is not None:
            query += " AND rev_user_text = %s "
            values.append(user_text)
        if before is not None:
            query += " AND rev_timestamp < %s "
            values.append(before.short_format())
        if after is not None:
            query += " AND rev_timestamp > %s "
            values.append(after.short_format())
        if before_id is not None:
            query += " AND rev_id < %s "
            values.append(before_id)
        if after_id is not None:
            query += " AND rev_id > %s "
            values.append(after_id)

        if direction is not None:
            order = "ASC " if direction == "newer" else "DESC "
            # Sort primarily on the column the caller is filtering by.
            if before_id is not None or after_id is not None:
                query += " ORDER BY rev_id {0}, rev_timestamp {0}".format(order)
            else:
                query += " ORDER BY rev_timestamp {0}, rev_id {0}".format(order)

        if limit is not None:
            query += " LIMIT %s "
            values.append(limit)

        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, values)

        count = 0
        for row in cursor:
            yield row
            count += 1

        logger.debug("%s revisions read in %s seconds",
                     count, time.time() - start_time)
class Archives(Collection):
    """Read access to the ``archive`` table."""

    # Shared SELECT used by get() and query().  It aliases the ``ar_*``
    # columns to ``rev_*`` / ``page_*`` names and tags every row with
    # ``archived = TRUE``.
    _SELECT = """
        SELECT
            ar_id,
            ar_rev_id AS rev_id,
            ar_page_id AS rev_page,
            ar_page_id AS page_id,
            ar_title AS page_title,
            ar_namespace AS page_namespace,
            ar_text_id AS rev_text_id,
            ar_comment AS rev_comment,
            ar_user AS rev_user,
            ar_user_text AS rev_user_text,
            ar_timestamp AS rev_timestamp,
            ar_minor_edit AS rev_minor_edit,
            ar_deleted AS rev_deleted,
            ar_len AS rev_len,
            ar_parent_id AS rev_parent_id,
            ar_sha1 AS rev_sha1,
            TRUE AS archived
        FROM archive
    """

    def get(self, rev_id):
        """
        Gets a single revisions by ID.  Checks the ``archive`` table.  This
        method throws a :class:`KeyError` if a revision cannot be found.

        :Parameters:
            rev_id : int
                Revision ID

        :Returns:
            A revision row

        :Raises:
            KeyError : if no archived revision has this ID
        """
        rev_id = int(rev_id)

        query = self._SELECT + """
            WHERE ar_rev_id = %s
        """

        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, [rev_id])
        for row in cursor:
            return row

        raise KeyError(rev_id)

    def query(self, page_id=None, user_id=None, user_text=None,
              before=None, after=None, before_id=None, after_id=None,
              before_ar_id=None, after_ar_id=None,
              direction=None, limit=None, include_page=True):
        """
        Queries archived revisions (revisions of deleted pages)

        :Parameters:
            page_id : int
                Page identifier.  Filter revisions to this page.
            user_id : int
                User identifier.  Filter revisions to those made by this user.
            user_text : str
                User text (user_name or IP address).  Filter revisions to those
                made by this user.
            before : :class:`mw.Timestamp`
                Filter revisions to those made before this timestamp.
            after : :class:`mw.Timestamp`
                Filter revisions to those made after this timestamp.
            before_id : int
                Filter revisions to those with an ID before this ID
            after_id : int
                Filter revisions to those with an ID after this ID
            before_ar_id : int
                Filter rows to those with an ``ar_id`` before this ID
            after_ar_id : int
                Filter rows to those with an ``ar_id`` after this ID
            direction : str
                "newer" or "older"
            limit : int
                Limit the number of results
            include_page : bool
                This field is ignored.  It's only here for compatibility with
                :class:`mw.database.Revision`.

        :Returns:
            An iterator over revision rows.
        """
        page_id = none_or(page_id, int)
        user_id = none_or(user_id, int)
        user_text = none_or(user_text, str)
        before = none_or(before, Timestamp)
        after = none_or(after, Timestamp)
        before_id = none_or(before_id, int)
        after_id = none_or(after_id, int)
        before_ar_id = none_or(before_ar_id, int)
        after_ar_id = none_or(after_ar_id, int)
        direction = none_or(direction, levels=self.DIRECTIONS)
        limit = none_or(limit, int)

        start_time = time.time()

        # "WHERE 1" lets every filter below be appended as " AND ...".
        query = self._SELECT + """
            WHERE 1
        """
        values = []

        if page_id is not None:
            query += " AND ar_page_id = %s "
            values.append(page_id)
        if user_id is not None:
            query += " AND ar_user = %s "
            values.append(user_id)
        if user_text is not None:
            query += " AND ar_user_text = %s "
            values.append(user_text)
        if before is not None:
            query += " AND ar_timestamp < %s "
            values.append(before.short_format())
        if after is not None:
            query += " AND ar_timestamp > %s "
            values.append(after.short_format())
        if before_id is not None:
            query += " AND ar_rev_id < %s "
            values.append(before_id)
        if after_id is not None:
            query += " AND ar_rev_id > %s "
            values.append(after_id)
        # Same "%s" placeholder style as every other filter in this query.
        if before_ar_id is not None:
            query += " AND ar_id < %s "
            values.append(before_ar_id)
        if after_ar_id is not None:
            query += " AND ar_id > %s "
            values.append(after_ar_id)

        if direction is not None:
            order = "ASC " if direction == "newer" else "DESC "
            # Sort on the column the caller filters by, defaulting to ar_id.
            if before is not None or after is not None:
                query += " ORDER BY ar_timestamp {0}, ar_rev_id {0}".format(order)
            elif before_id is not None or after_id is not None:
                query += " ORDER BY ar_rev_id {0}, ar_timestamp {0}".format(order)
            else:
                query += " ORDER BY ar_id {0}".format(order)

        if limit is not None:
            query += " LIMIT %s "
            values.append(limit)

        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, values)

        count = 0
        for row in cursor:
            yield row
            count += 1

        logger.debug("%s revisions read in %s seconds",
                     count, time.time() - start_time)

View File

@@ -0,0 +1,154 @@
import logging
import time
from ...types import Timestamp
from ...util import none_or
from .collection import Collection
logger = logging.getLogger("mw.database.collections.users")
class Users(Collection):
    """Read access to the ``user`` table."""

    CREATION_ACTIONS = {'newusers', 'create', 'create2', 'autocreate',
                        'byemail'}

    def get(self, user_id=None, user_name=None):
        """
        Gets a single user row from the database.  Raises a :class:`KeyError`
        if a user cannot be found.  ``user_id`` takes precedence when both
        identifiers are given.

        :Parameters:
            user_id : int
                User ID
            user_name : str
                User's name

        :Returns:
            A user row.

        :Raises:
            TypeError : if neither identifier is given
            KeyError : if no user matches
        """
        user_id = none_or(user_id, int)
        user_name = none_or(user_name, str)

        query = """
            SELECT user.*
            FROM user
        """
        values = []

        if user_id is not None:
            query += """
                WHERE user_id = %s
            """
            values.append(user_id)
        elif user_name is not None:
            query += """
                WHERE user_name = %s
            """
            values.append(user_name)
        else:
            raise TypeError("Must specify a user identifier.")

        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, values)
        for row in cursor:
            return row

        raise KeyError(user_id if user_id is not None else user_name)

    def query(self, registered_before=None, registered_after=None,
              before_id=None, after_id=None, limit=None,
              direction=None, self_created_only=False):
        """
        Queries users based on various filtering parameters.

        :Parameters:
            registered_before : :class:`mw.Timestamp`
                A timestamp to search before (inclusive)
            registered_after : :class:`mw.Timestamp`
                A timestamp to search after (inclusive)
            before_id : int
                A user_id to search before (inclusive)
            after_id : int
                A user_id to search after (inclusive)
            direction : str
                "newer" or "older"
            limit : int
                Limit the results to at most this number
            self_created_only : bool
                limit results to self-created user accounts, i.e. users with
                a ``newusers`` / ``create`` row in ``logging``

        :Returns:
            an iterator over ``user`` table rows
        """
        start_time = time.time()

        registered_before = none_or(registered_before, Timestamp)
        registered_after = none_or(registered_after, Timestamp)
        before_id = none_or(before_id, int)
        after_id = none_or(after_id, int)
        direction = none_or(direction, levels=self.DIRECTIONS)
        limit = none_or(limit, int)
        self_created_only = bool(self_created_only)

        query = """
            SELECT user.*
            FROM user
        """
        values = []

        if self_created_only:
            query += """
                INNER JOIN logging ON
                    log_user = user_id AND
                    log_type = "newusers" AND
                    log_action = "create"
            """

        # "WHERE 1" lets every filter below be appended as "AND ...".
        query += "WHERE 1 "

        if registered_before is not None:
            query += "AND user_registration <= %s "
            values.append(registered_before.short_format())
        if registered_after is not None:
            query += "AND user_registration >= %s "
            values.append(registered_after.short_format())
        if before_id is not None:
            query += "AND user_id <= %s "
            values.append(before_id)
        if after_id is not None:
            query += "AND user_id >= %s "
            values.append(after_id)

        query += "GROUP BY user_id "  # In case of duplicate log events

        if direction is not None:
            order = "ASC" if direction == "newer" else "DESC"
            # Sort on registration time when filtering by it, else on the ID.
            if registered_before is not None or registered_after is not None:
                query += "ORDER BY user_registration {0} ".format(order)
            else:
                query += "ORDER BY user_id {0} ".format(order)

        if limit is not None:
            query += "LIMIT %s "
            values.append(limit)

        cursor = self.db.shared_connection.cursor()
        cursor.execute(query, values)

        count = 0
        for row in cursor:
            yield row
            count += 1

        logger.debug("%s users queried in %s seconds",
                     count, time.time() - start_time)

View File

@@ -0,0 +1,134 @@
import getpass
import logging
import os
import pymysql
import pymysql.cursors
from .collections import AllRevisions, Archives, Pages, Revisions, Users
logger = logging.getLogger("mw.database.db")
class DB:
    """
    Represents a connection to a MySQL database.

    :Parameters:
        connection : a ``pymysql`` connection
            A connection to a MediaWiki database.  Its ``cursorclass`` is
            switched to :class:`pymysql.cursors.DictCursor`.
    """

    def __init__(self, connection):
        self.shared_connection = connection
        # Every cursor created from the shared connection is a DictCursor.
        self.shared_connection.cursorclass = pymysql.cursors.DictCursor

        #: An instance of :class:`mw.database.Revisions`.
        self.revisions = Revisions(self)

        #: An instance of :class:`mw.database.Archives`.
        self.archives = Archives(self)

        #: An instance of :class:`mw.database.AllRevisions`.
        self.all_revisions = AllRevisions(self)

        #: An instance of :class:`mw.database.Pages`.
        self.pages = Pages(self)

        #: An instance of :class:`mw.database.Users`.
        self.users = Users(self)

    def __repr__(self):
        # The connection is the only state this object holds, so it is what
        # identifies the instance.
        return "{0}({1!r})".format(self.__class__.__name__,
                                   self.shared_connection)

    def __str__(self):
        return self.__repr__()

    @classmethod
    def add_arguments(cls, parser, defaults=None):
        """
        Adds the arguments to an :class:`argparse.ArgumentParser` in order to
        create a database connection.

        NOTE: ``-h`` is registered as the short form of ``--host``, so
        ``parser`` must not already define ``-h`` (e.g. create it with
        ``add_help=False``); otherwise argparse reports a conflicting option.

        :Parameters:
            parser : :class:`argparse.ArgumentParser`
                The parser to extend
            defaults : dict
                Optional overrides for the default values, keyed by
                ``host``, ``database``, ``defaults-file`` and ``user``

        :Returns:
            The same ``parser``, for chaining.
        """
        defaults = defaults if defaults is not None else {}

        default_host = defaults.get('host', "localhost")
        parser.add_argument(
            '--host', '-h',
            help="MySQL database host to connect to (defaults to {0})".format(default_host),
            default=default_host
        )

        default_database = defaults.get('database', getpass.getuser())
        parser.add_argument(
            '--database', '-d',
            help="MySQL database name to connect to (defaults to {0})".format(default_database),
            default=default_database
        )

        default_defaults_file = defaults.get('defaults-file', os.path.expanduser("~/.my.cnf"))
        parser.add_argument(
            '--defaults-file',
            help="MySQL defaults file (defaults to {0})".format(default_defaults_file),
            default=default_defaults_file
        )

        default_user = defaults.get('user', getpass.getuser())
        parser.add_argument(
            '--user', '-u',
            help="MySQL user (defaults to {0})".format(default_user),
            default=default_user
        )
        return parser

    @classmethod
    def from_arguments(cls, args):
        """
        Constructs a :class:`~mw.database.DB`.
        Consumes :class:`argparse.ArgumentParser` arguments given by
        :meth:`add_arguments` in order to create a :class:`DB`.

        :Parameters:
            args : :class:`argparse.Namespace`
                A collection of argument values returned by :class:`argparse.ArgumentParser`'s :meth:`parse_args()`
        """
        # Keyword arguments keep this independent of the positional order of
        # pymysql.connect()'s parameters.
        connection = pymysql.connect(
            host=args.host,
            user=args.user,
            database=args.database,
            read_default_file=args.defaults_file
        )
        return cls(connection)

    @classmethod
    def from_params(cls, *args, **kwargs):
        """
        Constructs a :class:`~mw.database.DB`.  Passes `*args` and `**kwargs`
        to :func:`pymysql.connect` and configures the connection.

        :Parameters:
            args
                Positional arguments for :func:`pymysql.connect`
            kwargs
                Keyword arguments for :func:`pymysql.connect`.  A truthy
                ``db`` is passed on as ``database``, and ``cursorclass`` is
                always set to :class:`pymysql.cursors.DictCursor`.
        """
        kwargs['cursorclass'] = pymysql.cursors.DictCursor
        # ``db`` is optional; only rename it when a value was supplied.
        if kwargs.get('db'):
            kwargs['database'] = kwargs.pop('db')
        connection = pymysql.connect(*args, **kwargs)
        return cls(connection)