diff --git a/tables.py b/tables.py
new file mode 100644
index 0000000..1e5b2d0
--- /dev/null
+++ b/tables.py
@@ -0,0 +1,225 @@
+from abc import abstractmethod, ABC
+from datetime import datetime, timezone
+from hashlib import sha1
+from typing import Generic, TypeVar
+
+import mwtypes
+import mwxml
+
+import pyarrow as pa
+
+T = TypeVar('T')
+
+
+class RevisionField(ABC, Generic[T]):
+    """
+    Abstract type which represents a field in a table of page revisions.
+
+    Each subclass computes one value of type T from a set of revisions.
+    """
+
+    def __init__(self, field: pa.Field):
+        """
+        :param field: The pyarrow field stored alongside this revision field.
+        """
+        self.field = field
+
+    @abstractmethod
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> T:
+        """
+        Compute the value of this field for one set of revisions.
+
+        :param page: The page for this set of revisions.
+        :param revisions: The set of revisions to compute the field from.
+            Revisions are passed in chronological order, so use revisions[-1] to
+            access the most recent revision in the set.
+        :return: The value of this field for the given set of revisions.
+        """
+
+
+class RevisionTableColumn(Generic[T]):
+    """
+    Accumulates the values which one RevisionField extracts.
+    """
+
+    def __init__(self, field: RevisionField[T]):
+        self.field: RevisionField[T] = field
+        self.data: list[T] = []
+
+    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
+        """
+        Extract the field from a set of revisions and append it to the column.
+        """
+        self.data.append(self.field.extract(page, revisions))
+
+    def pop_column(self) -> list[T]:
+        """
+        Return the values accumulated so far and reset the column to empty.
+        """
+        data, self.data = self.data, []
+        return data
+
+
+class RevisionTable:
+    """
+    A set of columns which are all filled from the same sets of revisions.
+
+    columns is only declared here, so it has to be assigned before
+    add_revision_set() is called.
+    """
+
+    columns: list[RevisionTableColumn]
+
+    def add_revision_set(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
+        """
+        Add one value to every column for the given set of revisions.
+        """
+        for column in self.columns:
+            column.add(page, revisions)
+
+
+class RevisionId(RevisionField[int]):
+    """
+    The id of the most recent revision in the set.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
+        return revisions[-1].id
+
+
+class RevisionTimestamp(RevisionField[datetime]):
+    """
+    The timestamp of the most recent revision in the set.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> datetime:
+        return revisions[-1].timestamp
+
+
+class RevisionEditorId(RevisionField[int | None]):
+    """
+    The user id of the most recent revision's editor.
+
+    None when the user is deleted or has no id.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
+        revision = revisions[-1]
+        if revision.deleted.user:
+            return None
+
+        # user.id is already None when the editor has no id.
+        return revision.user.id
+
+
+class RevisionAnon(RevisionField[bool | None]):
+    """
+    Whether the most recent revision's editor has no user id.
+
+    None when the user is deleted.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool | None:
+        revision = revisions[-1]
+        if revision.deleted.user:
+            return None
+
+        return revision.user.id is None
+
+
+class RevisionEditorText(RevisionField[str | None]):
+    """
+    The text of the most recent revision's user, or None when the user is deleted.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
+        revision = revisions[-1]
+        if revision.deleted.user:
+            return None
+
+        return revision.user.text
+
+
+class RevisionPageTitle(RevisionField[str]):
+    """
+    The title of the page which the revisions belong to.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
+        return page.title
+
+
+class RevisionDeleted(RevisionField[bool]):
+    """
+    The deleted.text flag of the most recent revision in the set.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
+        return revisions[-1].deleted.text
+
+
+class RevisionNamespace(RevisionField[int]):
+    """
+    The namespace of the page which the revisions belong to.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
+        return page.namespace
+
+
+class RevisionSha1(RevisionField[str | None]):
+    """
+    The SHA-1 of the most recent revision in the set.
+
+    None when the revision has neither a sha1 nor any text to hash.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
+        revision = revisions[-1]
+        if revision.sha1:
+            return revision.sha1
+
+        # The revision carries no digest of its own, so fall back to hashing
+        # its text. sha1() only accepts bytes, hence the explicit encoding.
+        text = revision.text
+        if text is None:
+            # NOTE(review): presumably the text is None when it was deleted;
+            # confirm that a null is acceptable for this column.
+            return None
+
+        return sha1(text.encode("utf8")).hexdigest()
+
+
+class RevisionTextChars(RevisionField[int | None]):
+    """
+    The length of the most recent revision's text.
+
+    None when the revision has no text.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
+        text = revisions[-1].text
+        # NOTE(review): presumably the text is None when it was deleted;
+        # len(None) would raise a TypeError.
+        if text is None:
+            return None
+
+        return len(text)
+
+
+class RevisionMinor(RevisionField[bool]):
+    """
+    The minor flag of the most recent revision in the set.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
+        return revisions[-1].minor
+
+
+class RevisionCollapse(RevisionField[int]):
+    """
+    The number of revisions in the set.
+    """
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
+        return len(revisions)
diff --git a/wikiq b/wikiq
index f849ae6..73f5c8e 100755
--- a/wikiq
+++ b/wikiq
@@ -15,16 +15,16 @@ from itertools import groupby
 from 
subprocess import Popen, PIPE from collections import deque from hashlib import sha1 -from typing import Any, IO, TextIO, Final +from typing import Any, IO, TextIO, Final, Generator +import mwxml from mwxml import Dump from deltas.tokenizers import wikitext_split import mwpersistence import mwreverts -from pyarrow import Array, Table, Schema, DataType -from pyarrow.parquet import ParquetWriter +from pyarrow import Schema TO_ENCODE = ('title', 'editor') PERSISTENCE_RADIUS = 7 @@ -57,7 +57,7 @@ class WikiqIterator: self.mwiterator = Dump.from_file(self.fh) self.namespace_map = {ns.id: ns.name for ns in self.mwiterator.site_info.namespaces} - self.__pages = self.load_pages() + self.__pages: Generator[WikiqPage] = self.load_pages() def load_pages(self): for page in self.mwiterator: @@ -92,7 +92,7 @@ class WikiqPage: self.restrictions = page.restrictions self.collapse_user = collapse_user self.mwpage = page - self.__revisions = self.rev_list() + self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list() @staticmethod def user_text(rev) -> str | None: @@ -116,7 +116,7 @@ class WikiqPage: for _, revs in groupby(self.mwpage, self.user_text): # All revisions are either from the same user, or this is a single # revision where the user is missing. - yield revs + yield list(revs) def __iter__(self): return self.__revisions @@ -152,7 +152,7 @@ class RegexPair(object): def _make_key(self, cap_group): return "{}_{}".format(self.label, cap_group) - def matchmake(self, content, rev_data): + def matchmake(self, content: str, rev_data): temp_dict = {} # if there are named capture groups in the regex @@ -202,7 +202,7 @@ class RegexPair(object): def pa_schema() -> pa.Schema: - fields = [ + fields: list[pa.Field] = [ pa.field("revid", pa.int64()), pa.field("date_time", pa.timestamp('s')), pa.field("articleid", pa.int64()), @@ -222,6 +222,8 @@ def pa_schema() -> pa.Schema: ] return pa.schema(fields) + + """ We used to use a dictionary to collect fields for the output. 
@@ -430,18 +432,19 @@ class WikiqParser: else: sys.exit('Each regular expression *must* come with a corresponding label and vice versa.') - def matchmake_revision(self, rev, rev_data): + def matchmake_revision(self, rev: mwxml.Revision, rev_data: Revision): rev_data = self.matchmake_text(rev.text, rev_data) rev_data = self.matchmake_comment(rev.comment, rev_data) return rev_data - def matchmake_text(self, text, rev_data): + def matchmake_text(self, text: str, rev_data: Revision): return self.matchmake_pairs(text, rev_data, self.regex_revision_pairs) - def matchmake_comment(self, comment, rev_data): + def matchmake_comment(self, comment: str, rev_data: Revision): return self.matchmake_pairs(comment, rev_data, self.regex_comment_pairs) - def matchmake_pairs(self, text, rev_data, pairs): + @staticmethod + def matchmake_pairs(text, rev_data, pairs): for pair in pairs: rev_data = pair.matchmake(text, rev_data) return rev_data @@ -485,11 +488,12 @@ class WikiqParser: # Iterate through pages for page in dump: - namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title) + if page.namespace is None: + page.namespace = self.__get_namespace_from_title(page.title) # skip namespaces not in the filter if self.namespace_filter is not None: - if namespace not in self.namespace_filter: + if page.namespace not in self.namespace_filter: continue # Disable detecting reverts if radius is 0. @@ -517,7 +521,6 @@ class WikiqParser: # Iterate through a page's revisions prev_text_chars = 0 for revs in page: - revs = list(revs) rev = revs[-1] editorid = None if rev.deleted.user or rev.user.id is None else rev.user.id @@ -529,7 +532,7 @@ class WikiqParser: editorid=editorid, title=page.title, deleted=rev.deleted.text, - namespace=namespace + namespace=page.namespace ) rev_data = self.matchmake_revision(rev, rev_data)