import sys
from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
from typing import Generic, TypeVar, Union

import mwreverts
import mwtypes
import mwxml

import pyarrow as pa

T = TypeVar('T')


class RevisionField(ABC, Generic[T]):
    """
    Abstract type which represents a field in a table of page revisions.

    Each instance buffers the values produced by :meth:`extract` in
    ``self.data`` until they are drained with :meth:`pop`.
    """

    def __init__(self):
        # One extracted value per call to add(), in insertion order.
        self.data: list[T] = []

    @property
    @abstractmethod
    def field(self) -> pa.Field:
        """The pyarrow field (name and type) describing this column."""
        pass

    @abstractmethod
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> T:
        """
        Compute this field's value for one set of revisions.

        :param page: The page for this set of revisions.
        :param revisions: The set of revisions to compute the field from.
            Revisions are passed in chronological order, so use
            revisions[-1] to access the most recent revision in the set.
        :return: The value to store in this column.
        """
        pass

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
        """Extract the value for ``revisions`` and append it to the buffer."""
        self.data.append(self.extract(page, revisions))

    def pop(self) -> list[T]:
        """Return all buffered values and reset the buffer to an empty list."""
        data = self.data
        self.data = []
        return data


class RevisionTable:
    """A collection of :class:`RevisionField` columns that are filled together."""

    columns: list[RevisionField]

    def __init__(self, columns: list[RevisionField]):
        self.columns = columns

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]):
        """Append one value to every column for the given set of revisions."""
        for column in self.columns:
            column.add(page=page, revisions=revisions)

    def schema(self) -> pa.Schema:
        """Build a pyarrow schema from the columns' fields, in column order."""
        return pa.schema([c.field for c in self.columns])

    def pop(self) -> dict:
        """
        Drain every column.

        :return: A dict mapping each column's field name to its buffered values.
        """
        return {column.field.name: column.pop() for column in self.columns}


class RevisionId(RevisionField[int]):
    """Id of the most recent revision in the set."""

    field = pa.field("revid", pa.int64())

    # The first parameter is named ``page`` (not ``_``) so the override matches
    # the abstract signature and keyword calls work on every column.
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        revision = revisions[-1]
        return revision.id


class RevisionTimestamp(RevisionField[datetime]):
    """Timestamp of the most recent revision in the set."""

    field = pa.field("date_time", pa.timestamp('s'))

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> datetime:
        revision = revisions[-1]
        # NOTE(review): the value is returned exactly as the revision exposes
        # it; presumably it is (or is converted by the caller into) something
        # pyarrow accepts for timestamp('s') - confirm.
        return revision.timestamp


class RevisionArticleId(RevisionField[int]):
    """Id of the page the revisions belong to."""

    field = pa.field("articleid", pa.int64())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        return page.id


class RevisionEditorId(RevisionField[Union[int, None]]):
    """User id of the most recent revision's editor, or None if the user is deleted."""

    field = pa.field("editorid", pa.int64(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
        revision = revisions[-1]
        if revision.deleted.user:
            return None

        return revision.user.id


class RevisionIsAnon(RevisionField[Union[bool, None]]):
    """
    Whether the most recent revision's editor has no user id.

    None when the user information is deleted.
    """

    field = pa.field("anon", pa.bool_(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[bool, None]:
        revision = revisions[-1]
        if revision.deleted.user:
            return None

        return revision.user.id is None


class RevisionEditorText(RevisionField[Union[str, None]]):
    """Text of the most recent revision's user, or None if the user is deleted."""

    field = pa.field("editor", pa.string(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
        revision = revisions[-1]
        if revision.deleted.user:
            return None

        return revision.user.text


class RevisionPageTitle(RevisionField[str]):
    """Title of the page the revisions belong to."""

    field = pa.field("title", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        return page.title


class RevisionDeleted(RevisionField[bool]):
    """Whether the text of the most recent revision is flagged as deleted."""

    field = pa.field("deleted", pa.bool_())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        revision = revisions[-1]
        return revision.deleted.text


class RevisionNamespace(RevisionField[int]):
    """Namespace of the page the revisions belong to."""

    field = pa.field("namespace", pa.int32())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        return page.namespace


class RevisionSha1(RevisionField[str]):
    """The ``sha1`` attribute of the most recent revision."""

    field = pa.field("sha1", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        revision = revisions[-1]
        return revision.sha1


class RevisionTextChars(RevisionField[Union[int, None]]):
    """
    Length in characters of the most recent revision's text.

    None when the text is flagged as deleted.
    """

    field = pa.field("text_chars", pa.int32(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
        revision = revisions[-1]
        if revision.deleted.text:
            return None

        # Guard against ``text`` being None on a non-deleted revision
        # (presumably a page with no content - confirm against mwxml): count
        # it as zero characters instead of raising TypeError from len(None).
        return len(revision.text or "")


class RevisionText(RevisionField[str]):
    """Text of the most recent revision."""

    # NOTE(review): this field is declared non-nullable, yet nothing here
    # prevents ``revision.text`` from being None - confirm callers only use
    # this column when text is guaranteed to be present.
    field = pa.field("text", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        revision = revisions[-1]
        return revision.text


class RevisionIsMinor(RevisionField[bool]):
    """The ``minor`` flag of the most recent revision."""

    field = pa.field("minor", pa.bool_())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        revision = revisions[-1]
        return revision.minor


class RevisionReverts(RevisionField[Union[str, None]]):
    """
    Comma-separated list of what the most recent revision reverted.

    The value is None when no detector is configured, when the revision's text
    is deleted, or when the detector reports no revert.
    """

    field = pa.field("reverteds", pa.string(), nullable=True)

    def __init__(self):
        super().__init__()
        # Starts unset; extract() yields None until a detector is assigned
        # from outside (presumably by the caller, per page - confirm).
        self.rev_detector: Union[mwreverts.Detector, None] = None

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
        if self.rev_detector is None:
            return None

        revision = revisions[-1]
        if revision.deleted.text:
            return None

        # The detector is fed the revision's sha1 checksum with the revision
        # id attached; a None result means no revert was reported.
        revert = self.rev_detector.process(revision.sha1, revision.id)
        if revert is None:
            return None

        return ",".join(map(str, revert.reverteds))


class RevisionCollapsed(RevisionField[int]):
    """Number of revisions in the set passed to :meth:`extract`."""

    field = pa.field("collapsed_revs", pa.int64())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        return len(revisions)