from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
from typing import Generic, TypeVar

import mwtypes
import mwxml
import pyarrow as pa

T = TypeVar('T')


class RevisionField(ABC, Generic[T]):
    """
    Abstract type which represents a field in a table of page revisions.

    Each concrete subclass computes a single value of type ``T`` from a page
    and a set of its revisions.
    """

    def __init__(self, field: pa.Field):
        """
        :param field: The pyarrow field stored on this instance as ``self.field``.
        """
        self.field = field

    @abstractmethod
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> T:
        """
        Compute this field's value for one set of revisions.

        :param page: The page for this set of revisions.
        :param revisions: The set of revisions to compute the field from.
        Revisions are passed in chronological order, so use revisions[-1] to
        access the most recent revision in the set.
        :return: The value of this field for the given revision set.
        """


class RevisionTableColumn(Generic[T]):
    """
    Accumulates the values one ``RevisionField`` extracts, one per revision set.
    """

    def __init__(self, field: RevisionField[T]):
        """
        :param field: The field whose ``extract`` method fills this column.
        """
        self.field: RevisionField = field
        self.data: list[T] = []

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
        """
        Extract the field value for ``revisions`` and append it to the column.
        """
        self.data.append(self.field.extract(page, revisions))

    def pop_column(self) -> list[T]:
        """
        Return every value accumulated so far and reset the column to empty.

        The returned list is the previous buffer itself (not a copy); the
        column continues with a brand new list.
        """
        data = self.data
        self.data = []
        return data


class RevisionTable:
    """
    A group of columns that are all fed from the same revision sets.
    """

    # Only annotated here: nothing in this class assigns `columns`, so it has
    # to be set elsewhere before `add_revision_set` is called.
    columns: list[RevisionTableColumn]

    def add_revision_set(self, page: mwtypes.Page, revisions: list[mwxml.Revision]):
        """
        Append one value to every column for the given page and revisions.
        """
        for column in self.columns:
            column.add(page, revisions)


class RevisionId(RevisionField[int]):
    """The ``id`` of the most recent revision in the set."""

    def extract(self, _: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """Return ``revisions[-1].id``; the page argument is ignored."""
        revision = revisions[-1]
        return revision.id


class RevisionTimestamp(RevisionField[datetime]):
    """The ``timestamp`` of the most recent revision in the set."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> datetime:
        """Return ``revisions[-1].timestamp``."""
        revision = revisions[-1]
        return revision.timestamp


class RevisionEditorId(RevisionField[int | None]):
    """The user id of the editor of the most recent revision, if available."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
        """
        Return ``revisions[-1].user.id``, or ``None`` when the revision's
        ``deleted.user`` flag is set or the user has no id.
        """
        revision = revisions[-1]
        # Check the deleted flag first so `revision.user` is never touched
        # when the user information has been flagged as deleted.
        if revision.deleted.user or revision.user.id is None:
            return None
        return revision.user.id


class RevisionAnon(RevisionField[bool | None]):
    """Whether the editor of the most recent revision has no user id."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool | None:
        """
        Return ``True`` when ``revisions[-1].user.id`` is ``None``, ``False``
        when it is not, and ``None`` when the revision's ``deleted.user`` flag
        is set.
        """
        revision = revisions[-1]
        if revision.deleted.user:
            return None
        return revision.user.id is None


class RevisionEditorText(RevisionField[str | None]):
    """The ``user.text`` of the editor of the most recent revision."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
        """
        Return ``revisions[-1].user.text``, or ``None`` when the revision's
        ``deleted.user`` flag is set.
        """
        revision = revisions[-1]
        if revision.deleted.user:
            return None
        return revision.user.text


class RevisionPageTitle(RevisionField[str]):
    """The title of the page the revisions belong to."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        """Return ``page.title``; the revisions are not inspected."""
        return page.title


class RevisionDeleted(RevisionField[bool]):
    """The ``deleted.text`` flag of the most recent revision."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        """Return ``revisions[-1].deleted.text``."""
        revision = revisions[-1]
        return revision.deleted.text


class RevisionNamespace(RevisionField[int]):
    """The namespace of the page the revisions belong to."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """Return ``page.namespace``; the revisions are not inspected."""
        return page.namespace


class RevisionSha1(RevisionField[str | None]):
    """The SHA-1 of the most recent revision, computed from its text if absent."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
        """
        Return ``revisions[-1].sha1`` when it is set. Otherwise return the
        hexadecimal SHA-1 digest of the revision text encoded as UTF-8, or
        ``None`` when the revision has no text to hash.
        """
        revision = revisions[-1]
        if revision.sha1:
            return revision.sha1
        # This branch is reached only when `revision.sha1` is missing, so the
        # digest has to be derived from the text, not from the missing value.
        if revision.text is None:
            return None
        # NOTE(review): a sha1 supplied by the revision may not use the same
        # encoding as `hexdigest()` -- confirm if the two are ever compared.
        return sha1(revision.text.encode("utf-8")).hexdigest()


class RevisionTextChars(RevisionField[int | None]):
    """The number of characters in the text of the most recent revision."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
        """
        Return ``len(revisions[-1].text)``, or ``None`` when the revision has
        no text.
        """
        revision = revisions[-1]
        # Presumably text is None when it has been suppressed in the dump;
        # report that as missing instead of raising TypeError from len(None).
        if revision.text is None:
            return None
        return len(revision.text)


class RevisionMinor(RevisionField[bool]):
    """The ``minor`` flag of the most recent revision."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        """Return ``revisions[-1].minor``."""
        revision = revisions[-1]
        return revision.minor


class RevisionCollapse(RevisionField[int]):
    """The number of revisions in the set."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """Return ``len(revisions)``; the page is not inspected."""
        return len(revisions)