This will allow making columns optional, as desired, and make adding new columns straightforward without impacting existing behavior. Signed-off-by: Will Beason <willbeason@gmail.com>
137 lines
3.9 KiB
Python
137 lines
3.9 KiB
Python
from abc import abstractmethod, ABC
|
|
from datetime import datetime, timezone
|
|
from hashlib import sha1
|
|
from typing import Generic, TypeVar
|
|
|
|
import mwtypes
|
|
import mwxml
|
|
|
|
import pyarrow as pa
|
|
|
|
T = TypeVar('T')
|
|
|
|
|
|
class RevisionField(ABC, Generic[T]):
    """
    Base class for one field of a table of page revisions.

    Concrete subclasses implement :meth:`extract` to produce the value of
    type ``T`` for a page and a set of its revisions.
    """

    def __init__(self, field: pa.Field):
        # The pyarrow field object supplied by the caller, stored as-is.
        self.field = field

    @abstractmethod
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> T:
        """
        Compute this field's value for one set of revisions.

        :param page: The page the revisions belong to.
        :param revisions: The revisions to compute the value from, in
          chronological order; ``revisions[-1]`` is the most recent one.
        :return: The value of this field for the given revision set.
        """
        ...
|
|
|
|
|
|
class RevisionTableColumn(Generic[T]):
    """
    Buffer of values extracted by a single RevisionField.

    Each call to :meth:`add` appends one value; :meth:`pop_column` hands the
    buffered values back and starts a fresh buffer.
    """

    def __init__(self, field: RevisionField[T]):
        self.data: list[T] = []
        self.field: RevisionField = field

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
        """
        Extract the field's value for a revision set and buffer it.

        :param page: The page the revisions belong to.
        :param revisions: The set of revisions passed on to the field.
        """
        value = self.field.extract(page, revisions)
        self.data.append(value)

    def pop_column(self) -> list[T]:
        """
        Return all buffered values and reset the buffer to an empty list.

        :return: The list of values accumulated since the last call.
        """
        drained, self.data = self.data, []
        return drained
|
|
|
|
|
|
class RevisionTable:
    """
    A collection of columns that are all fed the same revision sets.
    """

    columns: list[RevisionTableColumn]

    def __init__(self, columns: list[RevisionTableColumn] | None = None):
        """
        :param columns: The columns making up the table. Optional so that
          existing callers which construct the table without arguments and
          assign ``columns`` afterwards keep working.
        """
        # Always bind the attribute: without this a freshly constructed table
        # raises AttributeError in add_revision_set, because the class body
        # only annotates ``columns`` and never assigns it.
        self.columns = list(columns) if columns is not None else []

    def add_revision_set(self, page: mwtypes.Page, revisions: list[mwxml.Revision]):
        """
        Add one row to the table by passing the revision set to every column.

        :param page: The page the revisions belong to.
        :param revisions: The set of revisions forwarded to each column.
        """
        for column in self.columns:
            column.add(page, revisions)
|
|
|
|
|
|
class RevisionId(RevisionField[int]):
    """
    Field holding the id of the last revision in the set.
    """

    def extract(self, _: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """
        :param _: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: The ``id`` attribute of the last revision.
        """
        return revisions[-1].id
|
|
|
|
|
|
class RevisionTimestamp(RevisionField[datetime]):
    """
    Field holding the timestamp of the last revision in the set.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> datetime:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: The ``timestamp`` attribute of the last revision.
        """
        return revisions[-1].timestamp
|
|
|
|
|
|
class RevisionEditorId(RevisionField[int | None]):
    """
    Field holding the user id of the editor of the last revision in the set.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: The editor's user id, or None when the user is flagged as
          deleted or the user has no id.
        """
        latest = revisions[-1]
        if latest.deleted.user:
            return None

        # A missing id is already None, so it can be returned directly.
        return latest.user.id
|
|
|
|
|
|
class RevisionAnon(RevisionField[bool | None]):
    """
    Field telling whether the editor of the last revision has no user id.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool | None:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: None when the user is flagged as deleted; otherwise True if
          the user's id is None and False if it is set.
        """
        latest = revisions[-1]
        return None if latest.deleted.user else (latest.user.id is None)
|
|
|
|
|
|
class RevisionEditorText(RevisionField[str | None]):
    """
    Field holding the ``text`` of the user who made the last revision.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: None when the user is flagged as deleted; otherwise the
          ``text`` attribute of the revision's user.
        """
        latest = revisions[-1]
        return None if latest.deleted.user else latest.user.text
|
|
|
|
|
|
class RevisionPageTitle(RevisionField[str]):
    """
    Field holding the title of the page the revisions belong to.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        """
        :param page: The page whose title is returned.
        :param revisions: The set of revisions; not used by this field.
        :return: The ``title`` attribute of the page.
        """
        title = page.title
        return title
|
|
|
|
|
|
class RevisionDeleted(RevisionField[bool]):
    """
    Field holding the deleted-text flag of the last revision in the set.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: The ``deleted.text`` attribute of the last revision.
        """
        return revisions[-1].deleted.text
|
|
|
|
|
|
class RevisionNamespace(RevisionField[int]):
    """
    Field holding the namespace of the page the revisions belong to.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """
        :param page: The page whose namespace is returned.
        :param revisions: The set of revisions; not used by this field.
        :return: The ``namespace`` attribute of the page.
        """
        namespace = page.namespace
        return namespace
|
|
|
|
|
|
class RevisionSha1(RevisionField[str]):
    """
    Field holding the SHA-1 digest of the last revision in the set.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: The revision's own ``sha1`` value when it has one, otherwise
          the hexadecimal SHA-1 digest of the revision text encoded as UTF-8.
        """
        revision = revisions[-1]
        if revision.sha1:
            return revision.sha1

        # Fall back to hashing the revision text. Hashing ``revision.sha1``
        # here could never work: on this path it is falsy (None or ""), and
        # hashlib only accepts bytes-like input.
        # NOTE(review): this still raises if ``revision.text`` is None (e.g.
        # when the text is deleted) - confirm how such revisions should be
        # represented in this column.
        return sha1(revision.text.encode("utf-8")).hexdigest()
|
|
|
|
|
|
class RevisionTextChars(RevisionField[int]):
    """
    Field holding the length of the text of the last revision in the set.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: ``len()`` of the last revision's ``text`` attribute.
        """
        text = revisions[-1].text
        return len(text)
|
|
|
|
|
|
class RevisionMinor(RevisionField[bool]):
    """
    Field holding the minor-edit flag of the last revision in the set.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions; only the last one is read.
        :return: The ``minor`` attribute of the last revision.
        """
        return revisions[-1].minor
|
|
|
|
|
|
class RevisionCollapse(RevisionField[int]):
    """
    Field holding the number of revisions in the set.
    """

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """
        :param page: The page; not used by this field.
        :param revisions: The set of revisions to count.
        :return: How many revisions the set contains.
        """
        revision_count = len(revisions)
        return revision_count
|
|
|
|
|
|
|