mediawiki_dump_tools/tables.py
Will Beason 8b0f775610 Begin move to columnar types
This will allow making columns optional, as desired, and make
adding new columns straightforward without impacting existing
behavior.

Signed-off-by: Will Beason <willbeason@gmail.com>
2025-06-03 08:52:57 -05:00

137 lines
3.9 KiB
Python

from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
from typing import Generic, TypeVar
import mwtypes
import mwxml
import pyarrow as pa
# Type of the value that a RevisionField extracts and a RevisionTableColumn stores.
T = TypeVar('T')
class RevisionField(ABC, Generic[T]):
    """
    Base class for a single field of a table of page revisions.

    Concrete fields subclass this and implement :meth:`extract`, which
    produces the value of type ``T`` stored for one set of revisions.
    """

    def __init__(self, field: pa.Field):
        # Field definition supplied by the caller; only stored here.
        self.field = field

    @abstractmethod
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> T:
        """
        Compute this field's value for one set of revisions.

        :param page: The page for this set of revisions.
        :param revisions: The set of revisions to compute the field from.
            They are passed in chronological order, so ``revisions[-1]`` is
            the most recent revision in the set.
        :return: The value of this field for the given set of revisions.
        """
class RevisionTableColumn(Generic[T]):
    """
    Buffer of extracted values for one RevisionField.

    Each call to :meth:`add` appends one value; :meth:`pop_column` hands the
    buffered values to the caller and starts a new, empty buffer.
    """

    def __init__(self, field: RevisionField[T]):
        self.data: list[T] = []
        self.field: RevisionField = field

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
        """
        Extract the field's value for a set of revisions and buffer it.

        :param page: The page for this set of revisions.
        :param revisions: The set of revisions passed on to the field.
        """
        value = self.field.extract(page, revisions)
        self.data.append(value)

    def pop_column(self) -> list[T]:
        """
        Return all buffered values and reset the buffer.

        :return: The list that was buffered so far; the column keeps a new
            empty list afterwards, so the returned list is not shared.
        """
        popped, self.data = self.data, []
        return popped
class RevisionTable:
    """
    A group of columns that are filled together.

    ``columns`` is only declared in this class, not assigned; it has to be
    set (by a subclass or by the caller) before :meth:`add_revision_set`
    is used.
    """

    columns: list[RevisionTableColumn]

    def add_revision_set(self, page: mwtypes.Page, revisions: list[mwxml.Revision]):
        """
        Add one set of revisions to every column of the table.

        :param page: The page for this set of revisions.
        :param revisions: The set of revisions handed to each column.
        """
        for table_column in self.columns:
            table_column.add(page, revisions)
class RevisionId(RevisionField[int]):
    """Field holding the id of the most recent revision in the set."""

    def extract(self, _: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """
        :param _: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: The ``id`` of ``revisions[-1]``.
        """
        return revisions[-1].id
class RevisionTimestamp(RevisionField[datetime]):
    """Field holding the timestamp of the most recent revision in the set."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> datetime:
        """
        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: The ``timestamp`` attribute of ``revisions[-1]``, unchanged.
        """
        # NOTE(review): the value is returned as-is; confirm that mwxml's
        # revision timestamp really is a datetime as the annotation claims.
        return revisions[-1].timestamp
class RevisionEditorId(RevisionField[int | None]):
    """Field holding the user id of the most recent revision's editor."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
        """
        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: The editor's user id, or ``None`` when the user is flagged
            as deleted or has no id.
        """
        latest = revisions[-1]
        if latest.deleted.user:
            return None
        # A missing id is already None, so it is passed through unchanged.
        return latest.user.id
class RevisionAnon(RevisionField[bool | None]):
    """Field telling whether the most recent revision's editor has no user id."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool | None:
        """
        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: ``None`` when the user is flagged as deleted; otherwise
            ``True`` if the editor's id is ``None`` and ``False`` if not.
        """
        latest = revisions[-1]
        return None if latest.deleted.user else (latest.user.id is None)
class RevisionEditorText(RevisionField[str | None]):
    """Field holding the ``text`` of the most recent revision's editor."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
        """
        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: ``user.text`` of ``revisions[-1]``, or ``None`` when the
            user is flagged as deleted.
        """
        latest = revisions[-1]
        return None if latest.deleted.user else latest.user.text
class RevisionPageTitle(RevisionField[str]):
    """Field holding the title of the page the revisions belong to."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        """
        :param page: The page for this set of revisions.
        :param revisions: The set of revisions; not used.
        :return: ``page.title``.
        """
        title = page.title
        return title
class RevisionDeleted(RevisionField[bool]):
    """Field holding the text-deleted flag of the most recent revision."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        """
        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: ``deleted.text`` of ``revisions[-1]``.
        """
        return revisions[-1].deleted.text
class RevisionNamespace(RevisionField[int]):
    """Field holding the namespace of the page the revisions belong to."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """
        :param page: The page for this set of revisions.
        :param revisions: The set of revisions; not used.
        :return: ``page.namespace``.
        """
        namespace = page.namespace
        return namespace
class RevisionSha1(RevisionField[str]):
    """Field holding a SHA-1 for the text of the most recent revision."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        """
        Return the revision's own ``sha1`` if it has one, else compute one.

        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: ``sha1`` of ``revisions[-1]`` when it is set and non-empty;
            otherwise the hex SHA-1 digest of the revision's UTF-8 encoded
            text, with missing text treated as the empty string.
        """
        revision = revisions[-1]
        if revision.sha1:
            return revision.sha1

        # The fallback must hash the revision *text*. Hashing revision.sha1
        # here (known to be None or "" at this point) always raised TypeError,
        # because hashlib only accepts bytes-like input.
        # NOTE(review): the stored value and this hexdigest may use different
        # encodings - confirm that consumers can handle both.
        text = revision.text or ""
        return sha1(text.encode("utf8")).hexdigest()
class RevisionTextChars(RevisionField[int]):
    """Field holding the length of the most recent revision's text."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """
        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: ``len()`` of the text of ``revisions[-1]``; 0 when the
            revision has no text.
        """
        revision = revisions[-1]
        # A revision without text would make len() raise TypeError, so count
        # missing text as empty.
        return len(revision.text or "")
class RevisionMinor(RevisionField[bool]):
    """Field holding the ``minor`` flag of the most recent revision."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        """
        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions, most recent last.
        :return: ``minor`` of ``revisions[-1]``.
        """
        return revisions[-1].minor
class RevisionCollapse(RevisionField[int]):
    """Field holding the number of revisions in the set."""

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        """
        :param page: The page for this set of revisions; not used.
        :param revisions: The set of revisions to count.
        :return: ``len(revisions)``.
        """
        revision_count = len(revisions)
        return revision_count