mediawiki_dump_tools/tables.py
Will Beason 390499dd90 Pin to python 3.9
Since our execution environment requires this

Signed-off-by: Will Beason <willbeason@gmail.com>
2025-06-17 11:37:20 -05:00

209 lines
5.8 KiB
Python

import sys
from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
from typing import Generic, TypeVar, Union
import mwreverts
import mwtypes
import mwxml
import pyarrow as pa
# Type of the per-row value that a RevisionField extracts and buffers.
T = TypeVar('T')
class RevisionField(ABC, Generic[T]):
    """
    Abstract type which represents a field in a table of page revisions.

    Each instance buffers the values it extracts in ``self.data`` until
    they are drained with :meth:`pop`.
    """

    def __init__(self):
        # Values extracted since construction or since the last pop().
        self.data: list[T] = []

    @property
    @abstractmethod
    def field(self) -> pa.Field:
        """The pyarrow field describing this column."""
        pass

    @abstractmethod
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> T:
        """
        Compute this field's value for one set of revisions.

        :param page: The page for this set of revisions.
        :param revisions: The set of revisions to compute the field from.
        Revisions are passed in chronological order, so use revisions[-1] to
        access the most recent revision in the set.
        :return: The value to store for this set of revisions.
        """
        pass

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
        """Extract the value for ``revisions`` and append it to the buffer."""
        self.data.append(self.extract(page, revisions))

    def pop(self) -> list[T]:
        """Return all buffered values and reset the buffer to an empty list."""
        data = self.data
        self.data = []
        return data
class RevisionTable:
    """An ordered group of RevisionField columns that are filled together."""

    columns: list[RevisionField]

    def __init__(self, columns: list[RevisionField]):
        self.columns = columns

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]):
        """Pass one (page, revisions) pair to every column."""
        for col in self.columns:
            col.add(page=page, revisions=revisions)

    def schema(self) -> pa.Schema:
        """Build a pyarrow schema from the columns' fields, in column order."""
        fields = [col.field for col in self.columns]
        return pa.schema(fields)

    def pop(self) -> dict:
        """Drain every column and return a mapping of field name to values."""
        return {col.field.name: col.pop() for col in self.columns}
class RevisionId(RevisionField[int]):
    """Column ``revid``: the id of the last revision in each set."""

    field = pa.field("revid", pa.int64())

    def extract(self, _: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        # The page argument is unused; only the final revision is consulted.
        return revisions[-1].id
class RevisionTimestamp(RevisionField[datetime]):
    """Column ``date_time``: the timestamp of the last revision in each set."""

    field = pa.field("date_time", pa.timestamp("s"))

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> datetime:
        latest = revisions[-1]
        return latest.timestamp
class RevisionArticleId(RevisionField[int]):
    """Column ``articleid``: the id of the page the revisions belong to."""

    field = pa.field("articleid", pa.int64())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        # Depends only on the page; the revisions are not inspected.
        article_id = page.id
        return article_id
class RevisionEditorId(RevisionField[Union[int, None]]):
    """Column ``editorid``: the user id on the last revision, if visible."""

    field = pa.field("editorid", pa.int64(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
        latest = revisions[-1]
        # A deleted user yields a null entry rather than an id.
        return None if latest.deleted.user else latest.user.id
class RevisionIsAnon(RevisionField[Union[bool, None]]):
    """Column ``anon``: whether the last revision's user has no id."""

    field = pa.field("anon", pa.bool_(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[bool, None]:
        latest = revisions[-1]
        # A deleted user yields a null entry instead of a flag.
        return None if latest.deleted.user else latest.user.id is None
class RevisionEditorText(RevisionField[Union[str, None]]):
    """Column ``editor``: the user text on the last revision, if visible."""

    field = pa.field("editor", pa.string(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
        latest = revisions[-1]
        # A deleted user yields a null entry rather than a name.
        return None if latest.deleted.user else latest.user.text
class RevisionPageTitle(RevisionField[str]):
    """Column ``title``: the title of the page the revisions belong to."""

    field = pa.field("title", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        # Depends only on the page; the revisions are not inspected.
        title = page.title
        return title
class RevisionDeleted(RevisionField[bool]):
    """Column ``deleted``: the text-deleted flag of the last revision."""

    field = pa.field("deleted", pa.bool_())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        latest = revisions[-1]
        return latest.deleted.text
class RevisionNamespace(RevisionField[int]):
    """Column ``namespace``: the namespace of the page."""

    field = pa.field("namespace", pa.int32())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        # Depends only on the page; the revisions are not inspected.
        namespace = page.namespace
        return namespace
class RevisionSha1(RevisionField[str]):
    """Column ``sha1``: the sha1 attribute of the last revision in each set."""

    field = pa.field("sha1", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        latest = revisions[-1]
        return latest.sha1
class RevisionTextChars(RevisionField[Union[int, None]]):
    """Column ``text_chars``: length of the last revision's text, if present."""

    field = pa.field("text_chars", pa.int32(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
        latest = revisions[-1]
        # Deleted text has no length to report, so the entry is null.
        if latest.deleted.text:
            return None
        return len(latest.text)
class RevisionText(RevisionField[str]):
    """Column ``text``: the text of the last revision in each set."""

    field = pa.field("text", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        latest = revisions[-1]
        return latest.text
class RevisionIsMinor(RevisionField[bool]):
    """Column ``minor``: the minor flag of the last revision in each set."""

    field = pa.field("minor", pa.bool_())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        latest = revisions[-1]
        return latest.minor
class RevisionReverts(RevisionField[Union[str, None]]):
    """
    Column ``reverteds``: comma-separated ids reported as reverted by the
    last revision in each set, or null when there is nothing to report.
    """

    field = pa.field("reverteds", pa.string(), nullable=True)

    def __init__(self):
        super().__init__()
        # Starts unset; nothing in this class assigns it, so a detector is
        # presumably attached by the caller before revisions are added.
        self.rev_detector: Union[mwreverts.Detector, None] = None

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
        detector = self.rev_detector
        # Without a detector no revert information can be produced.
        if detector is None:
            return None

        latest = revisions[-1]
        # Revisions with deleted text are not fed to the detector.
        if latest.deleted.text:
            return None

        revert = detector.process(latest.sha1, latest.id)
        if revert is None:
            return None

        return ",".join(str(reverted) for reverted in revert.reverteds)
class RevisionCollapsed(RevisionField[int]):
    """Column ``collapsed_revs``: how many revisions are in each set."""

    field = pa.field("collapsed_revs", pa.int64())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        revision_count = len(revisions)
        return revision_count