mediawiki_dump_tools/tables.py
2025-06-17 12:20:19 -05:00

221 lines
6.4 KiB
Python

import sys
from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
from typing import Generic, TypeVar, Union
import mwreverts
import mwtypes
import mwxml
import pyarrow as pa
T = TypeVar('T')
class RevisionField(ABC, Generic[T]):
def __init__(self):
self.data: list[T] = []
"""
Abstract type which represents a field in a table of page revisions.
"""
@property
@abstractmethod
def field(self) -> pa.Field:
pass
@abstractmethod
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> T:
"""
:param page: The page for this set of revisions.
:param revisions: The set of revisions to compute the field from.
Revisions are passed in chronological order, so use revisions[-1] to
access the most recent revision in the set.
Implementations of extract should handle the case where revisions is
either a single revision (collapse-user=FALSE), or a full edit session
of contiguous edits by the same user (collapse-user=TRUE).
"""
pass
def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
self.data.append(self.extract(page, revisions))
def pop(self) -> list[T]:
data = self.data
self.data = []
return data
class RevisionTable:
columns: list[RevisionField]
def __init__(self, columns: list[RevisionField]):
self.columns = columns
def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]):
for column in self.columns:
column.add(page=page, revisions=revisions)
def schema(self) -> pa.Schema:
return pa.schema([c.field for c in self.columns])
def pop(self) -> dict:
data = {}
for column in self.columns:
data[column.field.name] = column.pop()
return data
class RevisionId(RevisionField[int]):
field = pa.field("revid", pa.int64())
def extract(self, _: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
revision = revisions[-1]
return revision.id
class RevisionTimestamp(RevisionField[datetime]):
field = pa.field("date_time", pa.timestamp('s'))
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> datetime:
revision = revisions[-1]
return revision.timestamp
class RevisionArticleId(RevisionField[int]):
field = pa.field("articleid", pa.int64())
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
return page.id
class RevisionEditorId(RevisionField[Union[int, None]]):
field = pa.field("editorid", pa.int64(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
revision = revisions[-1]
if revision.deleted.user:
return None
return revision.user.id
class RevisionEditSummary(RevisionField[Union[str, None]]):
field = pa.field("edit_summary", pa.string(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
revision = revisions[-1]
return revision.comment
class RevisionIsAnon(RevisionField[Union[bool, None]]):
field = pa.field("anon", pa.bool_(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[bool, None]:
revision = revisions[-1]
if revision.deleted.user:
return None
return revision.user.id is None
class RevisionEditorText(RevisionField[Union[str, None]]):
field = pa.field("editor", pa.string(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
revision = revisions[-1]
if revision.deleted.user:
return None
return revision.user.text
class RevisionPageTitle(RevisionField[str]):
field = pa.field("title", pa.string())
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
return page.title
class RevisionDeleted(RevisionField[bool]):
field = pa.field("deleted", pa.bool_())
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
revision = revisions[-1]
return revision.deleted.text
class RevisionNamespace(RevisionField[int]):
field = pa.field("namespace", pa.int32())
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
return page.namespace
class RevisionSha1(RevisionField[str]):
field = pa.field("sha1", pa.string())
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
revision = revisions[-1]
return revision.sha1
class RevisionTextChars(RevisionField[Union[int, None]]):
field = pa.field("text_chars", pa.int32(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
revision = revisions[-1]
if not revision.deleted.text:
return len(revision.text)
return None
class RevisionText(RevisionField[str]):
field = pa.field("text", pa.string())
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
revision = revisions[-1]
return revision.text
class RevisionIsMinor(RevisionField[bool]):
field = pa.field("minor", pa.bool_())
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
revision = revisions[-1]
return revision.minor
class RevisionReverts(RevisionField[Union[str, None]]):
def __init__(self):
super().__init__()
self.rev_detector: Union[mwreverts.Detector, None] = None
field = pa.field("reverteds", pa.string(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
if self.rev_detector is None:
return None
revision = revisions[-1]
if revision.deleted.text:
return None
revert = self.rev_detector.process(revision.sha1, revision.id)
if revert is None:
return None
return ",".join([str(s) for s in revert.reverteds])
class RevisionCollapsed(RevisionField[int]):
field = pa.field("collapsed_revs", pa.int64())
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
return len(revisions)