229 lines
6.6 KiB
Python
229 lines
6.6 KiB
Python
import sys
|
|
from abc import abstractmethod, ABC
|
|
from datetime import datetime, timezone
|
|
from hashlib import sha1
|
|
from typing import Generic, TypeVar, Union
|
|
|
|
import mwreverts
|
|
import mwtypes
|
|
import mwxml
|
|
|
|
import pyarrow as pa
|
|
|
|
# Type of the value a RevisionField extracts for each row; used below as
# the parameter of Generic[T] and as the return type of extract().
T = TypeVar('T')
|
|
|
|
|
|
class RevisionField(ABC, Generic[T]):
    """
    Abstract type which represents a field in a table of page revisions.

    A concrete field declares the output column it fills (``field``) and how
    to compute one value of that column from a page and a set of its
    revisions (``extract``). Values are buffered in ``self.data`` by ``add``
    until ``pop`` hands them off.
    """

    def __init__(self):
        # Values extracted so far, one per call to add(), in call order.
        self.data: list[T] = []

    @property
    @abstractmethod
    def field(self) -> pa.Field:
        """The pyarrow field describing the column this object fills."""
        pass

    @abstractmethod
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> T:
        """
        :param page: The page for this set of revisions.
        :param revisions: The set of revisions to compute the field from.
        Revisions are passed in chronological order, so use revisions[-1] to
        access the most recent revision in the set.

        Implementations of extract should handle the case where revisions is
        either a single revision (collapse-user=FALSE), or a full edit session
        of contiguous edits by the same user (collapse-user=TRUE).
        """
        pass

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> None:
        """Extract the value for ``revisions`` and append it to the buffer."""
        self.data.append(self.extract(page, revisions))

    def pop(self) -> list[T]:
        """Return every buffered value and leave the buffer empty."""
        data = self.data
        # Rebind rather than clear(): the caller keeps the returned list.
        self.data = []
        return data
|
|
|
|
|
|
class RevisionTable:
    """An ordered group of RevisionField columns that are filled and drained together."""

    columns: list[RevisionField]

    def __init__(self, columns: list[RevisionField]):
        self.columns = columns

    def add(self, page: mwtypes.Page, revisions: list[mwxml.Revision]):
        """Pass the same page and revisions to every column, in column order."""
        for col in self.columns:
            col.add(page=page, revisions=revisions)

    def schema(self) -> pa.Schema:
        """Build a pyarrow schema from each column's ``field``, in column order."""
        fields = [col.field for col in self.columns]
        return pa.schema(fields)

    def pop(self) -> dict:
        """Drain every column and return a mapping of field name to its drained values."""
        return {col.field.name: col.pop() for col in self.columns}
|
|
|
|
|
|
class RevisionId(RevisionField[int]):
    """Column ``revid``: the id of the most recent revision in the set."""

    field = pa.field("revid", pa.int64())

    def extract(self, _: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        # The page argument is not needed for this column.
        return revisions[-1].id
|
|
|
|
|
|
class RevisionTimestamp(RevisionField[datetime]):
    """Column ``date_time``: the timestamp of the most recent revision in the set."""

    # Stored with second resolution.
    field = pa.field("date_time", pa.timestamp('s'))

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> datetime:
        latest = revisions[-1]
        return latest.timestamp
|
|
|
|
|
|
class RevisionArticleId(RevisionField[int]):
    """Column ``articleid``: the id of the page the revisions belong to."""

    field = pa.field("articleid", pa.int64())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        # Page-level value; the revisions themselves are not consulted.
        article_id = page.id
        return article_id
|
|
|
|
|
|
class RevisionEditorId(RevisionField[Union[int, None]]):
    """Column ``editorid``: user id on the most recent revision, or null."""

    field = pa.field("editorid", pa.int64(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
        latest = revisions[-1]
        # When the user is flagged as deleted, emit a null and do not touch
        # latest.user at all.
        return None if latest.deleted.user else latest.user.id
|
|
|
|
|
|
class RevisionEditSummary(RevisionField[Union[str, None]]):
    """Column ``edit_summary``: the comment attached to the most recent revision."""

    field = pa.field("edit_summary", pa.string(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
        return revisions[-1].comment
|
|
|
|
class RevisionIsAnon(RevisionField[Union[bool, None]]):
    """Column ``anon``: true when the most recent revision's user has no id."""

    field = pa.field("anon", pa.bool_(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[bool, None]:
        latest = revisions[-1]
        # Null (not False) when the user is flagged as deleted.
        return None if latest.deleted.user else (latest.user.id is None)
|
|
|
|
|
|
class RevisionEditorText(RevisionField[Union[str, None]]):
    """Column ``editor``: ``user.text`` of the most recent revision, or null."""

    field = pa.field("editor", pa.string(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
        latest = revisions[-1]
        # When the user is flagged as deleted, emit a null and do not touch
        # latest.user at all.
        return None if latest.deleted.user else latest.user.text
|
|
|
|
|
|
class RevisionPageTitle(RevisionField[str]):
    """Column ``title``: the title of the page the revisions belong to."""

    field = pa.field("title", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        # Page-level value; the revisions themselves are not consulted.
        title = page.title
        return title
|
|
|
|
|
|
class RevisionDeleted(RevisionField[bool]):
    """Column ``deleted``: the ``deleted.text`` flag of the most recent revision."""

    field = pa.field("deleted", pa.bool_())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        return revisions[-1].deleted.text
|
|
|
|
|
|
class RevisionNamespace(RevisionField[int]):
    """Column ``namespace``: the namespace of the page the revisions belong to."""

    field = pa.field("namespace", pa.int32())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        # Page-level value; the revisions themselves are not consulted.
        namespace = page.namespace
        return namespace
|
|
|
|
|
|
class RevisionSha1(RevisionField[str]):
    """Column ``sha1``: the ``sha1`` attribute of the most recent revision."""

    field = pa.field("sha1", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        return revisions[-1].sha1
|
|
|
|
|
|
class RevisionTextChars(RevisionField[Union[int, None]]):
    """Column ``text_chars``: length of the most recent revision's text, or null."""

    field = pa.field("text_chars", pa.int32(), nullable=True)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
        latest = revisions[-1]
        # No length is reported when the text is flagged as deleted.
        if latest.deleted.text:
            return None

        return len(latest.text)
|
|
|
|
|
|
# NOTE(review): a class named RevisionText is defined a second time at the end
# of this module; once the module has finished importing, that later
# definition is the one bound to the name and this one is shadowed.
class RevisionText(RevisionField[str]):
    """Column ``text``: the text of the most recent revision in the set."""

    field = pa.field("text", pa.string())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        return revisions[-1].text
|
|
|
|
|
|
class RevisionIsMinor(RevisionField[bool]):
    """Column ``minor``: the ``minor`` flag of the most recent revision."""

    field = pa.field("minor", pa.bool_())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool:
        return revisions[-1].minor
|
|
|
|
|
|
class RevisionReverts(RevisionField[Union[str, None]]):
    """
    Column ``reverteds``: comma-separated ``reverteds`` of the revert reported
    by the detector for the most recent revision, or null.
    """

    field = pa.field("reverteds", pa.string(), nullable=True)

    def __init__(self):
        super().__init__()
        # Starts unset and is never assigned inside this class; while it is
        # None every extracted value is null.
        self.rev_detector: Union[mwreverts.Detector, None] = None

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
        detector = self.rev_detector
        if detector is None:
            return None

        latest = revisions[-1]
        # Revisions whose text is flagged as deleted are never passed to the
        # detector.
        if latest.deleted.text:
            return None

        # presumably process() returns None when this revision is not a
        # revert -- confirm against the mwreverts documentation.
        revert = detector.process(latest.sha1, latest.id)
        if revert is None:
            return None

        return ",".join(map(str, revert.reverteds))
|
|
|
|
|
|
class RevisionCollapsed(RevisionField[int]):
    """Column ``collapsed_revs``: how many revisions were passed in for this row."""

    field = pa.field("collapsed_revs", pa.int64())

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        count = len(revisions)
        return count
|
|
|
|
|
|
# NOTE(review): this redefines the RevisionText class declared earlier in this
# module; being the later definition, it is the one the name refers to.
class RevisionText(RevisionField[str]):
    """Column ``text``: the text of the most recent revision in the set."""

    # NOTE(review): declared non-nullable, while RevisionTextChars treats text
    # flagged as deleted as missing -- confirm revision.text is never None for
    # rows written with this column.
    field = pa.field("text", pa.string(), nullable=False)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        return revisions[-1].text
|