diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..bd28b9c --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.9 diff --git a/pyproject.toml b/pyproject.toml index eafb09f..c54cef6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "mediawiki-dump-tools" version = "0.1.0" description = "Add your description here" readme = "README.md" -requires-python = ">=3.11" +requires-python = "~=3.9" dependencies = [ "deltas>=0.7.0", "mediawiki-utilities>=0.4.18", @@ -12,7 +12,7 @@ dependencies = [ "mwtypes>=0.4.0", "mwxml>=0.3.6", "pyarrow>=20.0.0", - "yamlconf", + "yamlconf>=0.2.6", ] [tool.uv.sources] diff --git a/tables.py b/tables.py index f911b90..2c0d204 100644 --- a/tables.py +++ b/tables.py @@ -2,7 +2,7 @@ import sys from abc import abstractmethod, ABC from datetime import datetime, timezone from hashlib import sha1 -from typing import Generic, TypeVar +from typing import Generic, TypeVar, Union import mwreverts import mwtypes @@ -89,10 +89,10 @@ class RevisionArticleId(RevisionField[int]): return page.id -class RevisionEditorId(RevisionField[int | None]): +class RevisionEditorId(RevisionField[Union[int, None]]): field = pa.field("editorid", pa.int64(), nullable=True) - def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None: + def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]: revision = revisions[-1] if revision.deleted.user: return None @@ -100,10 +100,10 @@ class RevisionEditorId(RevisionField[int | None]): return revision.user.id -class RevisionIsAnon(RevisionField[bool | None]): +class RevisionIsAnon(RevisionField[Union[bool, None]]): field = pa.field("anon", pa.bool_(), nullable=True) - def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool | None: + def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[bool, None]: revision = revisions[-1] if revision.deleted.user: return None @@ -111,10 +111,10 @@ class RevisionIsAnon(RevisionField[bool | None]): return revision.user.id is None -class RevisionEditorText(RevisionField[str | None]): +class RevisionEditorText(RevisionField[Union[str, None]]): field = pa.field("editor", pa.string(), nullable=True) - def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None: + def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]: revision = revisions[-1] if revision.deleted.user: return None @@ -152,10 +152,10 @@ class RevisionSha1(RevisionField[str]): return revision.sha1 -class RevisionTextChars(RevisionField[int | None]): +class RevisionTextChars(RevisionField[Union[int, None]]): field = pa.field("text_chars", pa.int32(), nullable=True) - def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None: + def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]: revision = revisions[-1] if not revision.deleted.text: return len(revision.text) @@ -179,14 +179,14 @@ class RevisionIsMinor(RevisionField[bool]): return revision.minor -class RevisionReverts(RevisionField[str | None]): +class RevisionReverts(RevisionField[Union[str, None]]): def __init__(self): super().__init__() - self.rev_detector: mwreverts.Detector | None = None + self.rev_detector: Union[mwreverts.Detector, None] = None field = pa.field("reverteds", pa.string(), nullable=True) - def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None: + def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]: if self.rev_detector is None: return None diff --git a/wikiq b/wikiq index 32882a3..c5e83a6 100755 --- a/wikiq +++ b/wikiq @@ -14,7 +14,7 @@ from itertools import groupby from subprocess import Popen, PIPE from collections import deque from hashlib import sha1 -from typing import Any, IO, TextIO, Generator +from typing import Any, IO, TextIO, Generator, Union import mwxml from mwxml import Dump @@ -100,7 +100,7 @@ class WikiqPage: self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list() @staticmethod - def user_text(rev) -> str | None: + def user_text(rev) -> Union[str, None]: return None if rev.deleted.user else rev.user.text def rev_list(self): @@ -203,15 +203,15 @@ class RegexPair(object): class WikiqParser: def __init__(self, - input_file: TextIOWrapper | IO[Any] | IO[bytes], - output_file: TextIO | str, + input_file: Union[TextIOWrapper, IO[Any], IO[bytes]], + output_file: Union[TextIO, str], regex_match_revision: list[str], regex_match_comment: list[str], regex_revision_label: list[str], regex_comment_label: list[str], collapse_user: bool = False, persist: int = None, - namespaces: list[int] | None = None, + namespaces: Union[list[int], None] = None, revert_radius: int = 15, output_parquet: bool = True, parquet_buffer_size: int = 2000): @@ -410,7 +410,7 @@ class WikiqParser: # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. row_buffer = table.pop() - is_revert_column: list[bool | None] = [] + is_revert_column: list[Union[bool, None]] = [] for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): if self.revert_radius == 0 or d: is_revert_column.append(None)