Pin to Python 3.9

Our execution environment requires this version.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-06-17 11:37:20 -05:00
parent 84d464ea38
commit 390499dd90
4 changed files with 21 additions and 20 deletions

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.9

View File

@@ -3,7 +3,7 @@ name = "mediawiki-dump-tools"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
requires-python = "~=3.9"
dependencies = [
"deltas>=0.7.0",
"mediawiki-utilities>=0.4.18",
@@ -12,7 +12,7 @@ dependencies = [
"mwtypes>=0.4.0",
"mwxml>=0.3.6",
"pyarrow>=20.0.0",
"yamlconf",
"yamlconf>=0.2.6",
]
[tool.uv.sources]

View File

@@ -2,7 +2,7 @@ import sys
from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
from typing import Generic, TypeVar
from typing import Generic, TypeVar, Union
import mwreverts
import mwtypes
@@ -89,10 +89,10 @@ class RevisionArticleId(RevisionField[int]):
return page.id
class RevisionEditorId(RevisionField[int | None]):
class RevisionEditorId(RevisionField[Union[int, None]]):
field = pa.field("editorid", pa.int64(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
revision = revisions[-1]
if revision.deleted.user:
return None
@@ -100,10 +100,10 @@ class RevisionEditorId(RevisionField[int | None]):
return revision.user.id
class RevisionIsAnon(RevisionField[bool | None]):
class RevisionIsAnon(RevisionField[Union[bool, None]]):
field = pa.field("anon", pa.bool_(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool | None:
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[bool, None]:
revision = revisions[-1]
if revision.deleted.user:
return None
@@ -111,10 +111,10 @@ class RevisionIsAnon(RevisionField[bool | None]):
return revision.user.id is None
class RevisionEditorText(RevisionField[str | None]):
class RevisionEditorText(RevisionField[Union[str, None]]):
field = pa.field("editor", pa.string(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
revision = revisions[-1]
if revision.deleted.user:
return None
@@ -152,10 +152,10 @@ class RevisionSha1(RevisionField[str]):
return revision.sha1
class RevisionTextChars(RevisionField[int | None]):
class RevisionTextChars(RevisionField[Union[int, None]]):
field = pa.field("text_chars", pa.int32(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
revision = revisions[-1]
if not revision.deleted.text:
return len(revision.text)
@@ -179,14 +179,14 @@ class RevisionIsMinor(RevisionField[bool]):
return revision.minor
class RevisionReverts(RevisionField[str | None]):
class RevisionReverts(RevisionField[Union[str, None]]):
def __init__(self):
super().__init__()
self.rev_detector: mwreverts.Detector | None = None
self.rev_detector: Union[mwreverts.Detector, None] = None
field = pa.field("reverteds", pa.string(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
if self.rev_detector is None:
return None

12
wikiq
View File

@@ -14,7 +14,7 @@ from itertools import groupby
from subprocess import Popen, PIPE
from collections import deque
from hashlib import sha1
from typing import Any, IO, TextIO, Generator
from typing import Any, IO, TextIO, Generator, Union
import mwxml
from mwxml import Dump
@@ -100,7 +100,7 @@ class WikiqPage:
self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list()
@staticmethod
def user_text(rev) -> str | None:
def user_text(rev) -> Union[str, None]:
return None if rev.deleted.user else rev.user.text
def rev_list(self):
@@ -203,15 +203,15 @@ class RegexPair(object):
class WikiqParser:
def __init__(self,
input_file: TextIOWrapper | IO[Any] | IO[bytes],
output_file: TextIO | str,
input_file: Union[TextIOWrapper, IO[Any], IO[bytes]],
output_file: Union[TextIO, str],
regex_match_revision: list[str],
regex_match_comment: list[str],
regex_revision_label: list[str],
regex_comment_label: list[str],
collapse_user: bool = False,
persist: int = None,
namespaces: list[int] | None = None,
namespaces: Union[list[int], None] = None,
revert_radius: int = 15,
output_parquet: bool = True,
parquet_buffer_size: int = 2000):
@@ -410,7 +410,7 @@ class WikiqParser:
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
row_buffer = table.pop()
is_revert_column: list[bool | None] = []
is_revert_column: list[Union[bool, None]] = []
for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
if self.revert_radius == 0 or d:
is_revert_column.append(None)