Pin to Python 3.9

Our execution environment requires Python 3.9, so pin the project to that version.

Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
Will Beason 2025-06-17 11:37:20 -05:00
parent 84d464ea38
commit 390499dd90
4 changed files with 21 additions and 20 deletions

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.9

View File

@@ -3,7 +3,7 @@ name = "mediawiki-dump-tools"
version = "0.1.0" version = "0.1.0"
description = "Add your description here" description = "Add your description here"
readme = "README.md" readme = "README.md"
requires-python = ">=3.11" requires-python = "~=3.9"
dependencies = [ dependencies = [
"deltas>=0.7.0", "deltas>=0.7.0",
"mediawiki-utilities>=0.4.18", "mediawiki-utilities>=0.4.18",
@@ -12,7 +12,7 @@ dependencies = [
"mwtypes>=0.4.0", "mwtypes>=0.4.0",
"mwxml>=0.3.6", "mwxml>=0.3.6",
"pyarrow>=20.0.0", "pyarrow>=20.0.0",
"yamlconf", "yamlconf>=0.2.6",
] ]
[tool.uv.sources] [tool.uv.sources]

View File

@@ -2,7 +2,7 @@ import sys
from abc import abstractmethod, ABC from abc import abstractmethod, ABC
from datetime import datetime, timezone from datetime import datetime, timezone
from hashlib import sha1 from hashlib import sha1
from typing import Generic, TypeVar from typing import Generic, TypeVar, Union
import mwreverts import mwreverts
import mwtypes import mwtypes
@@ -89,10 +89,10 @@ class RevisionArticleId(RevisionField[int]):
return page.id return page.id
class RevisionEditorId(RevisionField[int | None]): class RevisionEditorId(RevisionField[Union[int, None]]):
field = pa.field("editorid", pa.int64(), nullable=True) field = pa.field("editorid", pa.int64(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None: def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
revision = revisions[-1] revision = revisions[-1]
if revision.deleted.user: if revision.deleted.user:
return None return None
@@ -100,10 +100,10 @@ class RevisionEditorId(RevisionField[int | None]):
return revision.user.id return revision.user.id
class RevisionIsAnon(RevisionField[bool | None]): class RevisionIsAnon(RevisionField[Union[bool, None]]):
field = pa.field("anon", pa.bool_(), nullable=True) field = pa.field("anon", pa.bool_(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool | None: def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[bool, None]:
revision = revisions[-1] revision = revisions[-1]
if revision.deleted.user: if revision.deleted.user:
return None return None
@@ -111,10 +111,10 @@ class RevisionIsAnon(RevisionField[bool | None]):
return revision.user.id is None return revision.user.id is None
class RevisionEditorText(RevisionField[str | None]): class RevisionEditorText(RevisionField[Union[str, None]]):
field = pa.field("editor", pa.string(), nullable=True) field = pa.field("editor", pa.string(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None: def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
revision = revisions[-1] revision = revisions[-1]
if revision.deleted.user: if revision.deleted.user:
return None return None
@@ -152,10 +152,10 @@ class RevisionSha1(RevisionField[str]):
return revision.sha1 return revision.sha1
class RevisionTextChars(RevisionField[int | None]): class RevisionTextChars(RevisionField[Union[int, None]]):
field = pa.field("text_chars", pa.int32(), nullable=True) field = pa.field("text_chars", pa.int32(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None: def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
revision = revisions[-1] revision = revisions[-1]
if not revision.deleted.text: if not revision.deleted.text:
return len(revision.text) return len(revision.text)
@@ -179,14 +179,14 @@ class RevisionIsMinor(RevisionField[bool]):
return revision.minor return revision.minor
class RevisionReverts(RevisionField[str | None]): class RevisionReverts(RevisionField[Union[str, None]]):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.rev_detector: mwreverts.Detector | None = None self.rev_detector: Union[mwreverts.Detector, None] = None
field = pa.field("reverteds", pa.string(), nullable=True) field = pa.field("reverteds", pa.string(), nullable=True)
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None: def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
if self.rev_detector is None: if self.rev_detector is None:
return None return None

12
wikiq
View File

@@ -14,7 +14,7 @@ from itertools import groupby
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
from collections import deque from collections import deque
from hashlib import sha1 from hashlib import sha1
from typing import Any, IO, TextIO, Generator from typing import Any, IO, TextIO, Generator, Union
import mwxml import mwxml
from mwxml import Dump from mwxml import Dump
@@ -100,7 +100,7 @@ class WikiqPage:
self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list() self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list()
@staticmethod @staticmethod
def user_text(rev) -> str | None: def user_text(rev) -> Union[str, None]:
return None if rev.deleted.user else rev.user.text return None if rev.deleted.user else rev.user.text
def rev_list(self): def rev_list(self):
@@ -203,15 +203,15 @@ class RegexPair(object):
class WikiqParser: class WikiqParser:
def __init__(self, def __init__(self,
input_file: TextIOWrapper | IO[Any] | IO[bytes], input_file: Union[TextIOWrapper, IO[Any], IO[bytes]],
output_file: TextIO | str, output_file: Union[TextIO, str],
regex_match_revision: list[str], regex_match_revision: list[str],
regex_match_comment: list[str], regex_match_comment: list[str],
regex_revision_label: list[str], regex_revision_label: list[str],
regex_comment_label: list[str], regex_comment_label: list[str],
collapse_user: bool = False, collapse_user: bool = False,
persist: int = None, persist: int = None,
namespaces: list[int] | None = None, namespaces: Union[list[int], None] = None,
revert_radius: int = 15, revert_radius: int = 15,
output_parquet: bool = True, output_parquet: bool = True,
parquet_buffer_size: int = 2000): parquet_buffer_size: int = 2000):
@@ -410,7 +410,7 @@ class WikiqParser:
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them. # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
row_buffer = table.pop() row_buffer = table.pop()
is_revert_column: list[bool | None] = [] is_revert_column: list[Union[bool, None]] = []
for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
if self.revert_radius == 0 or d: if self.revert_radius == 0 or d:
is_revert_column.append(None) is_revert_column.append(None)