Pin to Python 3.9
Since our execution environment requires this.
Signed-off-by: Will Beason <willbeason@gmail.com>
This commit is contained in:
parent
84d464ea38
commit
390499dd90
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.9
|
@@ -3,7 +3,7 @@ name = "mediawiki-dump-tools"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
requires-python = "~=3.9"
|
||||
dependencies = [
|
||||
"deltas>=0.7.0",
|
||||
"mediawiki-utilities>=0.4.18",
|
||||
@@ -12,7 +12,7 @@ dependencies = [
|
||||
"mwtypes>=0.4.0",
|
||||
"mwxml>=0.3.6",
|
||||
"pyarrow>=20.0.0",
|
||||
"yamlconf",
|
||||
"yamlconf>=0.2.6",
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
|
24
tables.py
24
tables.py
@@ -2,7 +2,7 @@ import sys
|
||||
from abc import abstractmethod, ABC
|
||||
from datetime import datetime, timezone
|
||||
from hashlib import sha1
|
||||
from typing import Generic, TypeVar
|
||||
from typing import Generic, TypeVar, Union
|
||||
|
||||
import mwreverts
|
||||
import mwtypes
|
||||
@@ -89,10 +89,10 @@ class RevisionArticleId(RevisionField[int]):
|
||||
return page.id
|
||||
|
||||
|
||||
class RevisionEditorId(RevisionField[int | None]):
|
||||
class RevisionEditorId(RevisionField[Union[int, None]]):
|
||||
field = pa.field("editorid", pa.int64(), nullable=True)
|
||||
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
|
||||
revision = revisions[-1]
|
||||
if revision.deleted.user:
|
||||
return None
|
||||
@@ -100,10 +100,10 @@ class RevisionEditorId(RevisionField[int | None]):
|
||||
return revision.user.id
|
||||
|
||||
|
||||
class RevisionIsAnon(RevisionField[bool | None]):
|
||||
class RevisionIsAnon(RevisionField[Union[bool, None]]):
|
||||
field = pa.field("anon", pa.bool_(), nullable=True)
|
||||
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> bool | None:
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[bool, None]:
|
||||
revision = revisions[-1]
|
||||
if revision.deleted.user:
|
||||
return None
|
||||
@@ -111,10 +111,10 @@ class RevisionIsAnon(RevisionField[bool | None]):
|
||||
return revision.user.id is None
|
||||
|
||||
|
||||
class RevisionEditorText(RevisionField[str | None]):
|
||||
class RevisionEditorText(RevisionField[Union[str, None]]):
|
||||
field = pa.field("editor", pa.string(), nullable=True)
|
||||
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
|
||||
revision = revisions[-1]
|
||||
if revision.deleted.user:
|
||||
return None
|
||||
@@ -152,10 +152,10 @@ class RevisionSha1(RevisionField[str]):
|
||||
return revision.sha1
|
||||
|
||||
|
||||
class RevisionTextChars(RevisionField[int | None]):
|
||||
class RevisionTextChars(RevisionField[Union[int, None]]):
|
||||
field = pa.field("text_chars", pa.int32(), nullable=True)
|
||||
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int | None:
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[int, None]:
|
||||
revision = revisions[-1]
|
||||
if not revision.deleted.text:
|
||||
return len(revision.text)
|
||||
@@ -179,14 +179,14 @@ class RevisionIsMinor(RevisionField[bool]):
|
||||
return revision.minor
|
||||
|
||||
|
||||
class RevisionReverts(RevisionField[str | None]):
|
||||
class RevisionReverts(RevisionField[Union[str, None]]):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.rev_detector: mwreverts.Detector | None = None
|
||||
self.rev_detector: Union[mwreverts.Detector, None] = None
|
||||
|
||||
field = pa.field("reverteds", pa.string(), nullable=True)
|
||||
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str | None:
|
||||
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
|
||||
if self.rev_detector is None:
|
||||
return None
|
||||
|
||||
|
12
wikiq
12
wikiq
@@ -14,7 +14,7 @@ from itertools import groupby
|
||||
from subprocess import Popen, PIPE
|
||||
from collections import deque
|
||||
from hashlib import sha1
|
||||
from typing import Any, IO, TextIO, Generator
|
||||
from typing import Any, IO, TextIO, Generator, Union
|
||||
|
||||
import mwxml
|
||||
from mwxml import Dump
|
||||
@@ -100,7 +100,7 @@ class WikiqPage:
|
||||
self.__revisions: Generator[list[mwxml.Revision]] = self.rev_list()
|
||||
|
||||
@staticmethod
|
||||
def user_text(rev) -> str | None:
|
||||
def user_text(rev) -> Union[str, None]:
|
||||
return None if rev.deleted.user else rev.user.text
|
||||
|
||||
def rev_list(self):
|
||||
@@ -203,15 +203,15 @@ class RegexPair(object):
|
||||
|
||||
class WikiqParser:
|
||||
def __init__(self,
|
||||
input_file: TextIOWrapper | IO[Any] | IO[bytes],
|
||||
output_file: TextIO | str,
|
||||
input_file: Union[TextIOWrapper, IO[Any], IO[bytes]],
|
||||
output_file: Union[TextIO, str],
|
||||
regex_match_revision: list[str],
|
||||
regex_match_comment: list[str],
|
||||
regex_revision_label: list[str],
|
||||
regex_comment_label: list[str],
|
||||
collapse_user: bool = False,
|
||||
persist: int = None,
|
||||
namespaces: list[int] | None = None,
|
||||
namespaces: Union[list[int], None] = None,
|
||||
revert_radius: int = 15,
|
||||
output_parquet: bool = True,
|
||||
parquet_buffer_size: int = 2000):
|
||||
@@ -410,7 +410,7 @@ class WikiqParser:
|
||||
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
|
||||
row_buffer = table.pop()
|
||||
|
||||
is_revert_column: list[bool | None] = []
|
||||
is_revert_column: list[Union[bool, None]] = []
|
||||
for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
|
||||
if self.revert_radius == 0 or d:
|
||||
is_revert_column.append(None)
|
||||
|
Loading…
Reference in New Issue
Block a user