diff --git a/.gitignore b/.gitignore index d5257ec..1ae46ba 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ *.xml.xz *.swp +# Lockfiles +uv.lock + # JetBrains /.idea diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..eafb09f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,26 @@ +[project] +name = "mediawiki-dump-tools" +version = "0.1.0" +description = "Tools for extracting tabular edit datasets from MediaWiki XML dumps" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "deltas>=0.7.0", + "mediawiki-utilities>=0.4.18", + "mwpersistence>=0.2.4", + "mwreverts>=0.1.5", + "mwtypes>=0.4.0", + "mwxml>=0.3.6", + "pyarrow>=20.0.0", + "yamlconf", +] + +[tool.uv.sources] +yamlconf = { git = "https://github.com/groceryheist/yamlconf" } +mwxml = { git = "https://github.com/groceryheist/python-mwxml" } + +[dependency-groups] +dev = [ + "pandas>=2.1.0", + "pytest>=8.3.5", +] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 635dc23..0000000 --- a/requirements.txt +++ /dev/null @@ -1,40 +0,0 @@ -attrs==25.3.0 -certifi==2025.4.26 -charset-normalizer==3.4.2 -Cython==0.29.37 -deltas==0.7.0 -docopt==0.6.2 -gnureadline==8.1.2 -idna==3.10 -jsonable==0.3.1 -jsonschema==4.23.0 -jsonschema-specifications==2025.4.1 -mediawiki-utilities==0.4.18 -mwcli==0.0.3 -mwdiffs==0.0.2 -mwpersistence==0.2.4 -mwreverts==0.1.5 -mwtypes==0.4.0 -mwxml==0.3.6 -numpy==2.2.6 -pandas==2.2.3 -para==0.0.8 -parsimonious==0.10.0 -pyarrow==20.0.0 -pydub==0.25.1 -PyMySQL==1.1.1 -python-dateutil==2.9.0.post0 -pytz==2025.2 -PyYAML==5.4.1 -referencing==0.36.2 -regex==2024.11.6 -requests==2.32.3 -rpds-py==0.25.1 -setuptools==80.8.0 -six==1.17.0 -stopit==1.1.2 -typing_extensions==4.13.2 -tzdata==2025.2 -urllib3==2.4.0 -wheel==0.45.1 -yamlconf==0.2.6 diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 9ae9da0..18d963d 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -1,4 +1,3 @@ -import math import unittest import os import subprocess @@ 
-7,12 +6,10 @@ from shutil import copyfile import numpy as np import pandas as pd from pandas import DataFrame -from pandas._testing import assert_series_equal -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal from io import StringIO import tracemalloc from typing import Final -from datetime import datetime # Make references to files and wikiq relative to this file, not to the current working directory. TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) @@ -181,7 +178,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") try: - tester.call_wikiq("--collapse-user") + tester.call_wikiq("--collapse-user", "--fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -195,7 +192,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") try: - tester.call_wikiq("--persistence segment") + tester.call_wikiq("--persistence segment", "--fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -209,7 +206,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") try: - tester.call_wikiq("--persistence legacy") + tester.call_wikiq("--persistence legacy", "--fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -223,7 +220,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") try: - tester.call_wikiq("--persistence") + tester.call_wikiq("--persistence", "--fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -239,7 +236,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z") try: - tester.call_wikiq("--url-encode") + 
tester.call_wikiq("--url-encode", "--fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -266,7 +263,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") try: - outs = tester.call_wikiq("--stdout", out=False).decode("utf8") + outs = tester.call_wikiq( "--stdout", "--fandom-2020", out=False).decode("utf8") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) diff --git a/wikiq b/wikiq index ffd4183..1ed2d8e 100755 --- a/wikiq +++ b/wikiq @@ -266,13 +266,13 @@ class RevDataBase: pa.field("revid", pa.int64()), pa.field("date_time", pa.timestamp('ms')), pa.field("articleid", pa.int64()), - pa.field("editorid", pa.int64()), + pa.field("editorid", pa.int64(), nullable=True), pa.field("title", pa.string()), pa.field("namespace", pa.int32()), pa.field("deleted", pa.bool_()), pa.field("text_chars", pa.int32()), - pa.field("revert", pa.bool_()), - pa.field("reverteds", pa.list_(pa.int64())), + pa.field("revert", pa.bool_(), nullable=True), + pa.field("reverteds", pa.list_(pa.int64()), nullable=True), pa.field("sha1", pa.string()), pa.field("minor", pa.bool_()), pa.field("editor", pa.string()), @@ -280,7 +280,7 @@ class RevDataBase: ] # pyarrow is a columnar format, so most of the work happens in the flush_parquet_buffer function - def to_pyarrow(self) -> tuple[Any, ...]: + def to_pyarrow(self): return dc.astuple(self) # logic to convert each field into the wikiq tsv format goes here. 
@@ -732,16 +732,22 @@ class WikiqParser: print(line, file=self.output_file) -def open_input_file(input_filename) -> TextIOWrapper | IO[Any] | IO[bytes]: +def match_archive_suffix(input_filename): if re.match(r'.*\.7z$', input_filename): - cmd = ["7za", "x", "-so", input_filename, "*.xml"] + cmd = ["7za", "x", "-so", input_filename] elif re.match(r'.*\.gz$', input_filename): cmd = ["zcat", input_filename] elif re.match(r'.*\.bz2$', input_filename): cmd = ["bzcat", "-dk", input_filename] else: raise ValueError("Unrecognized file type: %s" % input_filename) + return cmd + +def open_input_file(input_filename, fandom_2020=False): + cmd = match_archive_suffix(input_filename) + if fandom_2020: + cmd.append("*.xml") try: return Popen(cmd, stdout=PIPE).stdout except NameError: @@ -814,6 +820,10 @@ def main(): action='append', help="The label for the outputted column based on matching the regex in comments.") + parser.add_argument('--fandom-2020', dest="fandom_2020", + action='store_true', + help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") + args = parser.parse_args() # set persistence method @@ -835,7 +845,7 @@ def main(): if len(args.dumpfiles) > 0: output_parquet = False for filename in args.dumpfiles: - input_file = open_input_file(filename) + input_file = open_input_file(filename, args.fandom_2020) # open directory for output if args.output_dir: