From 3c7fb088d6eb3c35c7e5a14967fc7bdf8a295660 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 28 May 2025 20:54:42 -0700 Subject: [PATCH 1/6] fix schema bugs. --- wikiq | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/wikiq b/wikiq index a377d19..75c1af8 100755 --- a/wikiq +++ b/wikiq @@ -250,13 +250,13 @@ class RevDataBase(): pa.field("revid", pa.int64()), pa.field("date_time", pa.timestamp('ms')), pa.field("articleid",pa.int64()), - pa.field("editorid",pa.int64()), + pa.field("editorid",pa.int64(), nullable=True), pa.field("title",pa.string()), pa.field("namespace",pa.int32()), pa.field("deleted",pa.bool_()), - pa.field("test_chars",pa.int32()), - pa.field("revert",pa.bool_()), - pa.field("reverteds",pa.list_(pa.int64())), + pa.field("text_chars",pa.int32()), + pa.field("revert",pa.bool_(), nullable=True), + pa.field("reverteds",pa.list_(pa.int64()), nullable=True), pa.field("sha1",pa.string()), pa.field("minor",pa.bool_()), pa.field("editor",pa.string()), @@ -518,7 +518,7 @@ class WikiqParser(): namespace = namespace ) - rev_data = self.matchmake(rev, rev_data) + rev_data = self.matchmake_revision(rev, rev_data) if not rev.deleted.text: # rev.text can be None if the page has no text From 15e9234903749e61454b2c99b20e90b2e8ea4791 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 28 May 2025 20:59:55 -0700 Subject: [PATCH 2/6] adding pyproject.toml --- pyproject.toml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6d1e94e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,24 @@ +[project] +name = "mediawiki-dump-tools" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "deltas>=0.7.0", + "mw>=0.4.0", + "mwpersistence>=0.2.4", + "mwreverts>=0.1.5", + "mwxml>=0.3.6", + "pyarrow>=20.0.0", + "yamlconf", +] + 
+[tool.uv.sources] +yamlconf = { git = "https://github.com/groceryheist/yamlconf" } + +[dependency-groups] +dev = [ + "pandas>=2.1.0", + "pytest>=8.3.5", +] From 39fec0820d782891473cd457c2d960a9c70afff2 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 28 May 2025 21:13:18 -0700 Subject: [PATCH 3/6] use my version of mwxml since it fixes a bug. --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 6d1e94e..142f559 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ [tool.uv.sources] yamlconf = { git = "https://github.com/groceryheist/yamlconf" } +mwxml = { git = "https://github.com/groceryheist/python-mwxml" } [dependency-groups] dev = [ From 2a2b611d798c93d5fec0b0dad60156e0b3da4a50 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 28 May 2025 21:31:41 -0700 Subject: [PATCH 4/6] Fix issue with .7z archives Previously, only fandom wiki dumps were compressed with .7z. These archives can contain several .xml files, not just one, so we need a flag for the fandom-2020 dumps. This fixes the bug so that .7z archives work in either case. 
--- pyproject.toml | 2 +- test/Wikiq_Unit_Test.py | 12 ++++++------ wikiq | 16 ++++++++++++---- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 142f559..d68de97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "deltas>=0.7.0", - "mw>=0.4.0", + "mediawiki-utilities>=0.4.18", "mwpersistence>=0.2.4", "mwreverts>=0.1.5", "mwxml>=0.3.6", diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index a45e9d9..2e00fd6 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -148,7 +148,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") try: - tester.call_wikiq("--collapse-user") + tester.call_wikiq("--collapse-user --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -162,7 +162,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") try: - tester.call_wikiq("--persistence segment") + tester.call_wikiq("--persistence segment --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -176,7 +176,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") try: - tester.call_wikiq("--persistence legacy") + tester.call_wikiq("--persistence legacy --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -190,7 +190,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") try: - tester.call_wikiq("--persistence") + tester.call_wikiq("--persistence --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -206,7 +206,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "url-encode", 
in_compression="7z") try: - tester.call_wikiq("--url-encode") + tester.call_wikiq("--url-encode --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -233,7 +233,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") try: - outs = tester.call_wikiq( "--stdout", out=False).decode("utf8") + outs = tester.call_wikiq( "--stdout --fandom-2020", out=False).decode("utf8") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) diff --git a/wikiq b/wikiq index 0c7e85c..290225b 100755 --- a/wikiq +++ b/wikiq @@ -704,17 +704,21 @@ class WikiqParser: line = rev_data.to_tsv_row() print(line, file=self.output_file) - -def open_input_file(input_filename): +def match_archive_suffix(input_filename): if re.match(r'.*\.7z$', input_filename): - cmd = ["7za", "x", "-so", input_filename, "*.xml"] + cmd = ["7za", "x", "-so", input_filename] elif re.match(r'.*\.gz$', input_filename): cmd = ["zcat", input_filename] elif re.match(r'.*\.bz2$', input_filename): cmd = ["bzcat", "-dk", input_filename] else: raise ValueError("Unrecognized file type: %s" % input_filename) + return cmd +def open_input_file(input_filename, fandom_2020=False): + cmd = match_archive_suffix(input_filename) + if fandom_2020: + cmd.append("*.xml") try: return Popen(cmd, stdout=PIPE).stdout except NameError: @@ -787,6 +791,10 @@ def main(): action='append', help="The label for the outputted column based on matching the regex in comments.") + parser.add_argument('--fandom-2020', dest="fandom_2020", + action='store_true', + help="Whether the archive is from the fandom 2020 dumps by Wikiteam. 
These dumps can have multiple .xml files in their archives.") + + args = parser.parse_args() # set persistence method @@ -808,7 +816,7 @@ def main(): if len(args.dumpfiles) > 0: output_parquet = False for filename in args.dumpfiles: - input_file = open_input_file(filename) + input_file = open_input_file(filename, args.fandom_2020) # open directory for output if args.output_dir: From b8cdc82fc2d1cb8c48fadd6e23caa0b3a762ab66 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 28 May 2025 23:52:37 -0500 Subject: [PATCH 5/6] add mwtypes dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 142f559..c876057 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "mw>=0.4.0", "mwpersistence>=0.2.4", "mwreverts>=0.1.5", + "mwtypes>=0.4.0", "mwxml>=0.3.6", "pyarrow>=20.0.0", "yamlconf", From ab280dd765ca0e88055dfd01dac8208161e68952 Mon Sep 17 00:00:00 2001 From: Will Beason Date: Thu, 29 May 2025 10:05:49 -0500 Subject: [PATCH 6/6] Remove requirements.txt and add uv.lock to ignored files. We can choose to check in uv.lock later if we want. 
Signed-off-by: Will Beason --- .gitignore | 6 +++++- requirements.txt | 39 --------------------------------------- 2 files changed, 5 insertions(+), 40 deletions(-) delete mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index c90a397..1ae46ba 100644 --- a/.gitignore +++ b/.gitignore @@ -4,9 +4,13 @@ *.xml.xz *.swp +# Lockfiles +uv.lock + # JetBrains /.idea # Python build and test output __pycache__/ -test_output/ +/test/test_output/ +/test/test_output.parquet/ diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a54cca5..0000000 --- a/requirements.txt +++ /dev/null @@ -1,39 +0,0 @@ -attrs==25.3.0 -certifi==2025.4.26 -charset-normalizer==3.4.2 -Cython==0.29.37 -deltas==0.7.0 -docopt==0.6.2 -gnureadline==8.1.2 -idna==3.10 -jsonable==0.3.1 -jsonschema==4.23.0 -jsonschema-specifications==2025.4.1 -mediawiki-utilities==0.4.18 -mwcli==0.0.3 -mwdiffs==0.0.2 -mwpersistence==0.2.4 -mwreverts==0.1.5 -mwtypes==0.4.0 -mwxml==0.3.6 -pandas==2.2.3 -para==0.0.8 -parsimonious==0.10.0 -pyarrow==20.0.0 -pydub==0.25.1 -PyMySQL==1.1.1 -python-dateutil==2.9.0.post0 -pytz==2025.2 -PyYAML==5.4.1 -referencing==0.36.2 -regex==2024.11.6 -requests==2.32.3 -rpds-py==0.25.1 -setuptools==80.8.0 -six==1.17.0 -stopit==1.1.2 -typing_extensions==4.13.2 -tzdata==2025.2 -urllib3==2.4.0 -wheel==0.45.1 -yamlconf==0.2.6