From 2a2b611d798c93d5fec0b0dad60156e0b3da4a50 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 28 May 2025 21:31:41 -0700 Subject: [PATCH] Fix issue with .7z archives Before, only fandom wikis dumps were compressed with .7z. These archives can have several .xml files in the .7z; not just one. So we need to have a flag for the fandom-2020 dumps. This fixes the bug so .7z archives work in either case. --- pyproject.toml | 2 +- test/Wikiq_Unit_Test.py | 12 ++++++------ wikiq | 16 ++++++++++++---- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 142f559..d68de97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "deltas>=0.7.0", - "mw>=0.4.0", + "mediawiki-utilities>=0.4.18", "mwpersistence>=0.2.4", "mwreverts>=0.1.5", "mwxml>=0.3.6", diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index a45e9d9..2e00fd6 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -148,7 +148,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") try: - tester.call_wikiq("--collapse-user") + tester.call_wikiq("--collapse-user --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -162,7 +162,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") try: - tester.call_wikiq("--persistence segment") + tester.call_wikiq("--persistence segment --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -176,7 +176,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") try: - tester.call_wikiq("--persistence legacy") + tester.call_wikiq("--persistence legacy --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -190,7 +190,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") try: - tester.call_wikiq("--persistence") + tester.call_wikiq("--persistence --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -206,7 +206,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z") try: - tester.call_wikiq("--url-encode") + tester.call_wikiq("--url-encode --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -233,7 +233,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") try: - outs = tester.call_wikiq( "--stdout", out=False).decode("utf8") + outs = tester.call_wikiq( "--stdout --fandom-2020", out=False).decode("utf8") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) diff --git a/wikiq b/wikiq index 0c7e85c..290225b 100755 --- a/wikiq +++ b/wikiq @@ -704,17 +704,21 @@ class WikiqParser: line = rev_data.to_tsv_row() print(line, file=self.output_file) - -def open_input_file(input_filename): +def match_archive_suffix(input_filename): if re.match(r'.*\.7z$', input_filename): - cmd = ["7za", "x", "-so", input_filename, "*.xml"] + cmd = ["7za", "x", "-so", input_filename] elif re.match(r'.*\.gz$', input_filename): cmd = ["zcat", input_filename] elif re.match(r'.*\.bz2$', input_filename): cmd = ["bzcat", "-dk", input_filename] else: raise ValueError("Unrecognized file type: %s" % input_filename) + return cmd +def open_input_file(input_filename, fandom_2020=False): + cmd = match_archive_suffix(input_filename) + if fandom_2020: + cmd.append("*.xml") try: return Popen(cmd, stdout=PIPE).stdout except NameError: @@ -787,6 +791,10 @@ def main(): action='append', help="The label for the outputted column based on matching the regex in comments.") + parser.add_argument('--fandom-2020', dest="fandom_2020", + action='store_true', + help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") + args = parser.parse_args() # set persistence method @@ -808,7 +816,7 @@ def main(): if len(args.dumpfiles) > 0: output_parquet = False for filename in args.dumpfiles: - input_file = open_input_file(filename) + input_file = open_input_file(filename, args.fandom_2020) # open directory for output if args.output_dir: