diff --git a/pyproject.toml b/pyproject.toml index c876057..eafb09f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "deltas>=0.7.0", - "mw>=0.4.0", + "mediawiki-utilities>=0.4.18", "mwpersistence>=0.2.4", "mwreverts>=0.1.5", "mwtypes>=0.4.0", diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index a45e9d9..2e00fd6 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -148,7 +148,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") try: - tester.call_wikiq("--collapse-user") + tester.call_wikiq("--collapse-user --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -162,7 +162,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") try: - tester.call_wikiq("--persistence segment") + tester.call_wikiq("--persistence segment --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -176,7 +176,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") try: - tester.call_wikiq("--persistence legacy") + tester.call_wikiq("--persistence legacy --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -190,7 +190,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") try: - tester.call_wikiq("--persistence") + tester.call_wikiq("--persistence --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -206,7 +206,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z") try: - tester.call_wikiq("--url-encode") + tester.call_wikiq("--url-encode --fandom-2020") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -233,7 +233,7 @@ class WikiqTestCase(unittest.TestCase): tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") try: - outs = tester.call_wikiq( "--stdout", out=False).decode("utf8") + outs = tester.call_wikiq( "--stdout --fandom-2020", out=False).decode("utf8") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) diff --git a/wikiq b/wikiq index 0c7e85c..290225b 100755 --- a/wikiq +++ b/wikiq @@ -704,17 +704,21 @@ class WikiqParser: line = rev_data.to_tsv_row() print(line, file=self.output_file) - -def open_input_file(input_filename): +def match_archive_suffix(input_filename): if re.match(r'.*\.7z$', input_filename): - cmd = ["7za", "x", "-so", input_filename, "*.xml"] + cmd = ["7za", "x", "-so", input_filename] elif re.match(r'.*\.gz$', input_filename): cmd = ["zcat", input_filename] elif re.match(r'.*\.bz2$', input_filename): cmd = ["bzcat", "-dk", input_filename] else: raise ValueError("Unrecognized file type: %s" % input_filename) + return cmd +def open_input_file(input_filename, fandom_2020=False): + cmd = match_archive_suffix(input_filename) + if fandom_2020: + cmd.append("*.xml") try: return Popen(cmd, stdout=PIPE).stdout except NameError: @@ -787,6 +791,10 @@ def main(): action='append', help="The label for the outputted column based on matching the regex in comments.") + parser.add_argument('--fandom-2020', dest="fandom_2020", + action='store_true', + help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") + args = parser.parse_args() # set persistence method @@ -808,7 +816,7 @@ def main(): if len(args.dumpfiles) > 0: output_parquet = False for filename in args.dumpfiles: - input_file = open_input_file(filename) + input_file = open_input_file(filename, args.fandom_2020) # open directory for output if args.output_dir: