import os
import shlex
import shutil
import subprocess
from typing import Final, Union

# Directory layout, all resolved relative to the directory holding this file.
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
WIKIQ: Final[str] = os.path.join(os.path.join(TEST_DIR, ".."), "src/wikiq/__init__.py")
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")

# Names of the input dumps (without the ".xml.<compression>" extension).
IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
SAILORMOON: Final[str] = "sailormoon"
TWINPEAKS: Final[str] = "twinpeaks"
REGEXTEST: Final[str] = "regextest"


def _remove_path(path: str) -> None:
    """Delete *path* if it exists, whether it is a file or a directory tree."""
    if not os.path.exists(path):
        return
    if os.path.isfile(path):
        os.remove(path)
    else:
        shutil.rmtree(path)


class WikiqTester:
    """Compute the input/output/baseline paths for one test case and run wikiq.

    Attributes set by ``__init__``:
        input_file: ``<TEST_DIR>/dumps/<wiki>.xml.<in_compression>``.
        output: ``<TEST_OUTPUT_DIR>/<case_name>_<wiki>[_<suffix>].<out_format>``.
        wikiq_baseline_name: ``<wiki>[_<suffix>].<baseline_format>``.
        wikiq_out_name: ``<wiki>[_<suffix>].<out_format>``.
        baseline_file: ``<BASELINE_DIR>/<case_name>_<wikiq_baseline_name>``;
            only assigned when ``case_name`` is not ``None``.
    """

    def __init__(
        self,
        wiki: str,
        case_name: str,
        suffix: Union[str, None] = None,
        in_compression: str = "bz2",
        baseline_format: str = "tsv",
        out_format: str = "tsv",
    ):
        """Derive all paths for the case and clear leftovers from earlier runs.

        :param wiki: Base name of the dump under ``dumps/``.
        :param case_name: Name of the test case; prefixes output and baseline names.
        :param suffix: Optional extra tag appended to the generated names.
        :param in_compression: File extension of the compressed input dump.
        :param baseline_format: File extension of the baseline to compare against.
        :param out_format: File extension of the output wikiq is asked to write.
        """
        self.input_file = os.path.join(
            TEST_DIR, "dumps", f"{wiki}.xml.{in_compression}"
        )

        basename = f"{case_name}_{wiki}"
        if suffix:
            basename = f"{basename}_{suffix}"
        self.output = os.path.join(TEST_OUTPUT_DIR, f"{basename}.{out_format}")

        # Start from a clean slate: drop the previous output and any
        # resume-related files that sit next to it.
        _remove_path(self.output)
        for temp_suffix in (".resume_temp", ".checkpoint", ".merged"):
            _remove_path(self.output + temp_suffix)

        # For JSONL and Parquet, self.output is a file path.
        # Create the parent directory if needed.
        if out_format in ("jsonl", "parquet"):
            parent_dir = os.path.dirname(self.output)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        # NOTE: this test is ``is None`` while the basename above uses
        # truthiness, so an empty-string suffix yields "<wiki>_.<ext>" here.
        if suffix is None:
            self.wikiq_baseline_name = f"{wiki}.{baseline_format}"
            self.wikiq_out_name = f"{wiki}.{out_format}"
        else:
            self.wikiq_baseline_name = f"{wiki}_{suffix}.{baseline_format}"
            self.wikiq_out_name = f"{wiki}_{suffix}.{out_format}"

        if case_name is not None:
            self.baseline_file = os.path.join(
                BASELINE_DIR, f"{case_name}_{self.wikiq_baseline_name}"
            )

    def _build_call(self, *args: str, out: bool = True) -> str:
        """Assemble the shell command line used by :meth:`call_wikiq`.

        The wikiq script path, the input file and the output path are
        shell-quoted so that a checkout located under a directory containing
        spaces or shell metacharacters still produces a valid command.
        ``args`` are inserted verbatim because the command is run through the
        shell, so callers may pass fragments that carry their own quoting.
        """
        parts = [shlex.quote(WIKIQ), shlex.quote(self.input_file)]
        if out:
            parts += ["-o", shlex.quote(self.output)]
        parts += ["--batch-size", "10", *args]
        return " ".join(parts)

    def call_wikiq(self, *args: str, out: bool = True):
        """
        Calls wikiq with the passed arguments on the input file relevant to the test.

        :param args: The command line arguments to pass to wikiq. They are
            joined with spaces and interpreted by the shell.
        :param out: Whether to pass an output argument to wikiq.
        :return: The output of the wikiq call (captured stdout).
        :raises subprocess.CalledProcessError: If the command exits non-zero.
        """
        call = self._build_call(*args, out=out)
        print(call)
        return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)