diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 1258724..a45e9d9 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -15,6 +15,9 @@ TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output") BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output") IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history" +SAILORMOON: Final[str] = "sailormoon" +TWINPEAKS: Final[str] = "twinpeaks" +REGEXTEST: Final[str] = "regextest" def setup(): tracemalloc.start() @@ -22,12 +25,6 @@ def setup(): # Perform directory check and reset here as this is a one-time setup step as opposed to per-test setup. if not os.path.exists(TEST_OUTPUT_DIR): os.mkdir(TEST_OUTPUT_DIR) - else: - # Avoid subsequent calls to tests interfering with each other. - # Otherwise, a test may erroneously pass if the program has no output - # but a previous run output what was expected. - for f in os.listdir(TEST_OUTPUT_DIR): - os.remove(os.path.join(TEST_OUTPUT_DIR, f)) # Always run setup, even if this is executed via "python -m unittest" rather @@ -38,44 +35,40 @@ setup() class WikiqTester: def __init__(self, wiki: str, - case_name: str, + case_name: str | None = None, + suffix: str | None = None, + in_compression: str = "bz2", out_format: str = "tsv", ): - self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.bz2".format(wiki)) + self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) - self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) + if suffix is None: + self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) + else: + self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) + self.call_output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(wiki, out_format)) - self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) - self.test_file = os.path.join(TEST_OUTPUT_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) - if os.path.exists(self.test_file): - os.remove(self.test_file) + # If case_name is unset, there are no relevant baseline or test files. + if case_name is not None: + self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) + self.test_file = os.path.join(TEST_OUTPUT_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name)) + if os.path.exists(self.test_file): + os.remove(self.test_file) + def call_wikiq(self, *args: str, out: bool = True): + """ + Calls wikiq with the passed arguments on the input file relevant to the test. + :param args: The command line arguments to pass to wikiq. + :param out: Whether to pass an output argument to wikiq. + :return: The output of the wikiq call. + """ + if out: + call = ' '.join([WIKIQ, self.input_file, "-o", TEST_OUTPUT_DIR, *args]) + else: + call = ' '.join([WIKIQ, self.input_file, *args]) -def call_wikiq(input_file: str, *args: str, out: bool = True): - if out: - call = ' '.join([WIKIQ, input_file, "-o", TEST_OUTPUT_DIR, *args]) - else: - call = ' '.join([WIKIQ, input_file, *args]) - - print(call) - return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) - - -def tmp_test_file(name: str) -> (str, str): - """ - Removes any existing test file with the given name and returns the path to - the that file. - :param name: The test case name. Should be unique to each test case. - :return: The path to the test file. - """ - baseline_file = os.path.join(BASELINE_DIR, name) - test_file = os.path.join(TEST_OUTPUT_DIR, name) - if os.path.exists(test_file): - os.remove(test_file) - - return baseline_file, test_file - + print(call) + return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) # with / without pwr DONE # with / without url encode DONE @@ -87,12 +80,12 @@ def tmp_test_file(name: str) -> (str, str): # wikia and wikipedia data DONE # malformed xmls DONE -class TestWikipedia(unittest.TestCase): +class WikiqTestCase(unittest.TestCase): def test_WP_url_encode(self): tester = WikiqTester(IKWIKI, "url-encode") try: - call_wikiq(tester.input_file, "--url-encode") + tester.call_wikiq("--url-encode") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -107,7 +100,7 @@ class TestWikipedia(unittest.TestCase): tester = WikiqTester(IKWIKI, "namespaces") try: - call_wikiq(tester.input_file, "-n 0", "-n 1") + tester.call_wikiq("-n 0", "-n 1") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -124,7 +117,7 @@ class TestWikipedia(unittest.TestCase): tester = WikiqTester(IKWIKI, "revert_radius") try: - call_wikiq(tester.input_file, "-n 0", "-n 1", "-rr 1") + tester.call_wikiq("-n 0", "-n 1", "-rr 1") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) @@ -137,167 +130,124 @@ class TestWikipedia(unittest.TestCase): baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) - -class TestBasic(unittest.TestCase): - - def setUp(self): - wiki = 'sailormoon' - self.wikiq_out_name = wiki + ".tsv" - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - - infile = "{0}.xml.7z".format(wiki) - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - def test_noargs(self): - baseline_file, test_file = tmp_test_file("noargs_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z") try: - call_wikiq(self.input_file) + tester.call_wikiq() except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_collapse_user(self): - baseline_file, test_file = tmp_test_file("collapse-user_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") try: - call_wikiq(self.input_file, "--collapse-user") + tester.call_wikiq("--collapse-user") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr_segment(self): - baseline_file, test_file = tmp_test_file("persistence_segment_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") try: - call_wikiq(self.input_file, "--persistence segment") + tester.call_wikiq("--persistence segment") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr_legacy(self): - baseline_file, test_file = tmp_test_file("persistence_legacy_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") try: - call_wikiq(self.input_file, "--persistence legacy") + tester.call_wikiq("--persistence legacy") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) def test_pwr(self): - baseline_file, test_file = tmp_test_file("persistence_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") try: - call_wikiq(self.input_file, "--persistence") + tester.call_wikiq("--persistence") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) test = test.reindex(columns=sorted(test.columns)) assert_frame_equal(test, baseline, check_like=True) def test_url_encode(self): - baseline_file, test_file = tmp_test_file("url-encode_" + self.wikiq_out_name) + tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z") try: - call_wikiq(self.input_file, "--url-encode") + tester.call_wikiq("--url-encode") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) - test = pd.read_table(test_file) - baseline = pd.read_table(baseline_file) + copyfile(tester.call_output, tester.test_file) + test = pd.read_table(tester.test_file) + baseline = pd.read_table(tester.baseline_file) test = test.reindex(columns=sorted(test.columns)) assert_frame_equal(test, baseline, check_like=True) - -class TestMalformed(unittest.TestCase): - def setUp(self): - wiki = 'twinpeaks' - - infile = "{0}.xml.7z".format(wiki) - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - def test_malformed_noargs(self): + tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z") want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0' try: - call_wikiq(self.input_file) + tester.call_wikiq() except subprocess.CalledProcessError as exc: errlines = exc.stderr.decode("utf8").splitlines() self.assertEqual(errlines[-1], want_exception) else: self.fail("No exception raised, want: {}".format(want_exception)) + def test_stdout_noargs(self): + tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") -class TestStdout(unittest.TestCase): - - def setUp(self): - wiki = 'sailormoon' - self.wikiq_out_name = wiki + ".tsv" - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) - - infile = "{0}.xml.7z".format(wiki) - self.base_call = WIKIQ + " {0} --stdout" - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - - def test_noargs(self): - baseline_file, test_file = tmp_test_file("noargs_" + self.wikiq_out_name) - - outs = "" try: - outs = call_wikiq(self.input_file, "--stdout", out=False).decode("utf8") + outs = tester.call_wikiq( "--stdout", out=False).decode("utf8") except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) test = pd.read_table(StringIO(outs)) - baseline = pd.read_table(baseline_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) - -class TestRegex(unittest.TestCase): - def setUp(self): - wiki = 'regextest' - self.wikiq_out_name = wiki + '.tsv' - infile = "{0}.xml.bz2".format(wiki) - - input_dir = os.path.join(TEST_DIR, "dumps") - self.input_file = os.path.join(TEST_DIR, input_dir, infile) - - self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name) + def test_bad_regex(self): + tester = WikiqTester(wiki=REGEXTEST) # sample arguments for checking that bad arguments get terminated / test_regex_arguments - self.bad_arguments_list = [ + bad_arguments_list = [ # label is missing "-RP '\\b\\d+\\b'", # number of reg and number of labels do not match @@ -309,62 +259,59 @@ class TestRegex(unittest.TestCase): "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" ] - # sample arguments for checking the outcomes of good arguments / test_basic_regex - self.good_arguments_list = [ - "-RP '\\b\\d{3}\\b' -RPl threedigits", - "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", - "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", - "-CP 'WP:EVADE' -CPl wp_evade" - ] - - self.cap_arguments_list = [ - "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", - "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov" - ] - - def test_regex_arguments(self): - for arguments in self.bad_arguments_list: + for arguments in bad_arguments_list: try: - call_wikiq(self.input_file, "--stdout", arguments, out=False) + tester.call_wikiq("--stdout", arguments, out=False) except subprocess.CalledProcessError as exc: # we want to check that the bad arguments were caught and sys.exit is stopping the code print(exc.stderr.decode("utf-8")) else: self.fail("No exception raised, want Exception") - def test_basic_regex(self): - for i, arguments in enumerate(self.good_arguments_list): - test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) - baseline_file, test_file = tmp_test_file(test_filename) + def test_good_regex(self): + # sample arguments for checking the outcomes of good arguments / test_basic_regex + good_arguments_list = [ + "-RP '\\b\\d{3}\\b' -RPl threedigits", + "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", + "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", + "-CP 'WP:EVADE' -CPl wp_evade" + ] + + for i, arguments in enumerate(good_arguments_list): + tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i)) try: - call_wikiq(self.input_file, arguments) + tester.call_wikiq( arguments) except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) + test = pd.read_table(tester.test_file) - baseline = pd.read_table(baseline_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) print(i) def test_capturegroup_regex(self): - for i, arguments in enumerate(self.cap_arguments_list): - test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i)) - baseline_file, test_file = tmp_test_file(test_filename) + cap_arguments_list = [ + "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", + "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov" + ] + + for i, arguments in enumerate(cap_arguments_list): + tester = WikiqTester(wiki=REGEXTEST, case_name="capturegroup", suffix=str(i)) try: - call_wikiq(self.input_file, arguments) + tester.call_wikiq(arguments) except subprocess.CalledProcessError as exc: self.fail(exc.stderr.decode("utf8")) - copyfile(self.call_output, test_file) + copyfile(tester.call_output, tester.test_file) - test = pd.read_table(test_file) + test = pd.read_table(tester.test_file) - baseline = pd.read_table(baseline_file) + baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True)