"""Integration tests for the ``wikiq`` command-line script.

Every test shells out to the ``wikiq`` script that lives one directory above
this file, runs it against a dump stored in ``dumps/`` and compares the
produced TSV with a stored file in ``baseline_output/``.
"""
import os
import shlex
import subprocess
import tracemalloc
import unittest
from io import StringIO
from shutil import copyfile
from typing import Final, Tuple

import pandas as pd
from pandas.testing import assert_frame_equal

# Make references to files and wikiq relative to this file, not to the current working directory.
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")


def setup():
    """Start tracemalloc and make sure ``TEST_OUTPUT_DIR`` exists and is empty.

    This is a one-time, module-level step as opposed to per-test setup.
    """
    tracemalloc.start()

    if not os.path.exists(TEST_OUTPUT_DIR):
        os.mkdir(TEST_OUTPUT_DIR)
        return

    # Avoid subsequent calls to tests interfering with each other.
    # Otherwise, a test may erroneously pass if the program has no output
    # but a previous run output what was expected.
    for f in os.listdir(TEST_OUTPUT_DIR):
        os.remove(os.path.join(TEST_OUTPUT_DIR, f))


# Always run setup, even if this is executed via "python -m unittest" rather
# than as __main__.
setup()


def call_wikiq(input_file: str, *args: str, out: bool = True) -> bytes:
    """Run wikiq through the shell and return what it wrote to stdout.

    :param input_file: Path of the dump handed to wikiq.
    :param args: Extra command-line fragments, appended verbatim. They are
        NOT quoted here because several callers pass fragments that already
        contain shell quoting (e.g. ``"-RP 'NPO V'"``).
    :param out: When true, ``-o TEST_OUTPUT_DIR`` is added to the command.
    :return: The captured standard output of the process.
    :raises subprocess.CalledProcessError: If wikiq exits with a non-zero
        status; the captured stderr is available as ``exc.stderr``.
    """
    # Quote only the path components so that a checkout located in a
    # directory whose name contains spaces still yields a valid command.
    output_args = ["-o", shlex.quote(TEST_OUTPUT_DIR)] if out else []
    call = ' '.join([shlex.quote(WIKIQ), shlex.quote(input_file), *output_args, *args])
    print(call)
    return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)


def _run_wikiq(test_case: unittest.TestCase, input_file: str, *args: str, out: bool = True) -> bytes:
    """Run wikiq and fail ``test_case`` with wikiq's stderr on a non-zero exit."""
    try:
        return call_wikiq(input_file, *args, out=out)
    except subprocess.CalledProcessError as exc:
        test_case.fail(exc.stderr.decode("utf8"))


def tmp_test_file(name: str) -> Tuple[str, str]:
    """
    Removes any existing test output file with the given name and returns the
    paths used to compare a test run against its baseline.

    :param name: The test case name. Should be unique to each test case.
    :return: A ``(baseline_file, test_file)`` pair: the path of ``name`` inside
        ``BASELINE_DIR`` and the path of ``name`` inside ``TEST_OUTPUT_DIR``.
    """
    baseline_file = os.path.join(BASELINE_DIR, name)
    test_file = os.path.join(TEST_OUTPUT_DIR, name)
    if os.path.exists(test_file):
        os.remove(test_file)

    return baseline_file, test_file


# with / without pwr DONE
# with / without url encode DONE
# with / without collapse user DONE
# with output to stdout DONE
# note that the persistence radius is 7 by default
# reading various file formats including
#        7z, gz, bz2, xml  DONE
# wikia and wikipedia data DONE
# malformed xmls DONE

class Test_Wikipedia(unittest.TestCase):
    """Runs wikiq on the bz2-compressed ``ikwiki`` dump."""

    def setUp(self):
        wiki = 'ikwiki-20180301-pages-meta-history'
        self.wikiq_out_name = "{0}.tsv".format(wiki)
        # Where wikiq is expected to leave its output when run with "-o".
        self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name)

        infile = "{0}.xml.bz2".format(wiki)
        self.input_file = os.path.join(TEST_DIR, "dumps", infile)

    def test_WP_url_encode(self):
        """Output produced with ``--url-encode`` equals the baseline."""
        baseline_file, test_file = tmp_test_file("url-encode_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file, "--url-encode")
        copyfile(self.call_output, test_file)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_WP_namespaces(self):
        """With ``-n 0 -n 1`` only namespaces 0 and 1 appear in the output."""
        baseline_file, test_file = tmp_test_file("namespaces_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file, "-n 0", "-n 1")
        copyfile(self.call_output, test_file)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_WP_revert_radius(self):
        """Same namespace filter as above, additionally passing ``-rr 1``."""
        baseline_file, test_file = tmp_test_file("revert_radius_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file, "-n 0", "-n 1", "-rr 1")
        copyfile(self.call_output, test_file)

        # as a test let's make sure that we get equal data frames
        test = pd.read_table(test_file)
        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline, check_like=True)


class Test_Basic(unittest.TestCase):
    """Runs wikiq on the 7z-compressed ``sailormoon`` dump."""

    def setUp(self):
        wiki = 'sailormoon'
        self.wikiq_out_name = wiki + ".tsv"
        # Where wikiq is expected to leave its output when run with "-o".
        self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name)

        infile = "{0}.xml.7z".format(wiki)
        self.input_file = os.path.join(TEST_DIR, "dumps", infile)

    def test_noargs(self):
        """Output produced without extra options equals the baseline."""
        baseline_file, test_file = tmp_test_file("noargs_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file)
        copyfile(self.call_output, test_file)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_collapse_user(self):
        """Output produced with ``--collapse-user`` equals the baseline."""
        baseline_file, test_file = tmp_test_file("collapse-user_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file, "--collapse-user")
        copyfile(self.call_output, test_file)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_pwr_segment(self):
        """Output produced with ``--persistence segment`` equals the baseline."""
        baseline_file, test_file = tmp_test_file("persistence_segment_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file, "--persistence segment")
        copyfile(self.call_output, test_file)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_pwr_legacy(self):
        """Output produced with ``--persistence legacy`` equals the baseline."""
        baseline_file, test_file = tmp_test_file("persistence_legacy_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file, "--persistence legacy")
        copyfile(self.call_output, test_file)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_pwr(self):
        """Output produced with a bare ``--persistence`` equals the baseline."""
        baseline_file, test_file = tmp_test_file("persistence_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file, "--persistence")
        copyfile(self.call_output, test_file)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)

        test = test.reindex(columns=sorted(test.columns))
        assert_frame_equal(test, baseline, check_like=True)

    def test_url_encode(self):
        """Output produced with ``--url-encode`` equals the baseline."""
        baseline_file, test_file = tmp_test_file("url-encode_" + self.wikiq_out_name)

        _run_wikiq(self, self.input_file, "--url-encode")
        copyfile(self.call_output, test_file)

        test = pd.read_table(test_file)
        baseline = pd.read_table(baseline_file)

        test = test.reindex(columns=sorted(test.columns))
        assert_frame_equal(test, baseline, check_like=True)


class Test_Malformed(unittest.TestCase):
    """Runs wikiq on the ``twinpeaks`` dump and expects it to fail."""

    def setUp(self):
        wiki = 'twinpeaks'

        infile = "{0}.xml.7z".format(wiki)
        self.input_file = os.path.join(TEST_DIR, "dumps", infile)

    def test_malformed_noargs(self):
        """wikiq exits non-zero and its last stderr line is the ParseError."""
        want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'

        try:
            call_wikiq(self.input_file)
        except subprocess.CalledProcessError as exc:
            errlines = exc.stderr.decode("utf8").splitlines()
            self.assertEqual(errlines[-1], want_exception)
        else:
            self.fail("No exception raised, want: {}".format(want_exception))


class Test_Stdout(unittest.TestCase):
    """Runs wikiq on the ``sailormoon`` dump with ``--stdout``."""

    def setUp(self):
        wiki = 'sailormoon'
        self.wikiq_out_name = wiki + ".tsv"
        self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name)

        infile = "{0}.xml.7z".format(wiki)
        self.base_call = WIKIQ + " {0} --stdout"
        self.input_file = os.path.join(TEST_DIR, "dumps", infile)

    def test_noargs(self):
        """Data printed with ``--stdout`` equals the ``noargs`` baseline."""
        baseline_file, test_file = tmp_test_file("noargs_" + self.wikiq_out_name)

        outs = _run_wikiq(self, self.input_file, "--stdout", out=False).decode("utf8")

        # This run is made without "-o", so there is no file in
        # TEST_OUTPUT_DIR to copy. Persist the captured stdout instead so the
        # test does not depend on output left behind by another test case.
        with open(test_file, "w", encoding="utf8", newline="") as test_fh:
            test_fh.write(outs)

        test = pd.read_table(StringIO(outs))
        baseline = pd.read_table(baseline_file)
        assert_frame_equal(test, baseline, check_like=True)


class Test_Regex(unittest.TestCase):
    """Runs wikiq on the ``regextest`` dump with regex options (-RP/-RPl/-CP/-CPl)."""

    def setUp(self):
        wiki = 'regextest'
        self.wikiq_out_name = wiki + '.tsv'
        infile = "{0}.xml.bz2".format(wiki)

        self.input_file = os.path.join(TEST_DIR, "dumps", infile)

        # Where wikiq is expected to leave its output when run with "-o".
        self.call_output = os.path.join(TEST_OUTPUT_DIR, self.wikiq_out_name)

        # sample arguments for checking that bad arguments get terminated / test_regex_arguments
        self.bad_arguments_list = [
            # label is missing
            "-RP '\\b\\d+\\b'",
            # number of reg and number of labels do not match
            "-RP 'NPO V' -RP THE -RPl testlabel",
            # cp but rp label
            "-CP '(Tamil|Li)' -RPl testlabel",
            # regex is missing
            "-CPl testlabel",
            "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
        ]

        # sample arguments for checking the outcomes of good arguments / test_basic_regex
        self.good_arguments_list = [
            "-RP '\\b\\d{3}\\b' -RPl threedigits",
            "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
            "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
            "-CP 'WP:EVADE' -CPl wp_evade"
        ]

        # NOTE(review): in Python regex syntax "(?P" is normally followed by
        # "<name>"; these patterns carry no group names and look as if the
        # "<...>" parts were lost somewhere. They are kept byte-for-byte as
        # found -- confirm against the capturegroup baseline files.
        self.cap_arguments_list = [
            "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three",
            "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov"
        ]

    def test_regex_arguments(self):
        """Every entry of ``bad_arguments_list`` makes wikiq exit non-zero."""
        for arguments in self.bad_arguments_list:
            # One sub-test per argument set so a failure names the offender
            # and the remaining argument sets are still exercised.
            with self.subTest(arguments=arguments):
                try:
                    call_wikiq(self.input_file, "--stdout", arguments, out=False)
                except subprocess.CalledProcessError as exc:
                    # we want to check that the bad arguments were caught and sys.exit is stopping the code
                    print(exc.stderr.decode("utf-8"))
                else:
                    self.fail("No exception raised, want Exception")

    def test_basic_regex(self):
        """Each entry of ``good_arguments_list`` reproduces its baseline."""
        for i, arguments in enumerate(self.good_arguments_list):
            with self.subTest(i=i, arguments=arguments):
                test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
                baseline_file, test_file = tmp_test_file(test_filename)

                _run_wikiq(self, self.input_file, arguments)
                copyfile(self.call_output, test_file)

                test = pd.read_table(test_file)
                baseline = pd.read_table(baseline_file)
                assert_frame_equal(test, baseline, check_like=True)

    def test_capturegroup_regex(self):
        """Each entry of ``cap_arguments_list`` reproduces its baseline."""
        for i, arguments in enumerate(self.cap_arguments_list):
            with self.subTest(i=i, arguments=arguments):
                test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
                baseline_file, test_file = tmp_test_file(test_filename)

                _run_wikiq(self, self.input_file, arguments)
                copyfile(self.call_output, test_file)

                test = pd.read_table(test_file)
                baseline = pd.read_table(baseline_file)
                assert_frame_equal(test, baseline, check_like=True)


if __name__ == '__main__':
    unittest.main()