Merge branch 'parquet_support' of gitea:collective/mediawiki_dump_tools into parquet_support

2025-05-28 21:09:13 -07:00
parent 15e9234903 df0ad1de63
commit 383ee03250
6 changed files with 492 additions and 487 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,10 @@
 *.xml.bz2
 *.xml.xz
 *.swp
+
+# JetBrains
+/.idea
+
+# Python build and test output
+__pycache__/
+test_output/
--- a/README.rst
+++ b/README.rst
@@ -12,6 +12,19 @@ submodule like::
 Wikimedia dumps are usually in a compressed format such as 7z (most common), gz, or bz2. Wikiq uses your computer's compression software to read these files. Therefore wikiq depends on
 `7za`, `gzcat`, and `zcat`. 

+Dependencies
+----------------
+These non-Python dependencies must be installed on your system for wikiq and its
+associated tests to work.
+
+- 7zip
+- ffmpeg
+
+Tests
+-----
+To run tests::
+
+   python -m unittest test.Wikiq_Unit_Test

 TODO:
 _______________
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,39 @@
+attrs==25.3.0
+certifi==2025.4.26
+charset-normalizer==3.4.2
+Cython==0.29.37
+deltas==0.7.0
+docopt==0.6.2
+gnureadline==8.1.2
+idna==3.10
+jsonable==0.3.1
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+mediawiki-utilities==0.4.18
+mwcli==0.0.3
+mwdiffs==0.0.2
+mwpersistence==0.2.4
+mwreverts==0.1.5
+mwtypes==0.4.0
+mwxml==0.3.6
+pandas==2.2.3
+para==0.0.8
+parsimonious==0.10.0
+pyarrow==20.0.0
+pydub==0.25.1
+PyMySQL==1.1.1
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==5.4.1
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rpds-py==0.25.1
+setuptools==80.8.0
+six==1.17.0
+stopit==1.1.2
+typing_extensions==4.13.2
+tzdata==2025.2
+urllib3==2.4.0
+wheel==0.45.1
+yamlconf==0.2.6
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -5,312 +5,249 @@ from shutil import copyfile
 import pandas as pd
 from pandas.testing import assert_frame_equal
 from io import StringIO
+import tracemalloc
+from typing import Final
+
+# Make references to files and wikiq relative to this file, not to the current working directory.
+TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
+WIKIQ: Final[str] = os.path.join(os.path.dirname(TEST_DIR), "wikiq")
+TEST_OUTPUT_DIR: Final[str] = os.path.join(TEST_DIR, "test_output")
+BASELINE_DIR: Final[str] = os.path.join(TEST_DIR, "baseline_output")
+
+IKWIKI: Final[str] = "ikwiki-20180301-pages-meta-history"
+SAILORMOON: Final[str] = "sailormoon"
+TWINPEAKS: Final[str] = "twinpeaks"
+REGEXTEST: Final[str] = "regextest"
+
+def setup():
+    """Start tracemalloc and make sure the test output directory exists."""
+    tracemalloc.start()
+
+    # One-time setup rather than per-test. exist_ok avoids the check-then-create race of exists() + mkdir().
+    os.makedirs(TEST_OUTPUT_DIR, exist_ok=True)
+
+
+# Always run setup, even if this is executed via "python -m unittest" rather
+# than as __main__.
+setup()
+
+
+class WikiqTester:
+    """Resolves the dump, output and baseline paths for one wikiq test case and runs wikiq on the dump."""
+    def __init__(self,
+                 wiki: str,
+                 case_name: str | None = None,
+                 suffix: str | None = None,
+                 in_compression: str = "bz2",
+                 out_format: str = "tsv",
+                 ):
+        self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
+        # wikiq_out_name carries the optional suffix; call_output (presumably the file wikiq writes) does not.
+        if suffix is None:
+            self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
+        else:
+            self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
+        self.call_output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(wiki, out_format))
+
+        # If case_name is unset, there are no relevant baseline or test files.
+        if case_name is not None:
+            self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
+            self.test_file = os.path.join(TEST_OUTPUT_DIR, "{0}_{1}".format(case_name, self.wikiq_out_name))
+            if os.path.exists(self.test_file):
+                os.remove(self.test_file)
+
+    def call_wikiq(self, *args: str, out: bool = True):
+        """
+        Calls wikiq with the passed arguments on the input file relevant to the test.
+        :param args: The command line arguments to pass to wikiq, as shell fragments that are passed through as-is.
+        :param out: Whether to pass an output argument to wikiq.
+        :return: The captured stdout (bytes) of the wikiq call; raises subprocess.CalledProcessError on non-zero exit.
+        """
+        import shlex  # local import keeps this change self-contained
+        # Quote only the paths built here (a checkout path may contain spaces); args arrive already shell-quoted.
+        out_args = ["-o", shlex.quote(TEST_OUTPUT_DIR)] if out else []
+        call = ' '.join([shlex.quote(WIKIQ), shlex.quote(self.input_file), *out_args, *args])
+        print(call)
+        return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)

 # with / without pwr DONE
 # with / without url encode DONE
 # with / without collapse user DONE
-# with output to sdtout DONE
+# with output to stdout DONE
 # note that the persistence radius is 7 by default
 # reading various file formats including
 #        7z, gz, bz2, xml  DONE
 # wikia and wikipedia data DONE
 # malformed xmls DONE

-class Test_Wikipedia(unittest.TestCase):
-    def setUp(self):
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
-        self.wiki = 'ikwiki-20180301-pages-meta-history'
-        self.wikiq_out_name =  self.wiki + ".tsv"
-        self.test_output_dir = os.path.join(".", "test_output")
-        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-
-        self.infile = "{0}.xml.bz2".format(self.wiki)    
-        self.base_call = "../wikiq {0} -o {1}"
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-        self.baseline_output_dir = "baseline_output"
-
+class WikiqTestCase(unittest.TestCase):
    def test_WP_url_encode(self):
-        test_filename =  "url-encode_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
+        tester = WikiqTester(IKWIKI, "url-encode")

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --url-encode"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
+        try:
+            tester.call_wikiq("--url-encode")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))

-        copyfile(self.call_output, test_file)
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
+        copyfile(tester.call_output, tester.test_file)

        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
+        test = pd.read_table(tester.test_file)
+        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_WP_namespaces(self):
-        print(os.path.abspath('.'))
-        test_filename =  "namespaces_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
+        tester = WikiqTester(IKWIKI, "namespaces")

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " -n 0 -n 1"
-        print(call)
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-        copyfile(self.call_output, test_file)
-        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
+        try:
+            tester.call_wikiq("-n 0", "-n 1")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))
+
+        copyfile(tester.call_output, tester.test_file)

        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(test_file)
+        test = pd.read_table(tester.test_file)
        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)
-        baseline = pd.read_table(baseline_file)
+        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_WP_revert_radius(self):
-        print(os.path.abspath('.'))
-        test_filename =  "revert_radius_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
+        tester = WikiqTester(IKWIKI, "revert_radius")

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " -n 0 -n 1 -rr 1"
-        print(call)
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-        copyfile(self.call_output, test_file)
-        baseline_file = os.path.join(os.path.abspath("."), self.baseline_output_dir, test_filename)
+        try:
+            tester.call_wikiq("-n 0", "-n 1", "-rr 1")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))
+
+        copyfile(tester.call_output, tester.test_file)

        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(test_file)
+        test = pd.read_table(tester.test_file)
        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
        self.assertEqual(num_wrong_ns, 0)
-        baseline = pd.read_table(baseline_file)
+        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

-
-
-class Test_Basic(unittest.TestCase):
-
-    def setUp(self):
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
-        self.wiki = 'sailormoon'
-        self.wikiq_out_name =  self.wiki + ".tsv"
-        self.test_output_dir = os.path.join(".", "test_output")
-        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-
-        self.infile = "{0}.xml.7z".format(self.wiki)
-        self.base_call = "../wikiq {0} -o {1}"
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-        self.baseline_output_dir = "baseline_output"
-
    def test_noargs(self):
+        tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")

-        test_filename =  "noargs_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
+        try:
+            tester.call_wikiq()
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
+        copyfile(tester.call_output, tester.test_file)

-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
+        test = pd.read_table(tester.test_file)
+        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

-
    def test_collapse_user(self):
-        test_filename =  "collapse-user_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
+        tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --collapse-user"
+        try:
+            tester.call_wikiq("--collapse-user")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))

-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
+        copyfile(tester.call_output, tester.test_file)

-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
+        test = pd.read_table(tester.test_file)
+        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_pwr_segment(self):
-        test_filename =  "persistence_segment_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
+        tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --persistence segment"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
+        try:
+            tester.call_wikiq("--persistence segment")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))

+        copyfile(tester.call_output, tester.test_file)

-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
+        test = pd.read_table(tester.test_file)
+        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_pwr_legacy(self):
-        test_filename =  "persistence_legacy_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
+        tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --persistence legacy"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
+        try:
+            tester.call_wikiq("--persistence legacy")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))

+        copyfile(tester.call_output, tester.test_file)

-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
+        test = pd.read_table(tester.test_file)
+        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

    def test_pwr(self):
-        test_filename =  "persistence_" + self.wikiq_out_name
-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file): 
-           os.remove(test_file)
+        tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --persistence"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
+        try:
+            tester.call_wikiq("--persistence")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))

+        copyfile(tester.call_output, tester.test_file)

-        copyfile(self.call_output, test_file)
-
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
+        test = pd.read_table(tester.test_file)
+        baseline = pd.read_table(tester.baseline_file)

        test = test.reindex(columns=sorted(test.columns))
        assert_frame_equal(test, baseline, check_like=True)

-
    def test_url_encode(self):
-        test_filename =  "url-encode_" + self.wikiq_out_name
+        tester = WikiqTester(SAILORMOON, "url-encode", in_compression="7z")

-        test_file = os.path.join(self.test_output_dir, test_filename)
-        if os.path.exists(test_file):
-            os.remove(test_file)
+        try:
+            tester.call_wikiq("--url-encode")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        call = call + " --url-encode"
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,shell=True)
-        proc.wait()
-
-        copyfile(self.call_output, test_file)
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-        test = pd.read_table(test_file)
-        baseline = pd.read_table(baseline_file)
+        copyfile(tester.call_output, tester.test_file)
+        test = pd.read_table(tester.test_file)
+        baseline = pd.read_table(tester.baseline_file)

        test = test.reindex(columns=sorted(test.columns))
        assert_frame_equal(test, baseline, check_like=True)

-
-class Test_Malformed(unittest.TestCase):
-    def setUp(self):
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
-        self.wiki = 'twinpeaks'
-        self.wikiq_out_name =  self.wiki + ".tsv"
-        self.test_output_dir = os.path.join(".", "test_output")
-        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name) 
-
-        self.infile = "{0}.xml.7z".format(self.wiki)
-        self.base_call = "../wikiq {0} -o {1}"
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-
-
    def test_malformed_noargs(self):
+        tester = WikiqTester(wiki=TWINPEAKS, in_compression="7z")
+        want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'

-        call = self.base_call.format(self.input_file, self.test_output_dir)
-        proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True)
-        proc.wait()
-        outs, errs = proc.communicate()
-        errlines = str(errs).split("\\n")
-        self.assertEqual(errlines[-2],'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0')
+        try:
+            tester.call_wikiq()
+        except subprocess.CalledProcessError as exc:
+            errlines = exc.stderr.decode("utf8").splitlines()
+            self.assertEqual(errlines[-1], want_exception)
+        else:
+            self.fail("No exception raised, want: {}".format(want_exception))

-class Test_Stdout(unittest.TestCase):
+    def test_stdout_noargs(self):
+        tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")

-    def setUp(self):
-        self.wiki = 'sailormoon'
-        self.wikiq_out_name =  self.wiki + ".tsv"
+        try:
+            outs = tester.call_wikiq( "--stdout", out=False).decode("utf8")
+        except subprocess.CalledProcessError as exc:
+            self.fail(exc.stderr.decode("utf8"))

-        self.infile = "{0}.xml.7z".format(self.wiki)
-        self.base_call = "../wikiq {0} --stdout"
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-        self.baseline_output_dir = "baseline_output"
+        copyfile(tester.call_output, tester.test_file)

-    def test_noargs(self):
-
-        call = self.base_call.format(self.input_file)
-        print(call)
-        proc = subprocess.run(call,stdout=subprocess.PIPE,shell=True)
-        outs = proc.stdout.decode("utf8")
-
-        test_file = "noargs_" + self.wikiq_out_name
-        baseline_file = os.path.join(".", self.baseline_output_dir, test_file)
-        print(baseline_file)
        test = pd.read_table(StringIO(outs))
-        baseline = pd.read_table(baseline_file)
+        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

-class Test_Regex(unittest.TestCase):
+    def test_bad_regex(self):
+        tester = WikiqTester(wiki=REGEXTEST)

-    def setUp(self):
-        self.wiki = 'regextest'
-        self.wikiq_out_name = self.wiki + '.tsv'
-        self.infile = "{0}.xml.bz2".format(self.wiki)
-
-        self.input_dir = "dumps"
-        self.input_file = os.path.join(".", self.input_dir,self.infile)
-
-        if not os.path.exists("test_output"):
-            os.mkdir("test_output")
-
-        self.test_output_dir = os.path.join(".", "test_output")
-        self.call_output = os.path.join(self.test_output_dir, self.wikiq_out_name)
-        # we have two base calls, one for checking inputs and the other for checking outputs
-        self.base_call = "../wikiq {0}"
-        self.base_call_outs = "../wikiq {0} -o {1}"
-
-        self.baseline_output_dir = "baseline_output"
-
-        # sample inputs for checking that bad inputs get terminated / test_regex_inputs
-        self.bad_inputs_list = [
+        # sample arguments for checking that bad arguments get terminated / test_regex_arguments
+        bad_arguments_list = [
            # label is missing
            "-RP '\\b\\d+\\b'",
            # number of reg and number of labels do not match
@@ -322,81 +259,59 @@ class Test_Regex(unittest.TestCase):
            "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
        ]

-        # sample inputs for checking the outcomes of good inputs / test_basic_regex
-        self.good_inputs_list = [
+        for arguments in bad_arguments_list:
+            try:
+                tester.call_wikiq("--stdout", arguments, out=False)
+            except subprocess.CalledProcessError as exc:
+                # we want to check that the bad arguments were caught and sys.exit is stopping the code
+                print(exc.stderr.decode("utf-8"))
+            else:
+                self.fail("No exception raised, want Exception")
+
+    def test_good_regex(self):
+        # sample arguments for checking the outcomes of good arguments / test_basic_regex
+        good_arguments_list = [
            "-RP '\\b\\d{3}\\b' -RPl threedigits",
            "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
            "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
            "-CP 'WP:EVADE' -CPl wp_evade"
        ]

+        for i, arguments in enumerate(good_arguments_list):
+            tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i))

-        self.cap_inputs_list = [
+            try:
+                tester.call_wikiq( arguments)
+            except subprocess.CalledProcessError as exc:
+                self.fail(exc.stderr.decode("utf8"))
+
+            copyfile(tester.call_output, tester.test_file)
+
+            test = pd.read_table(tester.test_file)
+
+            baseline = pd.read_table(tester.baseline_file)
+            assert_frame_equal(test, baseline, check_like=True)
+            print(i)
+
+    def test_capturegroup_regex(self):
+        cap_arguments_list = [
            "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
            "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"
        ]

+        for i, arguments in enumerate(cap_arguments_list):
+            tester = WikiqTester(wiki=REGEXTEST, case_name="capturegroup", suffix=str(i))

+            try:
+                tester.call_wikiq(arguments)
+            except subprocess.CalledProcessError as exc:
+                self.fail(exc.stderr.decode("utf8"))

-    def test_regex_inputs(self):
-        for input in self.bad_inputs_list:
-            call = self.base_call.format(self.input_file)
-            call = call + " --stdout " + input
-            print(call)
-            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
-            stdout,stderr = proc.communicate()
-            #print(proc.returncode)
+            copyfile(tester.call_output, tester.test_file)

-            # we want to check that the bad inputs were caught and sys.exit is stopping the code
-            print(stderr.decode("utf-8"))
-            self.assertNotEqual(proc.returncode,0)
+            test = pd.read_table(tester.test_file)

-    def test_basic_regex(self):
-        for i, input in enumerate(self.good_inputs_list):
-
-            test_filename = "basic_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
-            #print(test_filename)
-            test_file = os.path.join(self.test_output_dir, test_filename)
-            if os.path.exists(test_file):
-                os.remove(test_file)
-
-            call = self.base_call_outs.format(self.input_file, self.test_output_dir)
-            call = call + " " + input
-            print(call)
-
-            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
-            proc.wait()
-            copyfile(self.call_output, test_file)
-
-            test = pd.read_table(test_file)
-            
-            baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-            baseline = pd.read_table(baseline_file)
-            assert_frame_equal(test, baseline, check_like=True)
-            print(i)
-
-
-    def test_capturegroup_regex(self):
-        for i, input in enumerate(self.cap_inputs_list):
-            test_filename = "capturegroup_{0}_{1}.tsv".format(self.wikiq_out_name[:-4], str(i))
-            print(test_filename)
-            test_file = os.path.join(self.test_output_dir, test_filename)
-            if os.path.exists(test_file):
-                os.remove(test_file)
-
-            call = self.base_call_outs.format(self.input_file, self.test_output_dir)
-            call = call + " " + input
-            print(call)
-
-            proc = subprocess.Popen(call,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
-            proc.wait()
-
-            copyfile(self.call_output, test_file)
-            
-            test = pd.read_table(test_file)
-            
-            baseline_file = os.path.join(".", self.baseline_output_dir, test_filename)
-            baseline = pd.read_table(baseline_file)
+            baseline = pd.read_table(tester.baseline_file)
            assert_frame_equal(test, baseline, check_like=True)


--- a/test/init.py
+++ b/test/init.py
--- a/97
+++ b/97
@@ -6,7 +6,7 @@

 import argparse
 import sys
-import os, os.path
+import os.path
 import re
 from datetime import datetime, timezone

@@ -20,6 +20,7 @@ from deltas.tokenizers import wikitext_split
 import mwpersistence
 import mwreverts
 from urllib.parse import quote
+
 TO_ENCODE = ('title', 'editor')
 PERSISTENCE_RADIUS = 7
 from deltas import SequenceMatcher
@@ -30,17 +31,20 @@ from dataclasses import dataclass
 import pyarrow as pa
 import pyarrow.parquet as pq

+
 class PersistMethod:
    none = 0
    sequence = 1
    segment = 2
    legacy = 3

+
 def calculate_persistence(tokens_added):
    return (sum([(len(x.revisions) - 1) for x in tokens_added]),
            len(tokens_added))

-class WikiqIterator():
+
+class WikiqIterator:
    def __init__(self, fh, collapse_user=False):
        self.fh = fh
        self.collapse_user = collapse_user
@@ -59,9 +63,10 @@ class WikiqIterator():
        return self.__pages

    def __next__(self):
-        return next(self._pages)
+        return next(self.__pages)

-class WikiqPage():
+
+class WikiqPage:
    __slots__ = ('id', 'title', 'namespace', 'redirect',
                 'restrictions', 'mwpage', '__revisions',
                 'collapse_user')
@@ -92,6 +97,7 @@ class WikiqPage():
        #         3          A               B            True
        #         4          A               A           False
        # Post-loop                          A          Always
+        collapsed_revs = 0
        for i, rev in enumerate(self.mwpage):
            # never yield the first time
            if i == 0:
@@ -138,6 +144,8 @@ A RegexPair is defined by a regular expression (pattern) and a label.
 The pattern can include capture groups.  If it does then each capture group will have a resulting column in the output.
 If the pattern does not include a capture group, then only one output column will result.
 """
+
+
 class RegexPair(object):
    def __init__(self, pattern, label):
        self.pattern = re.compile(pattern)
@@ -156,7 +164,7 @@ class RegexPair(object):
        return fields

    def _make_key(self, cap_group):
-        return ("{}_{}".format(self.label, cap_group))
+        return "{}_{}".format(self.label, cap_group)

    def matchmake(self, content, rev_data):

@@ -174,7 +182,7 @@ class RegexPair(object):
                    temp_list = []
                    for match in matchobjects:
                        # we only want to add the match for the capture group if the match is not None
-                        if match.group(cap_group) != None:
+                        if match.group(cap_group) is not None:
                            temp_list.append(match.group(cap_group))

                    # if temp_list of matches is empty just make that column None
@@ -206,6 +214,7 @@ class RegexPair(object):

        return rev_data

+
 """

 We used to use a dictionary to collect fields for the output. 
@@ -222,8 +231,10 @@ It also needs to have the correct pyarrow schema so we can write parquet files.

 The RevDataBase type has all the fields that will be output no matter how wikiq is invoked.
 """
+
+
@dataclass()
-class RevDataBase():
+class RevDataBase:
    revid: int
    date_time: datetime
    articleid: int
@@ -304,6 +315,7 @@ class RevDataBase():
    def header_row(self):
        return '\t'.join(map(lambda f: f.name, dc.fields(self)))

+
 """

 If collapse=True we'll use a RevDataCollapse dataclass.
@@ -312,6 +324,8 @@ This class inherits from RevDataBase. This means that it has all the same fields
 It just adds a new field and updates the pyarrow schema.

 """
+
+
@dataclass()
 class RevDataCollapse(RevDataBase):
    collapsed_revs: int = None
@@ -319,11 +333,14 @@ class RevDataCollapse(RevDataBase):
    pa_collapsed_revs_schema = pa.field('collapsed_revs', pa.int64())
    pa_schema_fields = RevDataBase.pa_schema_fields + [pa_collapsed_revs_schema]

+
 """

 If persistence data is to be computed we'll need the fields added by RevDataPersistence. 

 """
+
+
@dataclass()
 class RevDataPersistence(RevDataBase):
    token_revs: int = None
@@ -339,16 +356,22 @@ class RevDataPersistence(RevDataBase):

    pa_schema_fields = RevDataBase.pa_schema_fields + pa_persistence_schema_fields

-"""
-class RevDataCollapsePersistence uses multiple inheritence to make a class that has both persistence and collapse fields.

 """
+class RevDataCollapsePersistence uses multiple inheritance to make a class that has both persistence and collapse fields.
+
+"""
+
+
@dataclass()
 class RevDataCollapsePersistence(RevDataCollapse, RevDataPersistence):
    pa_schema_fields = RevDataCollapse.pa_schema_fields + RevDataPersistence.pa_persistence_schema_fields

-class WikiqParser():
-    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label, regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces = None, revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
+
+class WikiqParser:
+    def __init__(self, input_file, output_file, regex_match_revision, regex_match_comment, regex_revision_label,
+                 regex_comment_label, collapse_user=False, persist=None, urlencode=False, namespaces=None,
+                 revert_radius=15, output_parquet=True, parquet_buffer_size=2000):
        """ 
        Parameters:
           persist : what persistence method to use. Takes a PersistMethod value
@@ -370,7 +393,6 @@ class WikiqParser():
        self.regex_revision_pairs = self.make_matchmake_pairs(regex_match_revision, regex_revision_label)
        self.regex_comment_pairs = self.make_matchmake_pairs(regex_match_comment, regex_comment_label)

-
        # This is where we set the type for revdata.

        if self.collapse_user is True:
@@ -424,7 +446,7 @@ class WikiqParser():
                result.append(rp)
                self.regex_schemas = self.regex_schemas + rp.get_pyarrow_fields()
            return result
-        elif (patterns is None and labels is None):
+        elif (patterns is None) and (labels is None):
            return []
        else:
            sys.exit('Each regular expression *must* come with a corresponding label and vice versa.')
@@ -450,7 +472,7 @@ class WikiqParser():

        for ns in self.namespaces:
            # skip if the namespace is not defined
-            if ns == None:
+            if ns is None:
                default_ns = self.namespaces[ns]
                continue

@@ -460,7 +482,6 @@ class WikiqParser():
        # if we've made it this far with no matches, we return the default namespace
        return default_ns

-
    def process(self):

        # create a regex that creates the output filename
@@ -471,13 +492,12 @@ class WikiqParser():
        # Construct dump file iterator
        dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)

-        # extract list of namspaces
+        # extract list of namespaces
        self.namespaces = {ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces}

        page_count = 0
        rev_count = 0

-
        # Iterate through pages
        for page in dump:
            namespace = page.namespace if page.namespace is not None else self.__get_namespace_from_title(page.title)
@@ -560,7 +580,7 @@ class WikiqParser():

                # TODO missing: additions_size deletions_size

-                # if collapse user was on, lets run that
+                # if collapse user was on, let's run that
                if self.collapse_user:
                    rev_data.collapsed_revs = rev.collapsed_revs

@@ -622,11 +642,11 @@ class WikiqParser():
        else:
            self.output_file.close()

-
    """
    For performance reasons it's better to write parquet in batches instead of one row at a time.
    So this function just puts the data on a buffer. If the buffer is full, then it gets flushed (written).
    """
+
    def write_parquet_row(self, rev_data):
        padata = rev_data.to_pyarrow()
        self.parquet_buffer.append(padata)
@@ -634,16 +654,17 @@ class WikiqParser():
        if len(self.parquet_buffer) >= self.parquet_buffer_size:
            self.flush_parquet_buffer()

-
    """
    Function that actually writes data to the parquet file. 
    It needs to transpose the data from row-by-row to column-by-column
    """
+
    def flush_parquet_buffer(self):

        """
        Returns the pyarrow table that we'll write
        """
+
        def rows_to_table(rg, schema):
            cols = []
            first = rg[0]
@@ -661,7 +682,7 @@ class WikiqParser():

        outtable = rows_to_table(self.parquet_buffer, self.schema)
        if self.pq_writer is None:
-            self.pq_writer = pq.ParquetWriter(self.output_file, schema, flavor='spark')
+            self.pq_writer = pq.ParquetWriter(self.output_file, self.schema, flavor='spark')

        self.pq_writer.write_table(outtable)
        self.parquet_buffer = []
@@ -691,13 +712,14 @@ def open_input_file(input_filename):
        cmd = ["zcat", input_filename]
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
+    else:
+        raise ValueError("Unrecognized file type: %s" % input_filename)

    try:
-        input_file = Popen(cmd, stdout=PIPE).stdout
+        return Popen(cmd, stdout=PIPE).stdout
    except NameError:
-        input_file = open(input_filename, 'r')
+        return open(input_filename, 'r')

-    return input_file

 def get_output_filename(input_filename, parquet=False):
    output_filename = re.sub(r'\.(7z|gz|bz2)?$', '', input_filename)
@@ -708,13 +730,16 @@ def get_output_filename(input_filename, parquet = False):
        output_filename = output_filename + ".parquet"
    return output_filename

+
 def open_output_file(input_filename):
    # create a regex that creates the output filename
    output_filename = get_output_filename(input_filename, parquet=False)
    output_file = open(output_filename, "w")
    return output_file

-parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimitted data.')
+
+def main():
+    parser = argparse.ArgumentParser(description='Parse MediaWiki XML database dumps into tab delimited data.')

    # arguments for the input direction
    parser.add_argument('dumpfiles', metavar="DUMPFILE", nargs="*", type=str,
@@ -729,14 +754,15 @@ parser.add_argument('-s', '--stdout', dest="stdout", action="store_true",
    parser.add_argument('--collapse-user', dest="collapse_user", action="store_true",
                        help="Operate only on the final revision made by user a user within all sequences of consecutive edits made by a user. This can be useful for addressing issues with text persistence measures.")

-parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str, choices = ['','segment','sequence','legacy'], nargs='?',
-                    help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  The defualt is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")
+    parser.add_argument('-p', '--persistence', dest="persist", default=None, const='', type=str,
+                        choices=['', 'segment', 'sequence', 'legacy'], nargs='?',
+                        help="Compute and report measures of content persistent: (1) persistent token revisions, (2) tokens added, and (3) number of revision used in computing the first measure. This may by slow.  The default is -p=sequence, which uses the same algorithm as in the past, but with improvements to wikitext parsing. Use -p=legacy for old behavior used in older research projects. Use -p=segment for advanced persistence calculation method that is robust to content moves, but prone to bugs, and slower.")

    parser.add_argument('-u', '--url-encode', dest="urlencode", action="store_true",
                        help="Output url encoded text strings. This works around some data issues like newlines in editor names. In the future it may be used to output other text data.")

    parser.add_argument('-n', '--namespace-include', dest="namespace_filter", type=int, action='append',
-                    help="Id number of namspace to include. Can be specified more than once.")
+                        help="Id number of namespace to include. Can be specified more than once.")

    parser.add_argument('-rr',
                        '--revert-radius',
@@ -746,22 +772,23 @@ parser.add_argument('-rr',
                        default=15,
                        help="Number of edits to check when looking for reverts (default: 15)")

-parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str, action='append',
+    parser.add_argument('-RP', '--revision-pattern', dest="regex_match_revision", default=None, type=str,
+                        action='append',
                        help="The regular expression to search for in revision text. The regex must be surrounded by quotes.")

-parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str, action='append',
+    parser.add_argument('-RPl', '--revision-pattern-label', dest="regex_revision_label", default=None, type=str,
+                        action='append',
                        help="The label for the outputted column based on matching the regex in revision text.")

    parser.add_argument('-CP', '--comment-pattern', dest="regex_match_comment", default=None, type=str, action='append',
                        help="The regular expression to search for in comments of revisions.")

-parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str, action='append',
+    parser.add_argument('-CPl', '--comment-pattern-label', dest="regex_comment_label", default=None, type=str,
+                        action='append',
                        help="The label for the outputted column based on matching the regex in comments.")

    args = parser.parse_args()

-
-
    # set persistence method

    if args.persist is None:
@@ -836,3 +863,7 @@ else:

    # stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
    # stop_words = stop_words.split(",")
+
+
+if __name__ == "__main__":
+    main()