add (optional) diff and text columns to output.

2025-07-07 14:39:52 -07:00 · 2025-07-07 14:39:52 -07:00 · d6c4c0a416
commit d6c4c0a416
parent a8e9e7f4fd
4 changed files with 373 additions and 295 deletions
--- a/tables.py
+++ b/tables.py
@ -218,3 +218,11 @@ class RevisionCollapsed(RevisionField[int]):
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        return len(revisions)
 class RevisionText(RevisionField[str]):
    """Revision field that emits a revision's text into a ``text`` column."""
    # Arrow column definition: string-typed and declared non-nullable.
    field = pa.field("text", pa.string(), nullable=False)
    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        """Return the ``text`` attribute of the last entry in ``revisions``.

        ``page`` is accepted for interface compatibility and is not used.
        """
        # NOTE(review): the column is non-nullable -- confirm that ``text``
        # can never be None for the revisions that reach this point.
        return revisions[-1].text
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@ -1,15 +1,14 @@
 import shutil
 import unittest
 import os
 import shutil
 import subprocess
-
+import tracemalloc
 from io import StringIO
 from typing import Final, Union
 import pytest
 import numpy as np
 import pandas as pd
 from pandas import DataFrame
 from pandas.testing import assert_frame_equal, assert_series_equal
 from io import StringIO
 import tracemalloc
 from typing import Final, Union
 # Make references to files and wikiq relative to this file, not to the current working directory.
 TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
@ -37,7 +36,8 @@ setup()
 class WikiqTester:
-    def __init__(self,
+    def __init__(
        self,
        wiki: str,
        case_name: str,
        suffix: Union[str, None] = None,
@ -45,13 +45,17 @@ class WikiqTester:
        baseline_format: str = "tsv",
        out_format: str = "tsv",
    ):
-        self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
+        self.input_file = os.path.join(
            TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)
        )
        basename = "{0}_{1}".format(case_name, wiki)
        if suffix:
            basename = "{0}_{1}".format(basename, suffix)
-        self.output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format))
+        self.output = os.path.join(
            TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format)
        )
        if os.path.exists(self.output):
            if os.path.isfile(self.output):
@ -66,12 +70,16 @@ class WikiqTester:
            self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)
            self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
        else:
-            self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format)
+            self.wikiq_baseline_name = "{0}_{1}.{2}".format(
                wiki, suffix, baseline_format
            )
            self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
        # If case_name is unset, there are no relevant baseline or test files.
        if case_name is not None:
-            self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name))
+            self.baseline_file = os.path.join(
                BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)
            )
    def call_wikiq(self, *args: str, out: bool = True):
        """
@ -81,74 +89,72 @@ class WikiqTester:
        :return: The output of the wikiq call.
        """
        if out:
-            call = ' '.join([WIKIQ, self.input_file, "-o", self.output, *args])
+            call = " ".join([WIKIQ, self.input_file, "-o", self.output, *args])
        else:
-            call = ' '.join([WIKIQ, self.input_file, *args])
+            call = " ".join([WIKIQ, self.input_file, *args])
        print(call)
        return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)
    # with / without pwr DONE
    # with / without url encode DONE
    # with / without collapse user DONE
    # with output to stdout DONE
    # note that the persistence radius is 7 by default
    # reading various file formats including
    #        7z, gz, bz2, xml  DONE
    # wikia and wikipedia data DONE
    # malformed xmls DONE
-# with / without pwr DONE
+def test_WP_noargs():
 # with / without url encode DONE
 # with / without collapse user DONE
 # with output to stdout DONE
 # note that the persistence radius is 7 by default
 # reading various file formats including
 #        7z, gz, bz2, xml  DONE
 # wikia and wikipedia data DONE
 # malformed xmls DONE
 class WikiqTestCase(unittest.TestCase):
    def test_WP_noargs(self):
    tester = WikiqTester(IKWIKI, "noargs")
    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_table(tester.output)
    baseline = pd.read_table(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
-    def test_WP_namespaces(self):
+def test_WP_namespaces():
    tester = WikiqTester(IKWIKI, "namespaces")
    try:
        tester.call_wikiq("-n 0", "-n 1")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    # as a test let's make sure that we get equal data frames
    test = pd.read_table(tester.output)
-        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
+    num_wrong_ns = sum(~test.namespace.isin({0, 1}))
    self.assertEqual(num_wrong_ns, 0)
    baseline = pd.read_table(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
-    def test_WP_revert_radius(self):
+def test_WP_revert_radius():
    tester = WikiqTester(IKWIKI, "revert_radius")
    try:
        tester.call_wikiq("-n 0", "-n 1", "-rr 1")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    # as a test let's make sure that we get equal data frames
    test = pd.read_table(tester.output)
-        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
+    num_wrong_ns = sum(~test.namespace.isin({0, 1}))
    self.assertEqual(num_wrong_ns, 0)
    baseline = pd.read_table(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
-    def test_WP_no_revert_radius(self):
+def test_WP_no_revert_radius():
    tester = WikiqTester(IKWIKI, "no_revert_radius")
    try:
        tester.call_wikiq("-rr 0")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    # as a test let's make sure that we get equal data frames
    test = pd.read_table(tester.output)
@ -157,85 +163,85 @@ class WikiqTestCase(unittest.TestCase):
    baseline = pd.read_table(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
-    def test_WP_collapse_user(self):
+def test_WP_collapse_user():
    """Run wikiq with ``--collapse-user`` on the IKWIKI dump and compare with the baseline.

    Fails with wikiq's stderr if the subprocess exits with an error.
    """
    tester = WikiqTester(IKWIKI, "collapse_user")
    try:
        tester.call_wikiq("--collapse-user")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_table(tester.output)
    baseline = pd.read_table(tester.baseline_file)
    # check_like=True: the comparison ignores row/column ordering.
    assert_frame_equal(test, baseline, check_like=True)
-    def test_noargs(self):
+def test_noargs():
    """Run wikiq without extra arguments on the 7z SAILORMOON dump and compare with the baseline.

    Fails with wikiq's stderr if the subprocess exits with an error.
    """
    tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")
    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_table(tester.output)
    baseline = pd.read_table(tester.baseline_file)
    # check_like=True: the comparison ignores row/column ordering.
    assert_frame_equal(test, baseline, check_like=True)
-    def test_collapse_user(self):
+def test_collapse_user():
    """Run wikiq with ``--collapse-user --fandom-2020`` on the 7z SAILORMOON dump.

    Fails with wikiq's stderr if the subprocess exits with an error, then
    compares the output table with the stored baseline.
    """
    tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
    try:
        tester.call_wikiq("--collapse-user", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_table(tester.output)
    baseline = pd.read_table(tester.baseline_file)
    # check_like=True: the comparison ignores row/column ordering.
    assert_frame_equal(test, baseline, check_like=True)
-    def test_pwr_wikidiff2(self):
+def test_pwr_wikidiff2():
    tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")
    try:
        tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
-        test = pd.read_table(tester.output)
+    test = pd.read_parquet(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
+    baseline = pd.read_parquet(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
-    def test_pwr_segment(self):
+def test_pwr_segment():
    """Run wikiq with ``--persistence segment`` on the 7z SAILORMOON dump.

    Fails with wikiq's stderr if the subprocess exits with an error, then
    compares the output table with the stored baseline.
    """
    tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
    try:
        tester.call_wikiq("--persistence segment", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_table(tester.output)
    baseline = pd.read_table(tester.baseline_file)
    # check_like=True: the comparison ignores row/column ordering.
    assert_frame_equal(test, baseline, check_like=True)
-    def test_pwr_legacy(self):
+def test_pwr_legacy():
    """Run wikiq with ``--persistence legacy`` on the 7z SAILORMOON dump.

    Fails with wikiq's stderr if the subprocess exits with an error, then
    compares the output table with the stored baseline.
    """
    tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
    try:
        tester.call_wikiq("--persistence legacy", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_table(tester.output)
    baseline = pd.read_table(tester.baseline_file)
    # check_like=True: the comparison ignores row/column ordering.
    assert_frame_equal(test, baseline, check_like=True)
-    def test_pwr(self):
+def test_pwr():
    """Run wikiq with a bare ``--persistence`` flag on the 7z SAILORMOON dump.

    Fails with wikiq's stderr if the subprocess exits with an error, then
    compares the output table with the stored baseline.
    """
    tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
    try:
        tester.call_wikiq("--persistence", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_table(tester.output)
    baseline = pd.read_table(tester.baseline_file)
@ -243,9 +249,39 @@ class WikiqTestCase(unittest.TestCase):
    # Put the output columns into sorted-name order before comparing.
    test = test.reindex(columns=sorted(test.columns))
    assert_frame_equal(test, baseline, check_like=True)
-    def test_malformed_noargs(self):
+def test_diff():
    tester = WikiqTester(SAILORMOON, "diff", in_compression="7z", out_format='parquet', baseline_format='parquet')
    try:
        tester.call_wikiq("--diff", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
    baseline = pd.read_parquet(tester.baseline_file)
    test = test.reindex(columns=sorted(test.columns))
    assert_frame_equal(test, baseline, check_like=True)
 def test_text():
    """Run wikiq with ``--diff --text`` on the 7z SAILORMOON dump using parquet files.

    Fails with wikiq's stderr if the subprocess exits with an error. The result
    is read from ``<output>/<SAILORMOON>.parquet`` and compared with the
    parquet baseline after sorting the result's columns by name.
    """
    tester = WikiqTester(
        SAILORMOON,
        "text",
        in_compression="7z",
        out_format="parquet",
        baseline_format="parquet",
    )
    try:
        tester.call_wikiq("--diff", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))
    # The parquet file is looked up inside the tester's output path.
    parquet_path = tester.output + f"/{SAILORMOON}.parquet"
    actual = pd.read_parquet(parquet_path)
    expected = pd.read_parquet(tester.baseline_file)
    actual = actual.reindex(columns=sorted(actual.columns))
    assert_frame_equal(actual, expected, check_like=True)
 def test_malformed_noargs():
    tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z")
-        want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'
+    want_exception = (
        "xml.etree.ElementTree.ParseError: no element found: line 1369, column 0"
    )
    try:
        tester.call_wikiq()
@ -253,21 +289,23 @@ class WikiqTestCase(unittest.TestCase):
        errlines = exc.stderr.decode("utf8").splitlines()
        self.assertEqual(errlines[-1], want_exception)
    else:
-            self.fail("No exception raised, want: {}".format(want_exception))
+        pytest.fail("No exception raised, want: {}".format(want_exception))
-    def test_stdout_noargs(self):
+def test_stdout_noargs():
    """Run wikiq with ``--stdout`` on the 7z SAILORMOON dump and compare with the baseline.

    The table is parsed from the captured stdout of the call instead of an
    output file. Fails with wikiq's stderr if the subprocess exits with an
    error.
    """
    tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
    try:
        # out=False: the command is built without the "-o <output>" arguments.
-            outs = tester.call_wikiq("--stdout", "--fandom-2020", out=False).decode("utf8")
+        outs = tester.call_wikiq("--stdout", "--fandom-2020", out=False).decode(
            "utf8"
        )
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    test = pd.read_table(StringIO(outs))
    baseline = pd.read_table(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
-    def test_bad_regex(self):
+def test_bad_regex():
    tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex")
    # sample arguments for checking that bad arguments get terminated / test_regex_arguments
@ -280,7 +318,7 @@ class WikiqTestCase(unittest.TestCase):
        "-CP '(Tamil|Li)' -RPl testlabel",
        # regex is missing
        "-CPl testlabel",
-            "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
+        "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'",
    ]
    for arguments in bad_arguments_list:
@ -290,15 +328,15 @@ class WikiqTestCase(unittest.TestCase):
            # we want to check that the bad arguments were caught and sys.exit is stopping the code
            print(exc.stderr.decode("utf-8"))
        else:
-                self.fail("No exception raised, want Exception")
+            pytest.fail("No exception raised, want Exception")
-    def test_good_regex(self):
+def test_good_regex():
    # sample arguments for checking the outcomes of good arguments / test_basic_regex
    good_arguments_list = [
        "-RP '\\b\\d{3}\\b' -RPl threedigits",
        "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
        "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
-            "-CP 'WP:EVADE' -CPl wp_evade"
+        "-CP 'WP:EVADE' -CPl wp_evade",
    ]
    for i, arguments in enumerate(good_arguments_list):
@ -307,7 +345,7 @@ class WikiqTestCase(unittest.TestCase):
        try:
            tester.call_wikiq(arguments)
        except subprocess.CalledProcessError as exc:
-                self.fail(exc.stderr.decode("utf8"))
+            pytest.fail(exc.stderr.decode("utf8"))
        test = pd.read_table(tester.output)
@ -315,32 +353,34 @@ class WikiqTestCase(unittest.TestCase):
        assert_frame_equal(test, baseline, check_like=True)
        print(i)
-    def test_capturegroup_regex(self):
+def test_capturegroup_regex():
    """Run wikiq once per argument string that uses named capture groups.

    Each run gets its own tester, with the argument's list index as the
    suffix, and its output table is compared with the matching baseline.
    Fails with wikiq's stderr if a subprocess exits with an error.
    """
    cap_arguments_list = [
        "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
-            "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"
+        "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov",
    ]
    for i, arguments in enumerate(cap_arguments_list):
-            tester = WikiqTester(wiki=REGEXTEST, case_name="capturegroup", suffix=str(i))
+        tester = WikiqTester(
            wiki=REGEXTEST, case_name="capturegroup", suffix=str(i)
        )
        try:
            tester.call_wikiq(arguments)
        except subprocess.CalledProcessError as exc:
-                self.fail(exc.stderr.decode("utf8"))
+            pytest.fail(exc.stderr.decode("utf8"))
        test = pd.read_table(tester.output)
        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)
-    def test_parquet(self):
+def test_parquet():
    tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")
    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+        pytest.fail(exc.stderr.decode("utf8"))
    # as a test let's make sure that we get equal data frames
    test: DataFrame = pd.read_parquet(tester.output)
@ -349,28 +389,26 @@ class WikiqTestCase(unittest.TestCase):
    baseline: DataFrame = pd.read_table(tester.baseline_file)
    # Pandas does not read timestamps as the desired datetime type.
-        baseline['date_time'] = pd.to_datetime(baseline['date_time'])
+    baseline["date_time"] = pd.to_datetime(baseline["date_time"])
    # Split strings to the arrays of reverted IDs so they can be compared.
-        baseline['revert'] = baseline['revert'].replace(np.nan, None)
+    baseline["revert"] = baseline["revert"].replace(np.nan, None)
-        baseline['reverteds'] = baseline['reverteds'].replace(np.nan, None)
+    baseline["reverteds"] = baseline["reverteds"].replace(np.nan, None)
    # baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']]
-        baseline['sha1'] = baseline['sha1'].replace(np.nan, None)
+    baseline["sha1"] = baseline["sha1"].replace(np.nan, None)
-        baseline['editor'] = baseline['editor'].replace(np.nan, None)
+    baseline["editor"] = baseline["editor"].replace(np.nan, None)
-        baseline['anon'] = baseline['anon'].replace(np.nan, None)
+    baseline["anon"] = baseline["anon"].replace(np.nan, None)
    for index, row in baseline.iterrows():
-            if row['revert'] != test['revert'][index]:
+        if row["revert"] != test["revert"][index]:
-                print(row['revid'], ":", row['revert'], "!=", test['revert'][index])
+            print(row["revid"], ":", row["revert"], "!=", test["revert"][index])
    for col in baseline.columns:
        try:
-                assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
+            assert_series_equal(
                test[col], baseline[col], check_like=True, check_dtype=False
            )
        except ValueError as exc:
            print(f"Error comparing column {col}")
-                self.fail(exc)
+            pytest.fail(exc)
    # assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
 if __name__ == '__main__':
    unittest.main()
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@ -16,10 +16,10 @@ import pywikidiff2
 class DiffToOperationMap:
    def __init__(self, diff, tokenizer):
        self.tokenizer = tokenizer
        self.diff = json.loads(diff)
        self.from_par_move_dict = {}
        self.to_par_move_dict = {}
        self.highlights_without_offset = []
        self.diff = diff
        # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
        self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
        self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
@ -375,7 +375,7 @@ class WikiDiffMatcher:
            numContextLines=1000000, moved_paragraph_detection_cutoff=200000
        )
        # Pre-compute diffs to reduce traffic overhead.
-        self.diffs = differ.inline_json_diff_sequence(list(texts))
+        self.diffs = [json.loads(diff) for diff in differ.inline_json_diff_sequence(list(texts))]
        self.tokenizer = tokenizer or TOKENIZER
    class Processor(DiffEngine.Processor):
--- a/48
+++ b/48
@ -208,6 +208,8 @@ class WikiqParser:
                 regex_match_comment: list[str],
                 regex_revision_label: list[str],
                 regex_comment_label: list[str],
                 text: bool = False,
                 diff: bool = False,
                 collapse_user: bool = False,
                 persist: int = None,
                 namespaces: Union[list[int], None] = None,
@ -226,7 +228,8 @@ class WikiqParser:
        self.persist: int = persist
        self.namespaces = []
        self.revert_radius = revert_radius
-
+        self.diff = diff
        self.text = text
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
@ -331,6 +334,9 @@ class WikiqParser:
            tables.RevisionIsAnon(),
        ])
        if self.text:
            table.columns.append(tables.RevisionText())
        if self.collapse_user:
            table.columns.append(tables.RevisionCollapsed())
@ -345,6 +351,10 @@ class WikiqParser:
        schema = table.schema()
        schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
        if self.diff:
            from diff_pyarrow_schema import diff_field
            schema = schema.append(diff_field)
        # Add regex fields to the schema.
        for pair in self.regex_revision_pairs:
            for field in pair.get_pyarrow_fields():
@ -412,9 +422,18 @@ class WikiqParser:
                revision_texts.append(rev.text)
            wikidiff_matcher = None
            if self.diff or self.persist == PersistMethod.wikidiff2:
                wikidiff_matcher = WikiDiffMatcher(revision_texts,
                                                   tokenizer=wikitext_split,
                                                   )
            # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
            row_buffer = table.pop()
            if self.diff:
                row_buffer['diff'] = [[entry for entry in wikidiff_matcher.diffs[i]['diff'] if entry['type'] != 0 ] for i in range(len(revision_texts))]
            is_revert_column: list[Union[bool, None]] = []
            for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
                if self.revert_radius == 0 or d:
@ -428,6 +447,7 @@ class WikiqParser:
                row_buffer[k] = v
                regex_matches = {}
            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)
@ -443,9 +463,8 @@ class WikiqParser:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                elif self.persist == PersistMethod.wikidiff2:
-                    state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
+
-                                                                    tokenizer=wikitext_split,
+                    state = mwpersistence.DiffState(wikidiff_matcher,
                                                                    ),
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence
@ -469,8 +488,6 @@ class WikiqParser:
                        row_buffer['tokens_removed'].append(len(old_tokens_removed))
                        row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1)
                del row_buffer['text']
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
@ -485,6 +502,9 @@ class WikiqParser:
                    row_buffer['tokens_removed'].append(len(tokens_removed))
                    row_buffer['tokens_window'].append(len(window) - (i + 1))
                if not self.text:
                    del row_buffer['text']
            writer.write(pa.table(row_buffer, schema=schema))
            page_count += 1
@ -494,7 +514,6 @@ class WikiqParser:
        writer.close()
 def match_archive_suffix(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename]
@ -580,6 +599,14 @@ def main():
                        action='append',
                        help="The label for the outputted column based on matching the regex in comments.")
    parser.add_argument('-d', '--diff', dest="diff", default=False,
                        action='store_true',
                        help="Output a diff structure for each revision with information about changed or moved lines.")
    parser.add_argument('-t', '--text', dest="text", default=False,
                        action='store_true',
                        help="Output the text of the revision.")
    parser.add_argument('--fandom-2020', dest="fandom_2020",
                        action='store_true',
                        help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
@ -604,6 +631,7 @@ def main():
    else:
        namespaces = None
    print(args, file=sys.stderr)
    if len(args.dumpfiles) > 0:
        for filename in args.dumpfiles:
            input_file = open_input_file(filename, args.fandom_2020)
@ -637,6 +665,8 @@ def main():
                                regex_revision_label=args.regex_revision_label,
                                regex_match_comment=args.regex_match_comment,
                                regex_comment_label=args.regex_comment_label,
                                text=args.text,
                                diff=args.diff,
                                output_parquet=output_parquet,
                                )
@ -656,7 +686,9 @@ def main():
                            regex_match_revision=args.regex_match_revision,
                            regex_revision_label=args.regex_revision_label,
                            regex_match_comment=args.regex_match_comment,
-                            regex_comment_label=args.regex_comment_label)
+                            regex_comment_label=args.regex_comment_label,
                            diff=args.diff,
                            text=args.text)
        wikiq.process()