add (optional) diff and text columns to output.

2025-07-07 14:39:52 -07:00
parent a8e9e7f4fd
commit d6c4c0a416
4 changed files with 373 additions and 295 deletions
--- a/tables.py
+++ b/tables.py
@@ -218,3 +218,11 @@ class RevisionCollapsed(RevisionField[int]):

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
        return len(revisions)
+
+
+class RevisionText(RevisionField[str]):
+    """Column holding the text of the last revision in each group of revisions."""
+    field = pa.field("text", pa.string(), nullable=False)
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
+        return revisions[-1].text
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -1,15 +1,14 @@
-import shutil
-import unittest
 import os
+import shutil
 import subprocess
-
+import tracemalloc
+from io import StringIO
+from typing import Final, Union
+import pytest
 import numpy as np
 import pandas as pd
 from pandas import DataFrame
 from pandas.testing import assert_frame_equal, assert_series_equal
-from io import StringIO
-import tracemalloc
-from typing import Final, Union

 # Make references to files and wikiq relative to this file, not to the current working directory.
 TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
@@ -37,21 +36,26 @@ setup()


 class WikiqTester:
-    def __init__(self,
-                 wiki: str,
-                 case_name: str,
-                 suffix: Union[str, None] = None,
-                 in_compression: str = "bz2",
-                 baseline_format: str = "tsv",
-                 out_format: str = "tsv",
-                 ):
-        self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression))
+    def __init__(
+        self,
+        wiki: str,
+        case_name: str,
+        suffix: Union[str, None] = None,
+        in_compression: str = "bz2",
+        baseline_format: str = "tsv",
+        out_format: str = "tsv",
+    ):
+        self.input_file = os.path.join(
+            TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)
+        )

        basename = "{0}_{1}".format(case_name, wiki)
        if suffix:
            basename = "{0}_{1}".format(basename, suffix)

-        self.output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format))
+        self.output = os.path.join(
+            TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format)
+        )

        if os.path.exists(self.output):
            if os.path.isfile(self.output):
@@ -61,17 +65,21 @@ class WikiqTester:

        if out_format == "parquet":
            os.makedirs(self.output, exist_ok=True)
-
+            
        if suffix is None:
            self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)
            self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
        else:
-            self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format)
+            self.wikiq_baseline_name = "{0}_{1}.{2}".format(
+                wiki, suffix, baseline_format
+            )
            self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)

        # If case_name is unset, there are no relevant baseline or test files.
        if case_name is not None:
-            self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name))
+            self.baseline_file = os.path.join(
+                BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)
+            )

    def call_wikiq(self, *args: str, out: bool = True):
        """
@@ -81,296 +89,326 @@ class WikiqTester:
        :return: The output of the wikiq call.
        """
        if out:
-            call = ' '.join([WIKIQ, self.input_file, "-o", self.output, *args])
+            call = " ".join([WIKIQ, self.input_file, "-o", self.output, *args])
        else:
-            call = ' '.join([WIKIQ, self.input_file, *args])
+            call = " ".join([WIKIQ, self.input_file, *args])

        print(call)
        return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)

+    # with / without pwr DONE
+    # with / without url encode DONE
+    # with / without collapse user DONE
+    # with output to stdout DONE
+    # note that the persistence radius is 7 by default
+    # reading various file formats including
+    #        7z, gz, bz2, xml  DONE
+    # wikia and wikipedia data DONE
+    # malformed xmls DONE

-# with / without pwr DONE
-# with / without url encode DONE
-# with / without collapse user DONE
-# with output to stdout DONE
-# note that the persistence radius is 7 by default
-# reading various file formats including
-#        7z, gz, bz2, xml  DONE
-# wikia and wikipedia data DONE
-# malformed xmls DONE
+def test_WP_noargs():
+    """Run wikiq on the IKWIKI dump with no extra flags and compare with the TSV baseline."""
+    tester = WikiqTester(IKWIKI, "noargs")
+    try:
+        tester.call_wikiq()
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_table(tester.output)
+    expected = pd.read_table(tester.baseline_file)
+    assert_frame_equal(actual, expected, check_like=True)

-class WikiqTestCase(unittest.TestCase):
-    def test_WP_noargs(self):
-        tester = WikiqTester(IKWIKI, "noargs")
+def test_WP_namespaces():
+    """Restrict wikiq to namespaces 0 and 1 and compare with the TSV baseline."""
+    tester = WikiqTester(IKWIKI, "namespaces")
+
+    try:
+        tester.call_wikiq("-n 0", "-n 1")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_table(tester.output)
+    # Module-level pytest function: there is no `self`, so use a plain assert.
+    assert sum(~test.namespace.isin({0, 1})) == 0
+    baseline = pd.read_table(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_WP_revert_radius():
+    """Run wikiq on namespaces 0 and 1 with a revert radius of 1 and compare with the TSV baseline."""
+    tester = WikiqTester(IKWIKI, "revert_radius")
+
+    try:
+        tester.call_wikiq("-n 0", "-n 1", "-rr 1")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_table(tester.output)
+    # Module-level pytest function: there is no `self`, so use a plain assert.
+    assert sum(~test.namespace.isin({0, 1})) == 0
+    baseline = pd.read_table(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_WP_no_revert_radius():
+    """Run wikiq with a revert radius of 0 (-rr 0) and compare with the TSV baseline."""
+    tester = WikiqTester(IKWIKI, "no_revert_radius")
+    try:
+        tester.call_wikiq("-rr 0")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_table(tester.output)
+    # NOTE(review): read_table loads empty cells as NaN, not None, so this count is presumably always 0 -- confirm intent.
+    num_reverted = sum(i is None for i in test.revert)
+    assert num_reverted == 0  # plain assert: there is no `self` in a module-level pytest function
+    baseline = pd.read_table(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_WP_collapse_user():
+    """Run wikiq with --collapse-user on the IKWIKI dump and compare with the TSV baseline."""
+    tester = WikiqTester(IKWIKI, "collapse_user")
+    try:
+        tester.call_wikiq("--collapse-user")
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_table(tester.output)
+    expected = pd.read_table(tester.baseline_file)
+    assert_frame_equal(actual, expected, check_like=True)
+
+def test_noargs():
+    """Run wikiq on the 7z-compressed SAILORMOON dump with no extra flags and compare with the TSV baseline."""
+    tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")
+    try:
+        tester.call_wikiq()
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_table(tester.output)
+    expected = pd.read_table(tester.baseline_file)
+    assert_frame_equal(actual, expected, check_like=True)
+        
+def test_collapse_user():
+    """Run wikiq with --collapse-user and --fandom-2020 on the SAILORMOON dump and compare with the TSV baseline."""
+    tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
+    try:
+        tester.call_wikiq("--collapse-user", "--fandom-2020")
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_table(tester.output)
+    expected = pd.read_table(tester.baseline_file)
+    assert_frame_equal(actual, expected, check_like=True)
+
+def test_pwr_wikidiff2():
+    """Run wikiq with "--persistence wikidiff2" on the SAILORMOON dump and compare with the TSV baseline."""
+    tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")
+    try:
+        tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+    # The tester uses the default "tsv" output and baseline formats, so read both as tables, not parquet.
+    test = pd.read_table(tester.output)
+    baseline = pd.read_table(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_pwr_segment():
+    """Run wikiq with "--persistence segment" on the SAILORMOON dump and compare with the TSV baseline."""
+    tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
+    try:
+        tester.call_wikiq("--persistence segment", "--fandom-2020")
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_table(tester.output)
+    expected = pd.read_table(tester.baseline_file)
+    assert_frame_equal(actual, expected, check_like=True)
+
+def test_pwr_legacy():
+    """Run wikiq with "--persistence legacy" on the SAILORMOON dump and compare with the TSV baseline."""
+    tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
+    try:
+        tester.call_wikiq("--persistence legacy", "--fandom-2020")
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_table(tester.output)
+    expected = pd.read_table(tester.baseline_file)
+    assert_frame_equal(actual, expected, check_like=True)
+
+def test_pwr():
+    """Run wikiq with --persistence on the SAILORMOON dump and compare with the TSV baseline."""
+    tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
+
+    try:
+        tester.call_wikiq("--persistence", "--fandom-2020")
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_table(tester.output)
+    expected = pd.read_table(tester.baseline_file)
+    # Put the output columns in sorted-name order before comparing.
+    assert_frame_equal(actual.reindex(columns=sorted(actual.columns)), expected, check_like=True)
+
+def test_diff():
+    """Run wikiq with --diff, writing parquet output, and compare with the parquet baseline."""
+    tester = WikiqTester(SAILORMOON, "diff", in_compression="7z", out_format='parquet', baseline_format='parquet')
+
+    try:
+        tester.call_wikiq("--diff", "--fandom-2020")
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+    expected = pd.read_parquet(tester.baseline_file)
+    # Put the output columns in sorted-name order before comparing.
+    assert_frame_equal(actual.reindex(columns=sorted(actual.columns)), expected, check_like=True)
+
+def test_text():
+    """Run wikiq with --diff and --text, writing parquet output, and compare with the parquet baseline."""
+    tester = WikiqTester(SAILORMOON, "text", in_compression="7z", out_format='parquet', baseline_format='parquet')
+
+    try:
+        tester.call_wikiq("--diff", "--text","--fandom-2020")
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+    expected = pd.read_parquet(tester.baseline_file)
+    # Put the output columns in sorted-name order before comparing.
+    assert_frame_equal(actual.reindex(columns=sorted(actual.columns)), expected, check_like=True)
+
+def test_malformed_noargs():
+    """Check that wikiq fails on the TWINPEAKS dump and that its last stderr line is the expected ParseError."""
+    tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z")
+    want_exception = (
+        "xml.etree.ElementTree.ParseError: no element found: line 1369, column 0"
+    )
+    try:
+        tester.call_wikiq()
+    except subprocess.CalledProcessError as exc:
+        errlines = exc.stderr.decode("utf8").splitlines()
+        assert errlines[-1] == want_exception  # plain assert: there is no `self` at module level
+    else:
+        pytest.fail("No exception raised, want: {}".format(want_exception))
+
+def test_stdout_noargs():
+    """Capture what wikiq writes with --stdout and compare it with the TSV baseline."""
+    tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
+
+    try:
+        raw = tester.call_wikiq("--stdout", "--fandom-2020", out=False)
+        outs = raw.decode("utf8")
+    except subprocess.CalledProcessError as err:
+        pytest.fail(err.stderr.decode("utf8"))
+
+    actual = pd.read_table(StringIO(outs))
+    expected = pd.read_table(tester.baseline_file)
+    assert_frame_equal(actual, expected, check_like=True)
+
+def test_bad_regex():
+    tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex")
+
+    # sample arguments for checking that bad arguments get terminated / test_regex_arguments
+    bad_arguments_list = [
+        # label is missing
+        "-RP '\\b\\d+\\b'",
+        # number of reg and number of labels do not match
+        "-RP 'NPO V' -RP THE -RPl testlabel",
+        # cp but rp label
+        "-CP '(Tamil|Li)' -RPl testlabel",
+        # regex is missing
+        "-CPl testlabel",
+        "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'",
+    ]
+
+    for arguments in bad_arguments_list:
        try:
-            tester.call_wikiq()
+            tester.call_wikiq("--stdout", arguments, out=False)
        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_WP_namespaces(self):
-        tester = WikiqTester(IKWIKI, "namespaces")
-
-        try:
-            tester.call_wikiq("-n 0", "-n 1")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.output)
-        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
-        self.assertEqual(num_wrong_ns, 0)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_WP_revert_radius(self):
-        tester = WikiqTester(IKWIKI, "revert_radius")
-
-        try:
-            tester.call_wikiq("-n 0", "-n 1", "-rr 1")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.output)
-        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
-        self.assertEqual(num_wrong_ns, 0)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_WP_no_revert_radius(self):
-        tester = WikiqTester(IKWIKI, "no_revert_radius")
-
-        try:
-            tester.call_wikiq("-rr 0")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.output)
-        num_reverted = sum(i is None for i in test.revert)
-        self.assertEqual(num_reverted, 0)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_WP_collapse_user(self):
-        tester = WikiqTester(IKWIKI, "collapse_user")
-
-        try:
-            tester.call_wikiq("--collapse-user")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_noargs(self):
-        tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")
-
-        try:
-            tester.call_wikiq()
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_collapse_user(self):
-        tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
-
-        try:
-            tester.call_wikiq("--collapse-user", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_pwr_wikidiff2(self):
-        tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")
-
-        try:
-            tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_pwr_segment(self):
-        tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
-
-        try:
-            tester.call_wikiq("--persistence segment", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_pwr_legacy(self):
-        tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
-
-        try:
-            tester.call_wikiq("--persistence legacy", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_pwr(self):
-        tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
-
-        try:
-            tester.call_wikiq("--persistence", "--fandom-2020")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-
-        test = test.reindex(columns=sorted(test.columns))
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_malformed_noargs(self):
-        tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z")
-        want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0'
-
-        try:
-            tester.call_wikiq()
-        except subprocess.CalledProcessError as exc:
-            errlines = exc.stderr.decode("utf8").splitlines()
-            self.assertEqual(errlines[-1], want_exception)
+            # we want to check that the bad arguments were caught and sys.exit is stopping the code
+            print(exc.stderr.decode("utf-8"))
        else:
-            self.fail("No exception raised, want: {}".format(want_exception))
+            pytest.fail("No exception raised, want Exception")

-    def test_stdout_noargs(self):
-        tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
+def test_good_regex():
+    # sample arguments for checking the outcomes of good arguments / test_basic_regex
+    good_arguments_list = [
+        "-RP '\\b\\d{3}\\b' -RPl threedigits",
+        "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
+        "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
+        "-CP 'WP:EVADE' -CPl wp_evade",
+    ]
+
+    for i, arguments in enumerate(good_arguments_list):
+        tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i))

        try:
-            outs = tester.call_wikiq("--stdout", "--fandom-2020", out=False).decode("utf8")
+            tester.call_wikiq(arguments)
        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+            pytest.fail(exc.stderr.decode("utf8"))
+
+        test = pd.read_table(tester.output)
+
+        baseline = pd.read_table(tester.baseline_file)
+        assert_frame_equal(test, baseline, check_like=True)
+        print(i)
+
+def test_capturegroup_regex():
+    cap_arguments_list = [
+        "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
+        "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov",
+    ]
+
+    for i, arguments in enumerate(cap_arguments_list):
+        tester = WikiqTester(
+            wiki=REGEXTEST, case_name="capturegroup", suffix=str(i)
+        )
+
+        try:
+            tester.call_wikiq(arguments)
+        except subprocess.CalledProcessError as exc:
+            pytest.fail(exc.stderr.decode("utf8"))
+
+        test = pd.read_table(tester.output)

-        test = pd.read_table(StringIO(outs))
        baseline = pd.read_table(tester.baseline_file)
        assert_frame_equal(test, baseline, check_like=True)

-    def test_bad_regex(self):
-        tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex")
+def test_parquet():
+    tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")

-        # sample arguments for checking that bad arguments get terminated / test_regex_arguments
-        bad_arguments_list = [
-            # label is missing
-            "-RP '\\b\\d+\\b'",
-            # number of reg and number of labels do not match
-            "-RP 'NPO V' -RP THE -RPl testlabel",
-            # cp but rp label
-            "-CP '(Tamil|Li)' -RPl testlabel",
-            # regex is missing
-            "-CPl testlabel",
-            "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'"
-        ]
+    try:
+        tester.call_wikiq()
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))

-        for arguments in bad_arguments_list:
-            try:
-                tester.call_wikiq("--stdout", arguments, out=False)
-            except subprocess.CalledProcessError as exc:
-                # we want to check that the bad arguments were caught and sys.exit is stopping the code
-                print(exc.stderr.decode("utf-8"))
-            else:
-                self.fail("No exception raised, want Exception")
+    # as a test let's make sure that we get equal data frames
+    test: DataFrame = pd.read_parquet(tester.output)
+    # test = test.drop(['reverteds'], axis=1)

-    def test_good_regex(self):
-        # sample arguments for checking the outcomes of good arguments / test_basic_regex
-        good_arguments_list = [
-            "-RP '\\b\\d{3}\\b' -RPl threedigits",
-            "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
-            "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
-            "-CP 'WP:EVADE' -CPl wp_evade"
-        ]
+    baseline: DataFrame = pd.read_table(tester.baseline_file)

-        for i, arguments in enumerate(good_arguments_list):
-            tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i))
+    # Pandas does not read timestamps as the desired datetime type.
+    baseline["date_time"] = pd.to_datetime(baseline["date_time"])
+    # Split strings to the arrays of reverted IDs so they can be compared.
+    baseline["revert"] = baseline["revert"].replace(np.nan, None)
+    baseline["reverteds"] = baseline["reverteds"].replace(np.nan, None)
+    # baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']]
+    baseline["sha1"] = baseline["sha1"].replace(np.nan, None)
+    baseline["editor"] = baseline["editor"].replace(np.nan, None)
+    baseline["anon"] = baseline["anon"].replace(np.nan, None)

-            try:
-                tester.call_wikiq(arguments)
-            except subprocess.CalledProcessError as exc:
-                self.fail(exc.stderr.decode("utf8"))
-
-            test = pd.read_table(tester.output)
-
-            baseline = pd.read_table(tester.baseline_file)
-            assert_frame_equal(test, baseline, check_like=True)
-            print(i)
-
-    def test_capturegroup_regex(self):
-        cap_arguments_list = [
-            "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
-            "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov"
-        ]
-
-        for i, arguments in enumerate(cap_arguments_list):
-            tester = WikiqTester(wiki=REGEXTEST, case_name="capturegroup", suffix=str(i))
-
-            try:
-                tester.call_wikiq(arguments)
-            except subprocess.CalledProcessError as exc:
-                self.fail(exc.stderr.decode("utf8"))
-
-            test = pd.read_table(tester.output)
-
-            baseline = pd.read_table(tester.baseline_file)
-            assert_frame_equal(test, baseline, check_like=True)
-
-    def test_parquet(self):
-        tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")
+    for index, row in baseline.iterrows():
+        if row["revert"] != test["revert"][index]:
+            print(row["revid"], ":", row["revert"], "!=", test["revert"][index])

+    for col in baseline.columns:
        try:
-            tester.call_wikiq()
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
+            assert_series_equal(
+                test[col], baseline[col], check_like=True, check_dtype=False
+            )
+        except ValueError as exc:
+            print(f"Error comparing column {col}")
+            pytest.fail(exc)

-        # as a test let's make sure that we get equal data frames
-        test: DataFrame = pd.read_parquet(tester.output)
-        # test = test.drop(['reverteds'], axis=1)
-
-        baseline: DataFrame = pd.read_table(tester.baseline_file)
-
-        # Pandas does not read timestamps as the desired datetime type.
-        baseline['date_time'] = pd.to_datetime(baseline['date_time'])
-        # Split strings to the arrays of reverted IDs so they can be compared.
-        baseline['revert'] = baseline['revert'].replace(np.nan, None)
-        baseline['reverteds'] = baseline['reverteds'].replace(np.nan, None)
-        # baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']]
-        baseline['sha1'] = baseline['sha1'].replace(np.nan, None)
-        baseline['editor'] = baseline['editor'].replace(np.nan, None)
-        baseline['anon'] = baseline['anon'].replace(np.nan, None)
-
-        for index, row in baseline.iterrows():
-            if row['revert'] != test['revert'][index]:
-                print(row['revid'], ":", row['revert'], "!=", test['revert'][index])
-
-        for col in baseline.columns:
-            try:
-                assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False)
-            except ValueError as exc:
-                print(f"Error comparing column {col}")
-                self.fail(exc)
-
-        # assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
-
-
-if __name__ == '__main__':
-    unittest.main()
+    # assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@@ -16,10 +16,10 @@ import pywikidiff2
 class DiffToOperationMap:
    def __init__(self, diff, tokenizer):
        self.tokenizer = tokenizer
-        self.diff = json.loads(diff)
        self.from_par_move_dict = {}
        self.to_par_move_dict = {}
        self.highlights_without_offset = []
+        self.diff = diff
        # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
        self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
        self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
@@ -375,7 +375,7 @@ class WikiDiffMatcher:
            numContextLines=1000000, moved_paragraph_detection_cutoff=200000
        )
        # Pre-compute diffs to reduce traffic overhead.
-        self.diffs = differ.inline_json_diff_sequence(list(texts))
+        self.diffs = [json.loads(diff) for diff in differ.inline_json_diff_sequence(list(texts))]
        self.tokenizer = tokenizer or TOKENIZER

    class Processor(DiffEngine.Processor):
--- a/48
+++ b/48
@@ -208,6 +208,8 @@ class WikiqParser:
                 regex_match_comment: list[str],
                 regex_revision_label: list[str],
                 regex_comment_label: list[str],
+                 text: bool = False,
+                 diff: bool = False,
                 collapse_user: bool = False,
                 persist: int = None,
                 namespaces: Union[list[int], None] = None,
@@ -226,7 +228,8 @@ class WikiqParser:
        self.persist: int = persist
        self.namespaces = []
        self.revert_radius = revert_radius
-
+        self.diff = diff
+        self.text = text
        if namespaces is not None:
            self.namespace_filter = set(namespaces)
        else:
@@ -331,6 +334,9 @@ class WikiqParser:
            tables.RevisionIsAnon(),
        ])

+        if self.text:
+            table.columns.append(tables.RevisionText())
+
        if self.collapse_user:
            table.columns.append(tables.RevisionCollapsed())

@@ -345,6 +351,10 @@ class WikiqParser:
        schema = table.schema()
        schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))

+        if self.diff:
+            from diff_pyarrow_schema import diff_field
+            schema = schema.append(diff_field)
+
        # Add regex fields to the schema.
        for pair in self.regex_revision_pairs:
            for field in pair.get_pyarrow_fields():
@@ -412,9 +422,18 @@ class WikiqParser:

                revision_texts.append(rev.text)

+            wikidiff_matcher = None
+            if self.diff or self.persist == PersistMethod.wikidiff2:
+                wikidiff_matcher = WikiDiffMatcher(revision_texts,
+                                                   tokenizer=wikitext_split,
+                                                   )
+
            # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
            row_buffer = table.pop()

+            if self.diff:
+                row_buffer['diff'] = [[entry for entry in wikidiff_matcher.diffs[i]['diff'] if entry['type'] != 0 ] for i in range(len(revision_texts))]
+
            is_revert_column: list[Union[bool, None]] = []
            for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
                if self.revert_radius == 0 or d:
@@ -428,6 +447,7 @@ class WikiqParser:
                row_buffer[k] = v
                regex_matches = {}

+
            if self.persist != PersistMethod.none:
                window = deque(maxlen=PERSISTENCE_RADIUS)

@@ -443,9 +463,8 @@ class WikiqParser:
                    state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
                                                    revert_radius=PERSISTENCE_RADIUS)
                elif self.persist == PersistMethod.wikidiff2:
-                    state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
-                                                                    tokenizer=wikitext_split,
-                                                                    ),
+
+                    state = mwpersistence.DiffState(wikidiff_matcher,
                                                    revert_radius=PERSISTENCE_RADIUS)
                else:
                    from mw.lib import persistence
@@ -469,8 +488,6 @@ class WikiqParser:
                        row_buffer['tokens_removed'].append(len(old_tokens_removed))
                        row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1)

-                del row_buffer['text']
-
                # print out metadata for the last RADIUS revisions
                for i, item in enumerate(window):
                    # if the window was full, we've already printed item 0
@@ -485,6 +502,9 @@ class WikiqParser:
                    row_buffer['tokens_removed'].append(len(tokens_removed))
                    row_buffer['tokens_window'].append(len(window) - (i + 1))

+                if not self.text:
+                    del row_buffer['text']
+
            writer.write(pa.table(row_buffer, schema=schema))

            page_count += 1
@@ -494,7 +514,6 @@ class WikiqParser:

        writer.close()

-
 def match_archive_suffix(input_filename):
    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename]
@@ -580,6 +599,14 @@ def main():
                        action='append',
                        help="The label for the outputted column based on matching the regex in comments.")

+    parser.add_argument('-d', '--diff', dest="diff", default=False,
+                        action='store_true',
+                        help="Output a diff structure for each revision with information about changed or moved lines.")
+
+    parser.add_argument('-t', '--text', dest="text", default=False,
+                        action='store_true',
+                        help="Output the text of the revision.")
+
    parser.add_argument('--fandom-2020', dest="fandom_2020",
                        action='store_true',
                        help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
@@ -604,6 +631,7 @@ def main():
    else:
        namespaces = None

+    print(args, file=sys.stderr)
    if len(args.dumpfiles) > 0:
        for filename in args.dumpfiles:
            input_file = open_input_file(filename, args.fandom_2020)
@@ -637,6 +665,8 @@ def main():
                                regex_revision_label=args.regex_revision_label,
                                regex_match_comment=args.regex_match_comment,
                                regex_comment_label=args.regex_comment_label,
+                                text=args.text,
+                                diff=args.diff,
                                output_parquet=output_parquet,
                                )

@@ -656,7 +686,9 @@ def main():
                            regex_match_revision=args.regex_match_revision,
                            regex_revision_label=args.regex_revision_label,
                            regex_match_comment=args.regex_match_comment,
-                            regex_comment_label=args.regex_comment_label)
+                            regex_comment_label=args.regex_comment_label,
+                            diff=args.diff,
+                            text=args.text)

        wikiq.process()