From d6c4c0a4167076169b3c036e7d784c438e840763 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Mon, 7 Jul 2025 14:39:52 -0700 Subject: [PATCH] add (optional) diff and text columns to output. --- tables.py | 8 + test/Wikiq_Unit_Test.py | 608 +++++++++++++++++++++------------------- wiki_diff_matcher.py | 4 +- wikiq | 48 +++- 4 files changed, 373 insertions(+), 295 deletions(-) diff --git a/tables.py b/tables.py index 7209465..7df0263 100644 --- a/tables.py +++ b/tables.py @@ -218,3 +218,11 @@ class RevisionCollapsed(RevisionField[int]): def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int: return len(revisions) + + +class RevisionText(RevisionField[str]): + field = pa.field("text", pa.string(), nullable=False) + + def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str: + revision = revisions[-1] + return revision.text diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index 13d62ff..ad57f3a 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -1,15 +1,14 @@ -import shutil -import unittest import os +import shutil import subprocess - +import tracemalloc +from io import StringIO +from typing import Final, Union +import pytest import numpy as np import pandas as pd from pandas import DataFrame from pandas.testing import assert_frame_equal, assert_series_equal -from io import StringIO -import tracemalloc -from typing import Final, Union # Make references to files and wikiq relative to this file, not to the current working directory. TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) @@ -37,21 +36,26 @@ setup() class WikiqTester: - def __init__(self, - wiki: str, - case_name: str, - suffix: Union[str, None] = None, - in_compression: str = "bz2", - baseline_format: str = "tsv", - out_format: str = "tsv", - ): - self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) + def __init__( + self, + wiki: str, + case_name: str, + suffix: Union[str, None] = None, + in_compression: str = "bz2", + baseline_format: str = "tsv", + out_format: str = "tsv", + ): + self.input_file = os.path.join( + TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression) + ) basename = "{0}_{1}".format(case_name, wiki) if suffix: basename = "{0}_{1}".format(basename, suffix) - self.output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format)) + self.output = os.path.join( + TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format) + ) if os.path.exists(self.output): if os.path.isfile(self.output): @@ -61,17 +65,21 @@ class WikiqTester: if out_format == "parquet": os.makedirs(self.output, exist_ok=True) - + if suffix is None: self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format) self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) else: - self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format) + self.wikiq_baseline_name = "{0}_{1}.{2}".format( + wiki, suffix, baseline_format + ) self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) # If case_name is unset, there are no relevant baseline or test files. if case_name is not None: - self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)) + self.baseline_file = os.path.join( + BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name) + ) def call_wikiq(self, *args: str, out: bool = True): """ @@ -81,296 +89,326 @@ class WikiqTester: :return: The output of the wikiq call. 
""" if out: - call = ' '.join([WIKIQ, self.input_file, "-o", self.output, *args]) + call = " ".join([WIKIQ, self.input_file, "-o", self.output, *args]) else: - call = ' '.join([WIKIQ, self.input_file, *args]) + call = " ".join([WIKIQ, self.input_file, *args]) print(call) return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) + # with / without pwr DONE + # with / without url encode DONE + # with / without collapse user DONE + # with output to stdout DONE + # note that the persistence radius is 7 by default + # reading various file formats including + # 7z, gz, bz2, xml DONE + # wikia and wikipedia data DONE + # malformed xmls DONE -# with / without pwr DONE -# with / without url encode DONE -# with / without collapse user DONE -# with output to stdout DONE -# note that the persistence radius is 7 by default -# reading various file formats including -# 7z, gz, bz2, xml DONE -# wikia and wikipedia data DONE -# malformed xmls DONE +def test_WP_noargs(): + tester = WikiqTester(IKWIKI, "noargs") + + try: + tester.call_wikiq() + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) + + test = pd.read_table(tester.output) + baseline = pd.read_table(tester.baseline_file) + assert_frame_equal(test, baseline, check_like=True) -class WikiqTestCase(unittest.TestCase): - def test_WP_noargs(self): - tester = WikiqTester(IKWIKI, "noargs") +def test_WP_namespaces(): + tester = WikiqTester(IKWIKI, "namespaces") + try: + tester.call_wikiq("-n 0", "-n 1") + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) + + # as a test let's make sure that we get equal data frames + test = pd.read_table(tester.output) + num_wrong_ns = sum(~test.namespace.isin({0, 1})) + self.assertEqual(num_wrong_ns, 0) + baseline = pd.read_table(tester.baseline_file) + assert_frame_equal(test, baseline, check_like=True) + +def test_WP_revert_radius(): + tester = WikiqTester(IKWIKI, "revert_radius") + + try: + tester.call_wikiq("-n 0", "-n 1", "-rr 1") + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) + + # as a test let's make sure that we get equal data frames + test = pd.read_table(tester.output) + num_wrong_ns = sum(~test.namespace.isin({0, 1})) + self.assertEqual(num_wrong_ns, 0) + baseline = pd.read_table(tester.baseline_file) + assert_frame_equal(test, baseline, check_like=True) + +def test_WP_no_revert_radius(): + tester = WikiqTester(IKWIKI, "no_revert_radius") + + try: + tester.call_wikiq("-rr 0") + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) + + # as a test let's make sure that we get equal data frames + test = pd.read_table(tester.output) + num_reverted = sum(i is None for i in test.revert) + self.assertEqual(num_reverted, 0) + baseline = pd.read_table(tester.baseline_file) + assert_frame_equal(test, baseline, check_like=True) + +def test_WP_collapse_user(): + tester = WikiqTester(IKWIKI, "collapse_user") + + try: + tester.call_wikiq("--collapse-user") + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) + + test = pd.read_table(tester.output) + baseline = pd.read_table(tester.baseline_file) + assert_frame_equal(test, baseline, check_like=True) + +def test_noargs(): + tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z") + + try: + tester.call_wikiq() + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) + + test = pd.read_table(tester.output) + baseline = 
+
+def test_collapse_user():
+    tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")
+
+    try:
+        tester.call_wikiq("--collapse-user", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_table(tester.output)
+    baseline = pd.read_table(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_pwr_wikidiff2():
+    tester = WikiqTester(
+        SAILORMOON,
+        "persistence_wikidiff2",
+        in_compression="7z",
+        out_format="parquet",
+        baseline_format="parquet",
+    )
+
+    try:
+        tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output)
+    baseline = pd.read_parquet(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_pwr_segment():
+    tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")
+
+    try:
+        tester.call_wikiq("--persistence segment", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_table(tester.output)
+    baseline = pd.read_table(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_pwr_legacy():
+    tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")
+
+    try:
+        tester.call_wikiq("--persistence legacy", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_table(tester.output)
+    baseline = pd.read_table(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_pwr():
+    tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
+
+    try:
+        tester.call_wikiq("--persistence", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_table(tester.output)
+    baseline = pd.read_table(tester.baseline_file)
+
+    test = test.reindex(columns=sorted(test.columns))
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_diff():
+    tester = WikiqTester(
+        SAILORMOON,
+        "diff",
+        in_compression="7z",
+        out_format="parquet",
+        baseline_format="parquet",
+    )
+
+    try:
+        tester.call_wikiq("--diff", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+    baseline = pd.read_parquet(tester.baseline_file)
+
+    test = test.reindex(columns=sorted(test.columns))
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_text():
+    tester = WikiqTester(
+        SAILORMOON,
+        "text",
+        in_compression="7z",
+        out_format="parquet",
+        baseline_format="parquet",
+    )
+
+    try:
+        tester.call_wikiq("--diff", "--text", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+    baseline = pd.read_parquet(tester.baseline_file)
+
+    test = test.reindex(columns=sorted(test.columns))
+    assert_frame_equal(test, baseline, check_like=True)
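test_diff and test_text above exercise the new columns end to end. For a consumer of the `--diff` output, a minimal reading sketch follows; the path is illustrative, and the per-entry keys are assumed from wikidiff2's inline JSON format (wikiq itself keeps only non-zero entry types, as the wikiq changes further below show):

```python
import pandas as pd

# Illustrative path: with parquet output, wikiq writes one file per wiki
# into the -o directory, e.g. <output>/sailormoon.parquet.
df = pd.read_parquet("test_output/diff_sailormoon.parquet/sailormoon.parquet")

# Each "diff" cell holds the wikidiff2 entries for one revision; type 0
# (unchanged lines) is already filtered out by wikiq, so only added,
# deleted, changed, or moved lines remain.
for entry in df["diff"].iloc[0]:
    print(entry["type"], entry.get("text"))
```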
+
+def test_malformed_noargs():
+    tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z")
+    want_exception = (
+        "xml.etree.ElementTree.ParseError: no element found: line 1369, column 0"
+    )
+
+    try:
+        tester.call_wikiq()
+    except subprocess.CalledProcessError as exc:
+        errlines = exc.stderr.decode("utf8").splitlines()
+        assert errlines[-1] == want_exception
+    else:
+        pytest.fail("No exception raised, want: {}".format(want_exception))
+
+def test_stdout_noargs():
+    tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")
+
+    try:
+        outs = tester.call_wikiq("--stdout", "--fandom-2020", out=False).decode(
+            "utf8"
+        )
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_table(StringIO(outs))
+    baseline = pd.read_table(tester.baseline_file)
+    assert_frame_equal(test, baseline, check_like=True)
+
+def test_bad_regex():
+    tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex")
+
+    # sample arguments for checking that bad arguments get terminated / test_regex_arguments
+    bad_arguments_list = [
+        # label is missing
+        "-RP '\\b\\d+\\b'",
+        # number of reg and number of labels do not match
+        "-RP 'NPO V' -RP THE -RPl testlabel",
+        # cp but rp label
+        "-CP '(Tamil|Li)' -RPl testlabel",
+        # regex is missing
+        "-CPl testlabel",
+        "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'",
+    ]
+
+    for arguments in bad_arguments_list:
         try:
-            tester.call_wikiq()
+            tester.call_wikiq("--stdout", arguments, out=False)
         except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_WP_namespaces(self):
-        tester = WikiqTester(IKWIKI, "namespaces")
-
-        try:
-            tester.call_wikiq("-n 0", "-n 1")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.output)
-        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
-        self.assertEqual(num_wrong_ns, 0)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_WP_revert_radius(self):
-        tester = WikiqTester(IKWIKI, "revert_radius")
-
-        try:
-            tester.call_wikiq("-n 0", "-n 1", "-rr 1")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.output)
-        num_wrong_ns = sum(~ test.namespace.isin({0, 1}))
-        self.assertEqual(num_wrong_ns, 0)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_WP_no_revert_radius(self):
-        tester = WikiqTester(IKWIKI, "no_revert_radius")
-
-        try:
-            tester.call_wikiq("-rr 0")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        # as a test let's make sure that we get equal data frames
-        test = pd.read_table(tester.output)
-        num_reverted = sum(i is None for i in test.revert)
-        self.assertEqual(num_reverted, 0)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_WP_collapse_user(self):
-        tester = WikiqTester(IKWIKI, "collapse_user")
-
-        try:
-            tester.call_wikiq("--collapse-user")
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = pd.read_table(tester.baseline_file)
-        assert_frame_equal(test, baseline, check_like=True)
-
-    def test_noargs(self):
-        tester = WikiqTester(SAILORMOON, "noargs", in_compression="7z")
-
-        try:
-            tester.call_wikiq()
-        except subprocess.CalledProcessError as exc:
-            self.fail(exc.stderr.decode("utf8"))
-
-        test = pd.read_table(tester.output)
-        baseline = 
pd.read_table(tester.baseline_file) - assert_frame_equal(test, baseline, check_like=True) - - def test_collapse_user(self): - tester = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z") - - try: - tester.call_wikiq("--collapse-user", "--fandom-2020") - except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) - - test = pd.read_table(tester.output) - baseline = pd.read_table(tester.baseline_file) - assert_frame_equal(test, baseline, check_like=True) - - def test_pwr_wikidiff2(self): - tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z") - - try: - tester.call_wikiq("--persistence wikidiff2", "--fandom-2020") - except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) - - test = pd.read_table(tester.output) - baseline = pd.read_table(tester.baseline_file) - assert_frame_equal(test, baseline, check_like=True) - - def test_pwr_segment(self): - tester = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z") - - try: - tester.call_wikiq("--persistence segment", "--fandom-2020") - except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) - - test = pd.read_table(tester.output) - baseline = pd.read_table(tester.baseline_file) - assert_frame_equal(test, baseline, check_like=True) - - def test_pwr_legacy(self): - tester = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z") - - try: - tester.call_wikiq("--persistence legacy", "--fandom-2020") - except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) - - test = pd.read_table(tester.output) - baseline = pd.read_table(tester.baseline_file) - assert_frame_equal(test, baseline, check_like=True) - - def test_pwr(self): - tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") - - try: - tester.call_wikiq("--persistence", "--fandom-2020") - except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) - - test = pd.read_table(tester.output) - baseline = pd.read_table(tester.baseline_file) - - test = test.reindex(columns=sorted(test.columns)) - assert_frame_equal(test, baseline, check_like=True) - - def test_malformed_noargs(self): - tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z") - want_exception = 'xml.etree.ElementTree.ParseError: no element found: line 1369, column 0' - - try: - tester.call_wikiq() - except subprocess.CalledProcessError as exc: - errlines = exc.stderr.decode("utf8").splitlines() - self.assertEqual(errlines[-1], want_exception) + # we want to check that the bad arguments were caught and sys.exit is stopping the code + print(exc.stderr.decode("utf-8")) else: - self.fail("No exception raised, want: {}".format(want_exception)) + pytest.fail("No exception raised, want Exception") - def test_stdout_noargs(self): - tester = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z") +def test_good_regex(): + # sample arguments for checking the outcomes of good arguments / test_basic_regex + good_arguments_list = [ + "-RP '\\b\\d{3}\\b' -RPl threedigits", + "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", + "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", + "-CP 'WP:EVADE' -CPl wp_evade", + ] + + for i, arguments in enumerate(good_arguments_list): + tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i)) try: - outs = tester.call_wikiq("--stdout", "--fandom-2020", out=False).decode("utf8") + tester.call_wikiq(arguments) 
except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) + pytest.fail(exc.stderr.decode("utf8")) + + test = pd.read_table(tester.output) + + baseline = pd.read_table(tester.baseline_file) + assert_frame_equal(test, baseline, check_like=True) + print(i) + +def test_capturegroup_regex(): + cap_arguments_list = [ + "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", + "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov", + ] + + for i, arguments in enumerate(cap_arguments_list): + tester = WikiqTester( + wiki=REGEXTEST, case_name="capturegroup", suffix=str(i) + ) + + try: + tester.call_wikiq(arguments) + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) + + test = pd.read_table(tester.output) - test = pd.read_table(StringIO(outs)) baseline = pd.read_table(tester.baseline_file) assert_frame_equal(test, baseline, check_like=True) - def test_bad_regex(self): - tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex") +def test_parquet(): + tester = WikiqTester(IKWIKI, "noargs", out_format="parquet") - # sample arguments for checking that bad arguments get terminated / test_regex_arguments - bad_arguments_list = [ - # label is missing - "-RP '\\b\\d+\\b'", - # number of reg and number of labels do not match - "-RP 'NPO V' -RP THE -RPl testlabel", - # cp but rp label - "-CP '(Tamil|Li)' -RPl testlabel", - # regex is missing - "-CPl testlabel", - "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" - ] + try: + tester.call_wikiq() + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) - for arguments in bad_arguments_list: - try: - tester.call_wikiq("--stdout", arguments, out=False) - except subprocess.CalledProcessError as exc: - # we want to check that the bad arguments were caught and sys.exit is stopping the code - print(exc.stderr.decode("utf-8")) - else: - self.fail("No exception raised, want Exception") + # as a test let's make sure that we get equal data frames + test: DataFrame = pd.read_parquet(tester.output) + # test = test.drop(['reverteds'], axis=1) - def test_good_regex(self): - # sample arguments for checking the outcomes of good arguments / test_basic_regex - good_arguments_list = [ - "-RP '\\b\\d{3}\\b' -RPl threedigits", - "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", - "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", - "-CP 'WP:EVADE' -CPl wp_evade" - ] + baseline: DataFrame = pd.read_table(tester.baseline_file) - for i, arguments in enumerate(good_arguments_list): - tester = WikiqTester(wiki=REGEXTEST, case_name="basic", suffix=str(i)) + # Pandas does not read timestamps as the desired datetime type. + baseline["date_time"] = pd.to_datetime(baseline["date_time"]) + # Split strings to the arrays of reverted IDs so they can be compared. 
+ baseline["revert"] = baseline["revert"].replace(np.nan, None) + baseline["reverteds"] = baseline["reverteds"].replace(np.nan, None) + # baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']] + baseline["sha1"] = baseline["sha1"].replace(np.nan, None) + baseline["editor"] = baseline["editor"].replace(np.nan, None) + baseline["anon"] = baseline["anon"].replace(np.nan, None) - try: - tester.call_wikiq(arguments) - except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) - - test = pd.read_table(tester.output) - - baseline = pd.read_table(tester.baseline_file) - assert_frame_equal(test, baseline, check_like=True) - print(i) - - def test_capturegroup_regex(self): - cap_arguments_list = [ - "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P\\b[a-zA-Z]{3}\\b)|(?P\\b\\d+\\b)|(?P\\bcat\\b)' -CPl three", - "-CP '(?P\\bTestCaseA\\b)|(?P\\bTestCaseB\\b)|(?P\\bTestCaseC\\b)|(?P\\bTestCaseD\\b)' -CPl testcase -RP '(?Pnpov|NPOV)|(?Pneutral point of view)' -RPl npov" - ] - - for i, arguments in enumerate(cap_arguments_list): - tester = WikiqTester(wiki=REGEXTEST, case_name="capturegroup", suffix=str(i)) - - try: - tester.call_wikiq(arguments) - except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) - - test = pd.read_table(tester.output) - - baseline = pd.read_table(tester.baseline_file) - assert_frame_equal(test, baseline, check_like=True) - - def test_parquet(self): - tester = WikiqTester(IKWIKI, "noargs", out_format="parquet") + for index, row in baseline.iterrows(): + if row["revert"] != test["revert"][index]: + print(row["revid"], ":", row["revert"], "!=", test["revert"][index]) + for col in baseline.columns: try: - tester.call_wikiq() - except subprocess.CalledProcessError as exc: - self.fail(exc.stderr.decode("utf8")) + assert_series_equal( + test[col], baseline[col], check_like=True, check_dtype=False + ) + except ValueError as exc: + print(f"Error comparing column {col}") + pytest.fail(exc) - # as a test let's make sure that we get equal data frames - test: DataFrame = pd.read_parquet(tester.output) - # test = test.drop(['reverteds'], axis=1) - - baseline: DataFrame = pd.read_table(tester.baseline_file) - - # Pandas does not read timestamps as the desired datetime type. - baseline['date_time'] = pd.to_datetime(baseline['date_time']) - # Split strings to the arrays of reverted IDs so they can be compared. 
- baseline['revert'] = baseline['revert'].replace(np.nan, None) - baseline['reverteds'] = baseline['reverteds'].replace(np.nan, None) - # baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']] - baseline['sha1'] = baseline['sha1'].replace(np.nan, None) - baseline['editor'] = baseline['editor'].replace(np.nan, None) - baseline['anon'] = baseline['anon'].replace(np.nan, None) - - for index, row in baseline.iterrows(): - if row['revert'] != test['revert'][index]: - print(row['revid'], ":", row['revert'], "!=", test['revert'][index]) - - for col in baseline.columns: - try: - assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False) - except ValueError as exc: - print(f"Error comparing column {col}") - self.fail(exc) - - # assert_frame_equal(test, baseline, check_like=True, check_dtype=False) - - -if __name__ == '__main__': - unittest.main() + # assert_frame_equal(test, baseline, check_like=True, check_dtype=False) diff --git a/wiki_diff_matcher.py b/wiki_diff_matcher.py index 915a3bf..a6de9ae 100644 --- a/wiki_diff_matcher.py +++ b/wiki_diff_matcher.py @@ -16,10 +16,10 @@ import pywikidiff2 class DiffToOperationMap: def __init__(self, diff, tokenizer): self.tokenizer = tokenizer - self.diff = json.loads(diff) self.from_par_move_dict = {} self.to_par_move_dict = {} self.highlights_without_offset = [] + self.diff = diff # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets. self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict() self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict() @@ -375,7 +375,7 @@ class WikiDiffMatcher: numContextLines=1000000, moved_paragraph_detection_cutoff=200000 ) # Pre-compute diffs to reduce traffic overhead. - self.diffs = differ.inline_json_diff_sequence(list(texts)) + self.diffs = [json.loads(diff) for diff in differ.inline_json_diff_sequence(list(texts))] self.tokenizer = tokenizer or TOKENIZER class Processor(DiffEngine.Processor): diff --git a/wikiq b/wikiq index 5039c9c..6addada 100755 --- a/wikiq +++ b/wikiq @@ -208,6 +208,8 @@ class WikiqParser: regex_match_comment: list[str], regex_revision_label: list[str], regex_comment_label: list[str], + text: bool = False, + diff: bool = False, collapse_user: bool = False, persist: int = None, namespaces: Union[list[int], None] = None, @@ -226,7 +228,8 @@ class WikiqParser: self.persist: int = persist self.namespaces = [] self.revert_radius = revert_radius - + self.diff = diff + self.text = text if namespaces is not None: self.namespace_filter = set(namespaces) else: @@ -331,6 +334,9 @@ class WikiqParser: tables.RevisionIsAnon(), ]) + if self.text: + table.columns.append(tables.RevisionText()) + if self.collapse_user: table.columns.append(tables.RevisionCollapsed()) @@ -345,6 +351,10 @@ class WikiqParser: schema = table.schema() schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) + if self.diff: + from diff_pyarrow_schema import diff_field + schema = schema.append(diff_field) + # Add regex fields to the schema. for pair in self.regex_revision_pairs: for field in pair.get_pyarrow_fields(): @@ -412,9 +422,18 @@ class WikiqParser: revision_texts.append(rev.text) + wikidiff_matcher = None + if self.diff or self.persist == PersistMethod.wikidiff2: + wikidiff_matcher = WikiDiffMatcher(revision_texts, + tokenizer=wikitext_split, + ) + # Collect the set of pages currently buffered in the table so we can run multi-page functions on them. 
row_buffer = table.pop() + if self.diff: + row_buffer['diff'] = [[entry for entry in wikidiff_matcher.diffs[i]['diff'] if entry['type'] != 0 ] for i in range(len(revision_texts))] + is_revert_column: list[Union[bool, None]] = [] for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): if self.revert_radius == 0 or d: @@ -428,6 +447,7 @@ class WikiqParser: row_buffer[k] = v regex_matches = {} + if self.persist != PersistMethod.none: window = deque(maxlen=PERSISTENCE_RADIUS) @@ -443,9 +463,8 @@ class WikiqParser: state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), revert_radius=PERSISTENCE_RADIUS) elif self.persist == PersistMethod.wikidiff2: - state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts, - tokenizer=wikitext_split, - ), + + state = mwpersistence.DiffState(wikidiff_matcher, revert_radius=PERSISTENCE_RADIUS) else: from mw.lib import persistence @@ -469,8 +488,6 @@ class WikiqParser: row_buffer['tokens_removed'].append(len(old_tokens_removed)) row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1) - del row_buffer['text'] - # print out metadata for the last RADIUS revisions for i, item in enumerate(window): # if the window was full, we've already printed item 0 @@ -485,6 +502,9 @@ class WikiqParser: row_buffer['tokens_removed'].append(len(tokens_removed)) row_buffer['tokens_window'].append(len(window) - (i + 1)) + if not self.text: + del row_buffer['text'] + writer.write(pa.table(row_buffer, schema=schema)) page_count += 1 @@ -494,7 +514,6 @@ class WikiqParser: writer.close() - def match_archive_suffix(input_filename): if re.match(r'.*\.7z$', input_filename): cmd = ["7za", "x", "-so", input_filename] @@ -580,6 +599,14 @@ def main(): action='append', help="The label for the outputted column based on matching the regex in comments.") + parser.add_argument('-d', '--diff', dest="diff", default=False, + action='store_true', + help="Output a diff structure for each revision with information about changed or moved lines.") + + parser.add_argument('-t', '--text', dest="text", default=False, + action='store_true', + help="Output the text of the revision.") + parser.add_argument('--fandom-2020', dest="fandom_2020", action='store_true', help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") @@ -604,6 +631,7 @@ def main(): else: namespaces = None + print(args, file=sys.stderr) if len(args.dumpfiles) > 0: for filename in args.dumpfiles: input_file = open_input_file(filename, args.fandom_2020) @@ -637,6 +665,8 @@ def main(): regex_revision_label=args.regex_revision_label, regex_match_comment=args.regex_match_comment, regex_comment_label=args.regex_comment_label, + text=args.text, + diff=args.diff, output_parquet=output_parquet, ) @@ -656,7 +686,9 @@ def main(): regex_match_revision=args.regex_match_revision, regex_revision_label=args.regex_revision_label, regex_match_comment=args.regex_match_comment, - regex_comment_label=args.regex_comment_label) + regex_comment_label=args.regex_comment_label, + diff=args.diff, + text=args.text) wikiq.process()
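Taken together, the new options are invoked the same way the tests above invoke them; a sketch with illustrative file names follows. Note that per the change above, `--diff` builds the WikiDiffMatcher even when no persistence method is selected.

```python
import subprocess

# Illustrative invocation of the flags added by this patch: --diff emits
# the per-revision diff column, --text keeps the raw revision text in the
# output instead of dropping it from the row buffer.
subprocess.check_output(
    "wikiq dump.xml.7z -o out_dir --diff --text",
    stderr=subprocess.PIPE,
    shell=True,
)
```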