add (optional) diff and text columns to output.

This commit is contained in:
Nathan TeBlunthuis 2025-07-07 14:39:52 -07:00
parent a8e9e7f4fd
commit d6c4c0a416
4 changed files with 373 additions and 295 deletions

View File

@ -218,3 +218,11 @@ class RevisionCollapsed(RevisionField[int]):
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int: def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> int:
return len(revisions) return len(revisions)
class RevisionText(RevisionField[str]):
    """Column exposing the text of a revision.

    When several revisions are handed in together, the text of the last
    revision in the list is the one reported.
    """

    field = pa.field("text", pa.string(), nullable=False)

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
        """Return the ``text`` attribute of the final entry of ``revisions``."""
        # NOTE(review): the field is declared non-nullable; presumably the
        # revisions passed in always carry text -- confirm against callers.
        return revisions[-1].text

View File

@ -1,15 +1,14 @@
import shutil
import unittest
import os import os
import shutil
import subprocess import subprocess
import tracemalloc
from io import StringIO
from typing import Final, Union
import pytest
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from pandas import DataFrame from pandas import DataFrame
from pandas.testing import assert_frame_equal, assert_series_equal from pandas.testing import assert_frame_equal, assert_series_equal
from io import StringIO
import tracemalloc
from typing import Final, Union
# Make references to files and wikiq relative to this file, not to the current working directory. # Make references to files and wikiq relative to this file, not to the current working directory.
TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__)) TEST_DIR: Final[str] = os.path.dirname(os.path.realpath(__file__))
@ -37,7 +36,8 @@ setup()
class WikiqTester: class WikiqTester:
def __init__(self, def __init__(
self,
wiki: str, wiki: str,
case_name: str, case_name: str,
suffix: Union[str, None] = None, suffix: Union[str, None] = None,
@ -45,13 +45,17 @@ class WikiqTester:
baseline_format: str = "tsv", baseline_format: str = "tsv",
out_format: str = "tsv", out_format: str = "tsv",
): ):
self.input_file = os.path.join(TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)) self.input_file = os.path.join(
TEST_DIR, "dumps", "{0}.xml.{1}".format(wiki, in_compression)
)
basename = "{0}_{1}".format(case_name, wiki) basename = "{0}_{1}".format(case_name, wiki)
if suffix: if suffix:
basename = "{0}_{1}".format(basename, suffix) basename = "{0}_{1}".format(basename, suffix)
self.output = os.path.join(TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format)) self.output = os.path.join(
TEST_OUTPUT_DIR, "{0}.{1}".format(basename, out_format)
)
if os.path.exists(self.output): if os.path.exists(self.output):
if os.path.isfile(self.output): if os.path.isfile(self.output):
@ -66,12 +70,16 @@ class WikiqTester:
self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format) self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)
self.wikiq_out_name = "{0}.{1}".format(wiki, out_format) self.wikiq_out_name = "{0}.{1}".format(wiki, out_format)
else: else:
self.wikiq_baseline_name = "{0}_{1}.{2}".format(wiki, suffix, baseline_format) self.wikiq_baseline_name = "{0}_{1}.{2}".format(
wiki, suffix, baseline_format
)
self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format) self.wikiq_out_name = "{0}_{1}.{2}".format(wiki, suffix, out_format)
# If case_name is unset, there are no relevant baseline or test files. # If case_name is unset, there are no relevant baseline or test files.
if case_name is not None: if case_name is not None:
self.baseline_file = os.path.join(BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)) self.baseline_file = os.path.join(
BASELINE_DIR, "{0}_{1}".format(case_name, self.wikiq_baseline_name)
)
def call_wikiq(self, *args: str, out: bool = True): def call_wikiq(self, *args: str, out: bool = True):
""" """
@ -81,14 +89,13 @@ class WikiqTester:
:return: The output of the wikiq call. :return: The output of the wikiq call.
""" """
if out: if out:
call = ' '.join([WIKIQ, self.input_file, "-o", self.output, *args]) call = " ".join([WIKIQ, self.input_file, "-o", self.output, *args])
else: else:
call = ' '.join([WIKIQ, self.input_file, *args]) call = " ".join([WIKIQ, self.input_file, *args])
print(call) print(call)
return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True) return subprocess.check_output(call, stderr=subprocess.PIPE, shell=True)
# with / without pwr DONE # with / without pwr DONE
# with / without url encode DONE # with / without url encode DONE
# with / without collapse user DONE # with / without collapse user DONE
@ -99,26 +106,25 @@ class WikiqTester:
# wikia and wikipedia data DONE # wikia and wikipedia data DONE
# malformed xmls DONE # malformed xmls DONE
def test_WP_noargs():
    """Run wikiq on the IKWIKI dump with no extra flags and compare with the baseline."""
    runner = WikiqTester(IKWIKI, "noargs")

    try:
        runner.call_wikiq()
    except subprocess.CalledProcessError as failure:
        # Report whatever wikiq wrote to stderr as the failure reason.
        pytest.fail(failure.stderr.decode("utf8"))

    observed = pd.read_table(runner.output)
    expected = pd.read_table(runner.baseline_file)
    assert_frame_equal(observed, expected, check_like=True)
def test_WP_namespaces(self): def test_WP_namespaces():
tester = WikiqTester(IKWIKI, "namespaces") tester = WikiqTester(IKWIKI, "namespaces")
try: try:
tester.call_wikiq("-n 0", "-n 1") tester.call_wikiq("-n 0", "-n 1")
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
# as a test let's make sure that we get equal data frames # as a test let's make sure that we get equal data frames
test = pd.read_table(tester.output) test = pd.read_table(tester.output)
@ -127,13 +133,13 @@ class WikiqTestCase(unittest.TestCase):
baseline = pd.read_table(tester.baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
def test_WP_revert_radius(self): def test_WP_revert_radius():
tester = WikiqTester(IKWIKI, "revert_radius") tester = WikiqTester(IKWIKI, "revert_radius")
try: try:
tester.call_wikiq("-n 0", "-n 1", "-rr 1") tester.call_wikiq("-n 0", "-n 1", "-rr 1")
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
# as a test let's make sure that we get equal data frames # as a test let's make sure that we get equal data frames
test = pd.read_table(tester.output) test = pd.read_table(tester.output)
@ -142,13 +148,13 @@ class WikiqTestCase(unittest.TestCase):
baseline = pd.read_table(tester.baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
def test_WP_no_revert_radius(self): def test_WP_no_revert_radius():
tester = WikiqTester(IKWIKI, "no_revert_radius") tester = WikiqTester(IKWIKI, "no_revert_radius")
try: try:
tester.call_wikiq("-rr 0") tester.call_wikiq("-rr 0")
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
# as a test let's make sure that we get equal data frames # as a test let's make sure that we get equal data frames
test = pd.read_table(tester.output) test = pd.read_table(tester.output)
@ -157,85 +163,85 @@ class WikiqTestCase(unittest.TestCase):
baseline = pd.read_table(tester.baseline_file) baseline = pd.read_table(tester.baseline_file)
assert_frame_equal(test, baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
def test_WP_collapse_user():
    """Run wikiq with ``--collapse-user`` on the IKWIKI dump and compare with the baseline."""
    runner = WikiqTester(IKWIKI, "collapse_user")

    try:
        runner.call_wikiq("--collapse-user")
    except subprocess.CalledProcessError as failure:
        # Report whatever wikiq wrote to stderr as the failure reason.
        pytest.fail(failure.stderr.decode("utf8"))

    observed = pd.read_table(runner.output)
    expected = pd.read_table(runner.baseline_file)
    assert_frame_equal(observed, expected, check_like=True)
def test_noargs():
    """Run wikiq with no extra flags on the 7z SAILORMOON dump and compare with the baseline."""
    runner = WikiqTester(SAILORMOON, "noargs", in_compression="7z")

    try:
        runner.call_wikiq()
    except subprocess.CalledProcessError as failure:
        # Report whatever wikiq wrote to stderr as the failure reason.
        pytest.fail(failure.stderr.decode("utf8"))

    observed = pd.read_table(runner.output)
    expected = pd.read_table(runner.baseline_file)
    assert_frame_equal(observed, expected, check_like=True)
def test_collapse_user():
    """Run wikiq with ``--collapse-user --fandom-2020`` on the 7z SAILORMOON dump."""
    runner = WikiqTester(SAILORMOON, "collapse-user", in_compression="7z")

    try:
        runner.call_wikiq("--collapse-user", "--fandom-2020")
    except subprocess.CalledProcessError as failure:
        # Report whatever wikiq wrote to stderr as the failure reason.
        pytest.fail(failure.stderr.decode("utf8"))

    observed = pd.read_table(runner.output)
    expected = pd.read_table(runner.baseline_file)
    assert_frame_equal(observed, expected, check_like=True)
def test_pwr_wikidiff2():
    """Run wikiq with ``--persistence wikidiff2`` on the 7z SAILORMOON dump.

    The resulting table is compared with the stored baseline for the
    ``persistence_wikidiff2`` case.
    """
    tester = WikiqTester(SAILORMOON, "persistence_wikidiff2", in_compression="7z")

    try:
        tester.call_wikiq("--persistence wikidiff2", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # The tester is built with the default "tsv" output and baseline formats,
    # so both files are read as delimited tables (as the sibling persistence
    # tests do) rather than as parquet.
    test = pd.read_table(tester.output)
    baseline = pd.read_table(tester.baseline_file)
    assert_frame_equal(test, baseline, check_like=True)
def test_pwr_segment():
    """Run wikiq with ``--persistence segment`` on the 7z SAILORMOON dump and compare with the baseline."""
    runner = WikiqTester(SAILORMOON, "persistence_segment", in_compression="7z")

    try:
        runner.call_wikiq("--persistence segment", "--fandom-2020")
    except subprocess.CalledProcessError as failure:
        # Report whatever wikiq wrote to stderr as the failure reason.
        pytest.fail(failure.stderr.decode("utf8"))

    observed = pd.read_table(runner.output)
    expected = pd.read_table(runner.baseline_file)
    assert_frame_equal(observed, expected, check_like=True)
def test_pwr_legacy():
    """Run wikiq with ``--persistence legacy`` on the 7z SAILORMOON dump and compare with the baseline."""
    runner = WikiqTester(SAILORMOON, "persistence_legacy", in_compression="7z")

    try:
        runner.call_wikiq("--persistence legacy", "--fandom-2020")
    except subprocess.CalledProcessError as failure:
        # Report whatever wikiq wrote to stderr as the failure reason.
        pytest.fail(failure.stderr.decode("utf8"))

    observed = pd.read_table(runner.output)
    expected = pd.read_table(runner.baseline_file)
    assert_frame_equal(observed, expected, check_like=True)
def test_pwr(self): def test_pwr():
tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z") tester = WikiqTester(SAILORMOON, "persistence", in_compression="7z")
try: try:
tester.call_wikiq("--persistence", "--fandom-2020") tester.call_wikiq("--persistence", "--fandom-2020")
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_table(tester.output) test = pd.read_table(tester.output)
baseline = pd.read_table(tester.baseline_file) baseline = pd.read_table(tester.baseline_file)
@ -243,9 +249,39 @@ class WikiqTestCase(unittest.TestCase):
test = test.reindex(columns=sorted(test.columns)) test = test.reindex(columns=sorted(test.columns))
assert_frame_equal(test, baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
def test_diff():
    """Run wikiq with ``--diff`` on the 7z SAILORMOON dump, writing and comparing parquet."""
    runner = WikiqTester(
        SAILORMOON,
        "diff",
        in_compression="7z",
        out_format="parquet",
        baseline_format="parquet",
    )

    try:
        runner.call_wikiq("--diff", "--fandom-2020")
    except subprocess.CalledProcessError as failure:
        # Report whatever wikiq wrote to stderr as the failure reason.
        pytest.fail(failure.stderr.decode("utf8"))

    # The parquet file is looked up under the tester's output path, named
    # after the wiki.
    produced_path = runner.output + f"/{SAILORMOON}.parquet"
    observed = pd.read_parquet(produced_path)
    expected = pd.read_parquet(runner.baseline_file)

    # Put the produced columns in sorted order before comparing.
    observed = observed.reindex(columns=sorted(observed.columns))
    assert_frame_equal(observed, expected, check_like=True)
def test_text():
    """Run wikiq with ``--diff --text`` on the 7z SAILORMOON dump, writing and comparing parquet."""
    runner = WikiqTester(
        SAILORMOON,
        "text",
        in_compression="7z",
        out_format="parquet",
        baseline_format="parquet",
    )

    try:
        runner.call_wikiq("--diff", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as failure:
        # Report whatever wikiq wrote to stderr as the failure reason.
        pytest.fail(failure.stderr.decode("utf8"))

    # The parquet file is looked up under the tester's output path, named
    # after the wiki.
    produced_path = runner.output + f"/{SAILORMOON}.parquet"
    observed = pd.read_parquet(produced_path)
    expected = pd.read_parquet(runner.baseline_file)

    # Put the produced columns in sorted order before comparing.
    observed = observed.reindex(columns=sorted(observed.columns))
    assert_frame_equal(observed, expected, check_like=True)
def test_malformed_noargs():
    """Check that wikiq fails with the expected XML parse error on the TWINPEAKS dump.

    The test passes only when wikiq exits with an error and the last line
    of its stderr equals the expected ``ParseError`` message; a successful
    run is reported as a failure.
    """
    tester = WikiqTester(wiki=TWINPEAKS, case_name="noargs", in_compression="7z")
    want_exception = (
        "xml.etree.ElementTree.ParseError: no element found: line 1369, column 0"
    )

    try:
        tester.call_wikiq()
    except subprocess.CalledProcessError as exc:
        errlines = exc.stderr.decode("utf8").splitlines()
        # Plain assert: this is a module-level pytest function, so there is
        # no unittest ``self`` on which to call assertEqual.
        assert errlines[-1] == want_exception
    else:
        pytest.fail("No exception raised, want: {}".format(want_exception))
def test_stdout_noargs():
    """Run wikiq with ``--stdout`` on the 7z SAILORMOON dump and compare the captured table with the baseline."""
    runner = WikiqTester(wiki=SAILORMOON, case_name="noargs", in_compression="7z")

    try:
        # out=False: no "-o" argument is passed, the table is taken from stdout.
        raw = runner.call_wikiq("--stdout", "--fandom-2020", out=False)
        captured = raw.decode("utf8")
    except subprocess.CalledProcessError as failure:
        pytest.fail(failure.stderr.decode("utf8"))

    observed = pd.read_table(StringIO(captured))
    expected = pd.read_table(runner.baseline_file)
    assert_frame_equal(observed, expected, check_like=True)
def test_bad_regex(self): def test_bad_regex():
tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex") tester = WikiqTester(wiki=REGEXTEST, case_name="bad_regex")
# sample arguments for checking that bad arguments get terminated / test_regex_arguments # sample arguments for checking that bad arguments get terminated / test_regex_arguments
@ -280,7 +318,7 @@ class WikiqTestCase(unittest.TestCase):
"-CP '(Tamil|Li)' -RPl testlabel", "-CP '(Tamil|Li)' -RPl testlabel",
# regex is missing # regex is missing
"-CPl testlabel", "-CPl testlabel",
"-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'" "-RP '\\b\\w{3}\\b' -RPl threeletters -CP '\\b\\w{3}\\b'",
] ]
for arguments in bad_arguments_list: for arguments in bad_arguments_list:
@ -290,15 +328,15 @@ class WikiqTestCase(unittest.TestCase):
# we want to check that the bad arguments were caught and sys.exit is stopping the code # we want to check that the bad arguments were caught and sys.exit is stopping the code
print(exc.stderr.decode("utf-8")) print(exc.stderr.decode("utf-8"))
else: else:
self.fail("No exception raised, want Exception") pytest.fail("No exception raised, want Exception")
def test_good_regex(self): def test_good_regex():
# sample arguments for checking the outcomes of good arguments / test_basic_regex # sample arguments for checking the outcomes of good arguments / test_basic_regex
good_arguments_list = [ good_arguments_list = [
"-RP '\\b\\d{3}\\b' -RPl threedigits", "-RP '\\b\\d{3}\\b' -RPl threedigits",
"-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word", "-RP 'TestCase' -RP 'page' -RPl testcases -RPl page_word",
"-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning", "-CP 'Chevalier' -CPl chev_com -RP 'welcome to Wikipedia' -RPl wiki_welcome -CP 'Warning' -CPl warning",
"-CP 'WP:EVADE' -CPl wp_evade" "-CP 'WP:EVADE' -CPl wp_evade",
] ]
for i, arguments in enumerate(good_arguments_list): for i, arguments in enumerate(good_arguments_list):
@ -307,7 +345,7 @@ class WikiqTestCase(unittest.TestCase):
try: try:
tester.call_wikiq(arguments) tester.call_wikiq(arguments)
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
test = pd.read_table(tester.output) test = pd.read_table(tester.output)
@ -315,32 +353,34 @@ class WikiqTestCase(unittest.TestCase):
assert_frame_equal(test, baseline, check_like=True) assert_frame_equal(test, baseline, check_like=True)
print(i) print(i)
def test_capturegroup_regex():
    """Run wikiq once per capture-group argument string and compare each run with its baseline.

    Each argument string gets its own tester, suffixed with its position
    in the list.
    """
    cap_arguments_list = [
        "-RP 'Li Chevalier' -RPl li_cheval -CP '(?P<letter>\\b[a-zA-Z]{3}\\b)|(?P<number>\\b\\d+\\b)|(?P<cat>\\bcat\\b)' -CPl three",
        "-CP '(?P<a>\\bTestCaseA\\b)|(?P<b>\\bTestCaseB\\b)|(?P<c>\\bTestCaseC\\b)|(?P<d>\\bTestCaseD\\b)' -CPl testcase -RP '(?P<npov>npov|NPOV)|(?P<neutral>neutral point of view)' -RPl npov",
    ]

    for position, cli_args in enumerate(cap_arguments_list):
        runner = WikiqTester(
            wiki=REGEXTEST, case_name="capturegroup", suffix=str(position)
        )

        try:
            runner.call_wikiq(cli_args)
        except subprocess.CalledProcessError as failure:
            # Report whatever wikiq wrote to stderr as the failure reason.
            pytest.fail(failure.stderr.decode("utf8"))

        observed = pd.read_table(runner.output)
        expected = pd.read_table(runner.baseline_file)
        assert_frame_equal(observed, expected, check_like=True)
def test_parquet(self): def test_parquet():
tester = WikiqTester(IKWIKI, "noargs", out_format="parquet") tester = WikiqTester(IKWIKI, "noargs", out_format="parquet")
try: try:
tester.call_wikiq() tester.call_wikiq()
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
self.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
# as a test let's make sure that we get equal data frames # as a test let's make sure that we get equal data frames
test: DataFrame = pd.read_parquet(tester.output) test: DataFrame = pd.read_parquet(tester.output)
@ -349,28 +389,26 @@ class WikiqTestCase(unittest.TestCase):
baseline: DataFrame = pd.read_table(tester.baseline_file) baseline: DataFrame = pd.read_table(tester.baseline_file)
# Pandas does not read timestamps as the desired datetime type. # Pandas does not read timestamps as the desired datetime type.
baseline['date_time'] = pd.to_datetime(baseline['date_time']) baseline["date_time"] = pd.to_datetime(baseline["date_time"])
# Split strings to the arrays of reverted IDs so they can be compared. # Split strings to the arrays of reverted IDs so they can be compared.
baseline['revert'] = baseline['revert'].replace(np.nan, None) baseline["revert"] = baseline["revert"].replace(np.nan, None)
baseline['reverteds'] = baseline['reverteds'].replace(np.nan, None) baseline["reverteds"] = baseline["reverteds"].replace(np.nan, None)
# baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']] # baseline['reverteds'] = [None if i is np.nan else [int(j) for j in str(i).split(",")] for i in baseline['reverteds']]
baseline['sha1'] = baseline['sha1'].replace(np.nan, None) baseline["sha1"] = baseline["sha1"].replace(np.nan, None)
baseline['editor'] = baseline['editor'].replace(np.nan, None) baseline["editor"] = baseline["editor"].replace(np.nan, None)
baseline['anon'] = baseline['anon'].replace(np.nan, None) baseline["anon"] = baseline["anon"].replace(np.nan, None)
for index, row in baseline.iterrows(): for index, row in baseline.iterrows():
if row['revert'] != test['revert'][index]: if row["revert"] != test["revert"][index]:
print(row['revid'], ":", row['revert'], "!=", test['revert'][index]) print(row["revid"], ":", row["revert"], "!=", test["revert"][index])
for col in baseline.columns: for col in baseline.columns:
try: try:
assert_series_equal(test[col], baseline[col], check_like=True, check_dtype=False) assert_series_equal(
test[col], baseline[col], check_like=True, check_dtype=False
)
except ValueError as exc: except ValueError as exc:
print(f"Error comparing column {col}") print(f"Error comparing column {col}")
self.fail(exc) pytest.fail(exc)
# assert_frame_equal(test, baseline, check_like=True, check_dtype=False) # assert_frame_equal(test, baseline, check_like=True, check_dtype=False)
if __name__ == '__main__':
unittest.main()

View File

@ -16,10 +16,10 @@ import pywikidiff2
class DiffToOperationMap: class DiffToOperationMap:
def __init__(self, diff, tokenizer): def __init__(self, diff, tokenizer):
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.diff = json.loads(diff)
self.from_par_move_dict = {} self.from_par_move_dict = {}
self.to_par_move_dict = {} self.to_par_move_dict = {}
self.highlights_without_offset = [] self.highlights_without_offset = []
self.diff = diff
# we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets. # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict() self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict() self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
@ -375,7 +375,7 @@ class WikiDiffMatcher:
numContextLines=1000000, moved_paragraph_detection_cutoff=200000 numContextLines=1000000, moved_paragraph_detection_cutoff=200000
) )
# Pre-compute diffs to reduce traffic overhead. # Pre-compute diffs to reduce traffic overhead.
self.diffs = differ.inline_json_diff_sequence(list(texts)) self.diffs = [json.loads(diff) for diff in differ.inline_json_diff_sequence(list(texts))]
self.tokenizer = tokenizer or TOKENIZER self.tokenizer = tokenizer or TOKENIZER
class Processor(DiffEngine.Processor): class Processor(DiffEngine.Processor):

48
wikiq
View File

@ -208,6 +208,8 @@ class WikiqParser:
regex_match_comment: list[str], regex_match_comment: list[str],
regex_revision_label: list[str], regex_revision_label: list[str],
regex_comment_label: list[str], regex_comment_label: list[str],
text: bool = False,
diff: bool = False,
collapse_user: bool = False, collapse_user: bool = False,
persist: int = None, persist: int = None,
namespaces: Union[list[int], None] = None, namespaces: Union[list[int], None] = None,
@ -226,7 +228,8 @@ class WikiqParser:
self.persist: int = persist self.persist: int = persist
self.namespaces = [] self.namespaces = []
self.revert_radius = revert_radius self.revert_radius = revert_radius
self.diff = diff
self.text = text
if namespaces is not None: if namespaces is not None:
self.namespace_filter = set(namespaces) self.namespace_filter = set(namespaces)
else: else:
@ -331,6 +334,9 @@ class WikiqParser:
tables.RevisionIsAnon(), tables.RevisionIsAnon(),
]) ])
if self.text:
table.columns.append(tables.RevisionText())
if self.collapse_user: if self.collapse_user:
table.columns.append(tables.RevisionCollapsed()) table.columns.append(tables.RevisionCollapsed())
@ -345,6 +351,10 @@ class WikiqParser:
schema = table.schema() schema = table.schema()
schema = schema.append(pa.field('revert', pa.bool_(), nullable=True)) schema = schema.append(pa.field('revert', pa.bool_(), nullable=True))
if self.diff:
from diff_pyarrow_schema import diff_field
schema = schema.append(diff_field)
# Add regex fields to the schema. # Add regex fields to the schema.
for pair in self.regex_revision_pairs: for pair in self.regex_revision_pairs:
for field in pair.get_pyarrow_fields(): for field in pair.get_pyarrow_fields():
@ -412,9 +422,18 @@ class WikiqParser:
revision_texts.append(rev.text) revision_texts.append(rev.text)
wikidiff_matcher = None
if self.diff or self.persist == PersistMethod.wikidiff2:
wikidiff_matcher = WikiDiffMatcher(revision_texts,
tokenizer=wikitext_split,
)
# Collect the set of pages currently buffered in the table so we can run multi-page functions on them. # Collect the set of pages currently buffered in the table so we can run multi-page functions on them.
row_buffer = table.pop() row_buffer = table.pop()
if self.diff:
row_buffer['diff'] = [[entry for entry in wikidiff_matcher.diffs[i]['diff'] if entry['type'] != 0 ] for i in range(len(revision_texts))]
is_revert_column: list[Union[bool, None]] = [] is_revert_column: list[Union[bool, None]] = []
for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']): for r, d in zip(row_buffer['reverteds'], row_buffer['deleted']):
if self.revert_radius == 0 or d: if self.revert_radius == 0 or d:
@ -428,6 +447,7 @@ class WikiqParser:
row_buffer[k] = v row_buffer[k] = v
regex_matches = {} regex_matches = {}
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none:
window = deque(maxlen=PERSISTENCE_RADIUS) window = deque(maxlen=PERSISTENCE_RADIUS)
@ -443,9 +463,8 @@ class WikiqParser:
state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split),
revert_radius=PERSISTENCE_RADIUS) revert_radius=PERSISTENCE_RADIUS)
elif self.persist == PersistMethod.wikidiff2: elif self.persist == PersistMethod.wikidiff2:
state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts,
tokenizer=wikitext_split, state = mwpersistence.DiffState(wikidiff_matcher,
),
revert_radius=PERSISTENCE_RADIUS) revert_radius=PERSISTENCE_RADIUS)
else: else:
from mw.lib import persistence from mw.lib import persistence
@ -469,8 +488,6 @@ class WikiqParser:
row_buffer['tokens_removed'].append(len(old_tokens_removed)) row_buffer['tokens_removed'].append(len(old_tokens_removed))
row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1) row_buffer['tokens_window'].append(PERSISTENCE_RADIUS - 1)
del row_buffer['text']
# print out metadata for the last RADIUS revisions # print out metadata for the last RADIUS revisions
for i, item in enumerate(window): for i, item in enumerate(window):
# if the window was full, we've already printed item 0 # if the window was full, we've already printed item 0
@ -485,6 +502,9 @@ class WikiqParser:
row_buffer['tokens_removed'].append(len(tokens_removed)) row_buffer['tokens_removed'].append(len(tokens_removed))
row_buffer['tokens_window'].append(len(window) - (i + 1)) row_buffer['tokens_window'].append(len(window) - (i + 1))
if not self.text:
del row_buffer['text']
writer.write(pa.table(row_buffer, schema=schema)) writer.write(pa.table(row_buffer, schema=schema))
page_count += 1 page_count += 1
@ -494,7 +514,6 @@ class WikiqParser:
writer.close() writer.close()
def match_archive_suffix(input_filename): def match_archive_suffix(input_filename):
if re.match(r'.*\.7z$', input_filename): if re.match(r'.*\.7z$', input_filename):
cmd = ["7za", "x", "-so", input_filename] cmd = ["7za", "x", "-so", input_filename]
@ -580,6 +599,14 @@ def main():
action='append', action='append',
help="The label for the outputted column based on matching the regex in comments.") help="The label for the outputted column based on matching the regex in comments.")
parser.add_argument('-d', '--diff', dest="diff", default=False,
action='store_true',
help="Output a diff structure for each revision with information about changed or moved lines.")
parser.add_argument('-t', '--text', dest="text", default=False,
action='store_true',
help="Output the text of the revision.")
parser.add_argument('--fandom-2020', dest="fandom_2020", parser.add_argument('--fandom-2020', dest="fandom_2020",
action='store_true', action='store_true',
help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.") help="Whether the archive is from the fandom 2020 dumps by Wikiteam. These dumps can have multiple .xml files in their archives.")
@ -604,6 +631,7 @@ def main():
else: else:
namespaces = None namespaces = None
print(args, file=sys.stderr)
if len(args.dumpfiles) > 0: if len(args.dumpfiles) > 0:
for filename in args.dumpfiles: for filename in args.dumpfiles:
input_file = open_input_file(filename, args.fandom_2020) input_file = open_input_file(filename, args.fandom_2020)
@ -637,6 +665,8 @@ def main():
regex_revision_label=args.regex_revision_label, regex_revision_label=args.regex_revision_label,
regex_match_comment=args.regex_match_comment, regex_match_comment=args.regex_match_comment,
regex_comment_label=args.regex_comment_label, regex_comment_label=args.regex_comment_label,
text=args.text,
diff=args.diff,
output_parquet=output_parquet, output_parquet=output_parquet,
) )
@ -656,7 +686,9 @@ def main():
regex_match_revision=args.regex_match_revision, regex_match_revision=args.regex_match_revision,
regex_revision_label=args.regex_revision_label, regex_revision_label=args.regex_revision_label,
regex_match_comment=args.regex_match_comment, regex_match_comment=args.regex_match_comment,
regex_comment_label=args.regex_comment_label) regex_comment_label=args.regex_comment_label,
diff=args.diff,
text=args.text)
wikiq.process() wikiq.process()