WIP: fixing bugs and adding newlines to output.

This commit is contained in:
Nathan TeBlunthuis
2025-07-02 13:31:32 -07:00
parent c4acc711d2
commit cf1fb61a84
3 changed files with 297 additions and 97 deletions

View File

@@ -8,30 +8,36 @@ import pytest_asyncio
from typing import List
from deltas import Delete, Equal, Insert, wikitext_split
from mwpersistence import Token
from wiki_diff_matcher import WikiDiffMatcher
@pytest_asyncio.fixture(scope="module", autouse=True)
async def start_stop_server():
    """Run the PHP wikidiff2 API server while this module's tests execute.

    Starts PHP's built-in web server on 127.0.0.1:8000 serving
    ``wikidiff2_api.php``, yields the process handle, and on teardown
    terminates the server and prints what it wrote to stdout and stderr.

    Yields:
        The ``asyncio`` subprocess handle of the running server.
    """
    print("starting server")
    proc = await asyncio.create_subprocess_exec(
        "php", "-S", "127.0.0.1:8000", "wikidiff2_api.php",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # php needs a moment to actually start
    await asyncio.sleep(0.1)
    yield proc
    print("stopping server")
    # Terminate before communicate(): the server never exits on its own, so
    # waiting for it first would block the teardown forever.
    if proc.returncode is None:
        proc.terminate()
    stdout, stderr = await proc.communicate()
    # The pipes deliver bytes; decode leniently so odd output cannot break teardown.
    print(stdout.decode(errors="replace"))
    print(stderr.decode(errors="replace"))
def assert_equal_enough(tokens: List[Token], rev):
    """Assert that the joined tokens reproduce the revision text.

    Both sides are normalized before comparison: runs of consecutive
    newlines are collapsed to a single newline and leading/trailing
    whitespace is removed. The normalized texts are also written to the
    files ``token`` and ``rev`` in the working directory so a failure can be
    inspected with an external diff tool.

    Args:
        tokens: Tokens whose ``str()`` forms are concatenated.
        rev: The revision text the tokens are expected to reproduce.

    Raises:
        AssertionError: If the normalized texts differ.
    """
    def normalize(text):
        # Collapse blank-line runs, then always strip, so that a trailing
        # newline is ignored even when the text contains no blank lines.
        while '\n\n' in text:
            text = text.replace('\n\n', '\n')
        return text.strip()

    token_doc = normalize(''.join(str(t) for t in tokens))
    rev_doc = normalize(rev)

    # Debug dumps; context managers make sure the handles are closed.
    with open('token', 'w') as token_file:
        print(token_doc, file=token_file)
    with open('rev', 'w') as rev_file:
        print(rev_doc, file=rev_file)

    assert token_doc == rev_doc
def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_tokens):
@@ -70,15 +76,59 @@ def test_equality():
diff_processor = matcher.processor()
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev1 + " ")
assert len(ops) == 129
for op in ops[:-1]:
assert isinstance(op, Equal)
assert len(ops) == 258
for op in ops[:-2]:
print(op)
assert isinstance(op, Equal)
# note that the whitespace token does not result in a token according to wikitext_split
# compare the tokens based on the diffs to the baseline
# whitespace differences are allowed
assert_equal_enough(b, rev1)
def test_highlight_range_3():
    """Diff the ``test_highlight_3`` fixture pair and check both token sides.

    The processor is fed the "from" revision and then the "to" revision; the
    two token sequences returned by the second step are compared with the
    corresponding revision texts via ``assert_equal_enough``.
    """
    # Context managers close the fixture files; the encoding is made explicit
    # so the result does not depend on the platform default.
    with open("test/test_diff_revisions/test_highlight_3_from", encoding="utf-8") as from_file:
        rev1 = from_file.read()
    with open("test/test_diff_revisions/test_highlight_3_to", encoding="utf-8") as to_file:
        rev2 = to_file.read()

    matcher = WikiDiffMatcher([rev1, rev2])
    diff_processor = matcher.processor()

    # First call feeds rev1 to the processor; its return value is not used here.
    diff_processor.process(rev1)
    _ops, a, b = diff_processor.process(rev2)

    assert_equal_enough(a, rev1)
    assert_equal_enough(b, rev2)
def test_highlight_range_4():
    """Diff the ``test_highlight_4`` fixture pair and check both token sides.

    The processor is fed the "from" revision and then the "to" revision; the
    two token sequences returned by the second step are compared with the
    corresponding revision texts via ``assert_equal_enough``.
    """
    # Context managers close the fixture files; the encoding is made explicit
    # so the result does not depend on the platform default.
    with open("test/test_diff_revisions/test_highlight_4_from", encoding="utf-8") as from_file:
        rev1 = from_file.read()
    with open("test/test_diff_revisions/test_highlight_4_to", encoding="utf-8") as to_file:
        rev2 = to_file.read()

    matcher = WikiDiffMatcher([rev1, rev2])
    diff_processor = matcher.processor()

    # First call feeds rev1 to the processor; its return value is not used here.
    diff_processor.process(rev1)
    _ops, a, b = diff_processor.process(rev2)

    assert_equal_enough(a, rev1)
    assert_equal_enough(b, rev2)
def test_complex_diff():
    """Diff the ``test_complex`` fixture pair and check both token sides.

    The processor is fed the "from" revision and then the "to" revision; the
    two token sequences returned by the second step are compared with the
    corresponding revision texts via ``assert_equal_enough``.
    """
    # Context managers close the fixture files; the encoding is made explicit
    # so the result does not depend on the platform default.
    with open("test/test_diff_revisions/test_complex_from", encoding="utf-8") as from_file:
        rev1 = from_file.read()
    with open("test/test_diff_revisions/test_complex_to", encoding="utf-8") as to_file:
        rev2 = to_file.read()

    matcher = WikiDiffMatcher([rev1, rev2])
    diff_processor = matcher.processor()

    # First call feeds rev1 to the processor; its return value is not used here.
    diff_processor.process(rev1)
    _ops, a, b = diff_processor.process(rev2)

    assert_equal_enough(a, rev1)
    assert_equal_enough(b, rev2)
def test_highlight_range_unicode():
    """Diff the ``test_unicode_highlight`` fixture pair and check both token sides.

    The processor is fed the "from" revision and then the "to" revision; the
    two token sequences returned by the second step are compared with the
    corresponding revision texts via ``assert_equal_enough``.
    """
    # Context managers close the fixture files. The encoding is explicit so
    # the non-ASCII fixture text is decoded the same way on every platform
    # (assumes the fixtures are stored as UTF-8 -- TODO confirm).
    with open("test/test_diff_revisions/test_unicode_highlight_from", encoding="utf-8") as from_file:
        rev1 = from_file.read()
    with open("test/test_diff_revisions/test_unicode_highlight_to", encoding="utf-8") as to_file:
        rev2 = to_file.read()

    matcher = WikiDiffMatcher([rev1, rev2])
    diff_processor = matcher.processor()

    # First call feeds rev1 to the processor; its return value is not used here.
    diff_processor.process(rev1)
    _ops, a, b = diff_processor.process(rev2)

    assert_equal_enough(a, rev1)
    assert_equal_enough(b, rev2)
def test_highlight_range():
rev1 = open("test/test_diff_revisions/1295229484_rangeedit0").read()
rev2 = open("test/test_diff_revisions/1295229484_rangeedit1").read()
@@ -108,28 +158,38 @@ def test_delete():
n_deleted_tokens = 0
last_b2 = initial_equal_tokens
initial_equal_lines = 2
initial_equal_tokens = 12
initial_equal_lines = 4
initial_equal_tokens = 14
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
first_noninsert_token = initial_equal_tokens
last_non_delete = False
idx = 0
for op in ops[initial_equal_lines:]:
idx += 1
# deletes are interleaved with Equal newlines.
if not isinstance(op, Delete):
first_nondelete_token = op.a1
break
n_deletes += 1
n_deleted_tokens += op.a2 - last_b2
last_b2 = op.a2
if last_non_delete:
first_nondelete_token = op.a1
break
last_non_delete = True
else:
last_non_delete = False
if last_non_delete:
n_deletes += 1
n_deleted_tokens += op.a2 - last_b2
last_b2 = op.a2
assert n_deletes == 2
assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 316
assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
last_b2 = assert_correct_equal_section(ops[initial_equal_lines + n_deletes:],
expected_equal_lines=126,
expected_equal_tokens=9323)
last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
expected_equal_lines=252,
expected_equal_tokens=9765)
# first lets test that we properly build the operations.
@@ -144,8 +204,14 @@ def test_addition():
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
even = True
for op in ops:
assert isinstance(op, Insert)
if even:
assert isinstance(op, Insert)
even = False
else:
assert isinstance(op, Equal)
even = True
assert_equal_enough(b, rev1)
@@ -155,21 +221,26 @@ def test_addition():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
ops = list(ops)
initial_equal_lines = 128
initial_equal_tokens = 9359
initial_equal_lines = 256
initial_equal_tokens = 9487
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
last_non_insert = False
first_noninsert_token = None
n_inserts = 0
n_inserted_tokens = 0
last_b2 = initial_equal_tokens
last_b2 = last_insert_b2 = initial_equal_tokens
idx = 0
print(ops[initial_equal_lines:])
for op in ops[initial_equal_lines:]:
n_inserts += 1
n_inserted_tokens += op.b2 - last_b2
if isinstance(op, Insert):
n_inserts += 1
n_inserted_tokens += op.b2 - op.b1
last_insert_b2 = op.b2
last_b2 = op.b2
assert n_inserted_tokens == last_b2 - initial_equal_tokens == 292
assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
assert n_inserts == 2
def test_paragraph_move():
@@ -195,6 +266,26 @@ def test_paragraph_move_and_change():
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
# slow test
def test_diff_consistency():
    """Replay every revision in the ``sailormoon`` dump through the diff processor.

    The dump at ``test/dumps/sailormoon.xml.7z`` is streamed through ``7za``.
    For each page, every revision with non-empty text is processed in order
    and the resulting "b" tokens are compared with the revision text via
    ``assert_equal_enough``. Before each step the current and previous
    revision texts are written to ``test_unicode_highlight_to`` and
    ``test_unicode_highlight_from`` in the working directory, so those files
    hold the offending pair when an assertion fails.
    """
    from mwxml import Dump

    extract_cmd = ["7za", "x", "-so", "test/dumps/sailormoon.xml.7z", "*.xml"]
    # The context manager closes the pipe and reaps 7za even if a check fails.
    with subprocess.Popen(extract_cmd, stdout=subprocess.PIPE) as extractor:
        dump = Dump.from_file(extractor.stdout)
        for page in dump:
            revisions = [rev.text for rev in page if rev.text]
            matcher = WikiDiffMatcher(revisions)
            diff_processor = matcher.processor()
            last_rev = ""
            for rev in revisions:
                # Debug dumps of the pair about to be diffed; closed right
                # away so handles do not pile up across revisions.
                with open("test_unicode_highlight_to", 'w', encoding="utf-8") as to_file:
                    print(rev, file=to_file)
                with open("test_unicode_highlight_from", 'w', encoding="utf-8") as from_file:
                    print(last_rev, file=from_file)

                _ops, _a, b = diff_processor.process(rev)
                # NOTE: only the "b" side is verified; the matching check of
                # the "a" side against last_rev is currently disabled.
                assert_equal_enough(b, rev)
                last_rev = rev