Almost there; working out edge cases.

This commit is contained in:
Nathan TeBlunthuis
2025-07-03 21:32:44 -07:00
parent cf1fb61a84
commit 4654911533
3 changed files with 345 additions and 231 deletions

View File

@@ -2,7 +2,7 @@
import asyncio
import subprocess
from functools import partial
import re
import pytest
import pytest_asyncio
from typing import List
@@ -30,11 +30,10 @@ async def start_stop_server():
def assert_equal_enough(tokens:List[Token], rev):
# the tokens exclude newlines
# we allow extra whitespace at the beginning or end
# NOTE(review): this span is a diff hunk with the +/- markers stripped.
# The next five lines are the OLD implementation (collapse repeated
# newlines, strip ends), removed by this commit:
token_doc = ''.join(str(t) for t in tokens).strip()
while '\n\n' in token_doc:
token_doc = token_doc.replace('\n\n','\n')
while '\n\n' in rev:
rev = rev.replace('\n\n','\n').strip()
# NEW implementation added by this commit: normalize ALL runs of
# whitespace to a single space on both sides before comparing
# (matches the `import re` added at the top of the file):
token_doc = ''.join(str(t) for t in tokens)
token_doc = re.sub(r'\s+', ' ', token_doc).strip()
rev = re.sub(r'\s+', ' ', rev).strip()
# NOTE(review): debug dumps below open files without closing them
# (leaked handles) and clobber ./token and ./rev on every call —
# consider removing before merge, or using `with open(...)`.
print(token_doc, file = open('token','w'))
print(rev, file = open('rev','w'))
assert token_doc == rev
@@ -62,7 +61,6 @@ def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_token
# if the last line is an equal
if first_unequal_token is None:
first_unequal_token = ops[-1].b2
assert n_equal_lines == expected_equal_lines
# check that there are no gaps and the number is as expected
@@ -76,9 +74,8 @@ def test_equality():
diff_processor = matcher.processor()
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev1 + " ")
assert len(ops) == 258
assert len(ops) == 257
for op in ops[:-2]:
print(op)
assert isinstance(op, Equal)
# note that the whitespace token does not result in a token according to wikitext_split
@@ -152,44 +149,48 @@ def test_delete():
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)
initial_equal_tokens = 0
first_nondelete_token = None
n_deletes = 0
n_deleted_tokens = 0
last_b2 = initial_equal_tokens
initial_equal_lines = 256
initial_equal_tokens = 9911
for i, op in enumerate(ops):
if initial_equal_lines > 0:
assert isinstance(op, Equal)
else:
break
initial_equal_lines -= 1
assert initial_equal_lines == 0
assert ops[i-1].a2 - ops[0].a1 == initial_equal_tokens
initial_equal_lines = 4
initial_equal_tokens = 14
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
first_noninsert_token = initial_equal_tokens
last_non_delete = False
last_delete = False
last_insert = False
idx = 0
n_non_delete = 0
last_delete_idx = 0
for op in ops[initial_equal_lines:]:
idx += 1
# deletes are interleaved with Equal newlines.
if not isinstance(op, Delete):
if last_non_delete:
first_nondelete_token = op.a1
break
last_non_delete = True
else:
last_non_delete = False
if last_non_delete:
if isinstance(op, Delete):
n_deletes += 1
n_deleted_tokens += op.a2 - last_b2
last_b2 = op.a2
assert n_deletes == 2
assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
n_deleted_tokens += op.a2 - op.a1
last_delete = True
last_delete_idx = idx
# we need to add back a newline when we have a delete
else:
n_non_delete += 1
if not last_delete and first_nondelete_token is None:
first_nondelete_token = op.a1
last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
expected_equal_lines=252,
expected_equal_tokens=9765)
if n_non_delete:
last_b2 = op.b2
assert n_deletes == 4
assert n_deleted_tokens == 320
assert idx == len(ops)
# first lets test that we properly build the operations.
@@ -204,14 +205,8 @@ def test_addition():
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
even = True
for op in ops:
if even:
assert isinstance(op, Insert)
even = False
else:
assert isinstance(op, Equal)
even = True
assert isinstance(op, Insert)
assert_equal_enough(b, rev1)
@@ -221,8 +216,8 @@ def test_addition():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
ops = list(ops)
initial_equal_lines = 256
initial_equal_tokens = 9487
initial_equal_lines = 255
initial_equal_tokens = 9614
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
@@ -232,16 +227,21 @@ def test_addition():
n_inserted_tokens = 0
last_b2 = last_insert_b2 = initial_equal_tokens
idx = 0
print(ops[initial_equal_lines:])
last_insert = False
for op in ops[initial_equal_lines:]:
if isinstance(op, Insert):
n_inserts += 1
n_inserted_tokens += op.b2 - op.b1
last_insert_b2 = op.b2
last_insert = True
elif last_insert:
assert isinstance(op, Equal)
last_b2 = op.b2
assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
assert n_inserts == 2
assert n_inserted_tokens == last_insert_b2 - initial_equal_tokens == 296
assert n_inserts == 4
def test_paragraph_move():
rev1 = open("test/test_diff_revisions/1295229484").read()
@@ -269,6 +269,63 @@ def test_paragraph_move_and_change():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
def test_infobox():
    """Diff two infobox fixture revisions and check that the token
    streams reconstructed from the diff match both revision texts
    (up to whitespace, per assert_equal_enough)."""
    before = open("test/test_diff_revisions/test_infobox_from").read()
    after = open("test/test_diff_revisions/test_infobox_to").read()
    processor = WikiDiffMatcher([before, after]).processor()
    # a and b are rebuilt from the diffs, so they reflect the text as
    # the diff processor understands it. The processor appears to be
    # consumed one revision at a time (same pattern as the other
    # tests): advance past the first revision, keep the second's ops.
    ops, a, b = processor.process(before)
    ops, a, b = processor.process(after)
    assert_equal_enough(b, after)
    assert_equal_enough(a, before)
def test_leading_whitespace():
    """Regression test for revisions that differ in leading whitespace:
    the diff-reconstructed token streams must still match both sides."""
    src_path = "test/test_diff_revisions/test_leading_ws_from"
    dst_path = "test/test_diff_revisions/test_leading_ws_to"
    rev_from = open(src_path).read()
    rev_to = open(dst_path).read()
    matcher = WikiDiffMatcher([rev_from, rev_to])
    processor = matcher.processor()
    # a and b come from the diffs themselves; process both revisions in
    # order (the processor is presumably stateful — mirrors the other
    # tests) and validate the final reconstruction against each side.
    ops, a, b = processor.process(rev_from)
    ops, a, b = processor.process(rev_to)
    assert_equal_enough(b, rev_to)
    assert_equal_enough(a, rev_from)
# def test_whitespace_2():
# rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
# rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
# matcher = WikiDiffMatcher([rev1,rev2])
# diff_processor = matcher.processor()
# # note that a and b are constructed from the diffs.
# # so they reflect the state of the text according to the diff processor
# ops, a, b = diff_processor.process(rev1)
# ops, a, b = diff_processor.process(rev2)
# assert_equal_enough(b, rev2)
# assert_equal_enough(a, rev1)
def test_actually_equal():
    """A revision diffed against itself should yield exactly one Equal
    operation spanning the whole document."""
    text = open("test/test_diff_revisions/1285792388").read()
    # NOTE(review): the original comment said whitespace is appended
    # because exact identity reverts produce no diffs, but both entries
    # below are the unmodified revision — confirm which is intended.
    processor = WikiDiffMatcher([text, text]).processor()
    # Consume both (identical) revisions in sequence, as elsewhere.
    ops, a, b = processor.process(text)
    ops, a, b = processor.process(text)
    assert len(ops) == 1
    assert isinstance(ops[0], Equal)
    # Compare diff-derived tokens to the baseline text; whitespace-only
    # differences are tolerated by assert_equal_enough.
    assert_equal_enough(b, text)
    assert_equal_enough(a, text)
# slow test
def test_diff_consistency():
from mwxml import Dump