Almost there; working out edge cases.

This commit is contained in:
Nathan TeBlunthuis
2025-07-03 21:32:44 -07:00
parent cf1fb61a84
commit 4654911533
3 changed files with 345 additions and 231 deletions

View File

@@ -2,7 +2,7 @@
import asyncio
import subprocess
from functools import partial
import re
import pytest
import pytest_asyncio
from typing import List
@@ -30,11 +30,10 @@ async def start_stop_server():
def assert_equal_enough(tokens:List[Token], rev):
# the tokens exclude newlines
# we allow extra whitespace at the beginning or end
# NOTE(review): this span is a diff hunk with the +/- markers stripped.
# The next five lines are the OLD implementation (collapse repeated
# newlines, strip ends), removed by this commit:
token_doc = ''.join(str(t) for t in tokens).strip()
while '\n\n' in token_doc:
token_doc = token_doc.replace('\n\n','\n')
while '\n\n' in rev:
rev = rev.replace('\n\n','\n').strip()
# NEW implementation added by this commit: normalize ALL runs of
# whitespace to a single space on both sides before comparing
# (matches the `import re` added at the top of the file):
token_doc = ''.join(str(t) for t in tokens)
token_doc = re.sub(r'\s+', ' ', token_doc).strip()
rev = re.sub(r'\s+', ' ', rev).strip()
# NOTE(review): debug dumps below open files without closing them
# (leaked handles) and clobber ./token and ./rev on every call —
# consider removing before merge, or using `with open(...)`.
print(token_doc, file = open('token','w'))
print(rev, file = open('rev','w'))
assert token_doc == rev
@@ -62,7 +61,6 @@ def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_token
# if the last line is an equal
if first_unequal_token is None:
first_unequal_token = ops[-1].b2
assert n_equal_lines == expected_equal_lines
# check that there are no gaps and the number is as expected
@@ -76,9 +74,8 @@ def test_equality():
diff_processor = matcher.processor()
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev1 + " ")
assert len(ops) == 258
assert len(ops) == 257
for op in ops[:-2]:
print(op)
assert isinstance(op, Equal)
# note that the whitespace token does not result in a token according to wikitext_split
@@ -152,44 +149,48 @@ def test_delete():
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)
initial_equal_tokens = 0
first_nondelete_token = None
n_deletes = 0
n_deleted_tokens = 0
last_b2 = initial_equal_tokens
initial_equal_lines = 256
initial_equal_tokens = 9911
for i, op in enumerate(ops):
if initial_equal_lines > 0:
assert isinstance(op, Equal)
else:
break
initial_equal_lines -= 1
assert initial_equal_lines == 0
assert ops[i-1].a2 - ops[0].a1 == initial_equal_tokens
initial_equal_lines = 4
initial_equal_tokens = 14
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
first_noninsert_token = initial_equal_tokens
last_non_delete = False
last_delete = False
last_insert = False
idx = 0
n_non_delete = 0
last_delete_idx = 0
for op in ops[initial_equal_lines:]:
idx += 1
# deletes are interleaved with Equal newlines.
if not isinstance(op, Delete):
if last_non_delete:
first_nondelete_token = op.a1
break
last_non_delete = True
else:
last_non_delete = False
if last_non_delete:
if isinstance(op, Delete):
n_deletes += 1
n_deleted_tokens += op.a2 - last_b2
last_b2 = op.a2
assert n_deletes == 2
assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
n_deleted_tokens += op.a2 - op.a1
last_delete = True
last_delete_idx = idx
# we need to add back a newline when we have a delete
else:
n_non_delete += 1
if not last_delete and first_nondelete_token is None:
first_nondelete_token = op.a1
last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
expected_equal_lines=252,
expected_equal_tokens=9765)
if n_non_delete:
last_b2 = op.b2
assert n_deletes == 4
assert n_deleted_tokens == 320
assert idx == len(ops)
# first lets test that we properly build the operations.
@@ -204,14 +205,8 @@ def test_addition():
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
even = True
for op in ops:
if even:
assert isinstance(op, Insert)
even = False
else:
assert isinstance(op, Equal)
even = True
assert isinstance(op, Insert)
assert_equal_enough(b, rev1)
@@ -221,8 +216,8 @@ def test_addition():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
ops = list(ops)
initial_equal_lines = 256
initial_equal_tokens = 9487
initial_equal_lines = 255
initial_equal_tokens = 9614
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
@@ -232,16 +227,21 @@ def test_addition():
n_inserted_tokens = 0
last_b2 = last_insert_b2 = initial_equal_tokens
idx = 0
print(ops[initial_equal_lines:])
last_insert = False
for op in ops[initial_equal_lines:]:
if isinstance(op, Insert):
n_inserts += 1
n_inserted_tokens += op.b2 - op.b1
last_insert_b2 = op.b2
last_insert = True
elif last_insert:
assert isinstance(op, Equal)
last_b2 = op.b2
assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
assert n_inserts == 2
assert n_inserted_tokens == last_insert_b2 - initial_equal_tokens == 296
assert n_inserts == 4
def test_paragraph_move():
rev1 = open("test/test_diff_revisions/1295229484").read()
@@ -269,6 +269,63 @@ def test_paragraph_move_and_change():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
def test_infobox():
    """Diff two infobox fixture revisions and check that the token
    streams reconstructed from the diff match both revision texts
    (up to whitespace, per assert_equal_enough)."""
    before = open("test/test_diff_revisions/test_infobox_from").read()
    after = open("test/test_diff_revisions/test_infobox_to").read()
    processor = WikiDiffMatcher([before, after]).processor()
    # a and b are rebuilt from the diffs, so they reflect the text as
    # the diff processor understands it. The processor appears to be
    # consumed one revision at a time (same pattern as the other
    # tests): advance past the first revision, keep the second's ops.
    ops, a, b = processor.process(before)
    ops, a, b = processor.process(after)
    assert_equal_enough(b, after)
    assert_equal_enough(a, before)
def test_leading_whitespace():
    """Regression test for revisions that differ in leading whitespace:
    the diff-reconstructed token streams must still match both sides."""
    src_path = "test/test_diff_revisions/test_leading_ws_from"
    dst_path = "test/test_diff_revisions/test_leading_ws_to"
    rev_from = open(src_path).read()
    rev_to = open(dst_path).read()
    matcher = WikiDiffMatcher([rev_from, rev_to])
    processor = matcher.processor()
    # a and b come from the diffs themselves; process both revisions in
    # order (the processor is presumably stateful — mirrors the other
    # tests) and validate the final reconstruction against each side.
    ops, a, b = processor.process(rev_from)
    ops, a, b = processor.process(rev_to)
    assert_equal_enough(b, rev_to)
    assert_equal_enough(a, rev_from)
# def test_whitespace_2():
# rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
# rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
# matcher = WikiDiffMatcher([rev1,rev2])
# diff_processor = matcher.processor()
# # note that a and b are constructed from the diffs.
# # so they reflect the state of the text according to the diff processor
# ops, a, b = diff_processor.process(rev1)
# ops, a, b = diff_processor.process(rev2)
# assert_equal_enough(b, rev2)
# assert_equal_enough(a, rev1)
def test_actually_equal():
    """A revision diffed against itself should yield exactly one Equal
    operation spanning the whole document."""
    text = open("test/test_diff_revisions/1285792388").read()
    # NOTE(review): the original comment said whitespace is appended
    # because exact identity reverts produce no diffs, but both entries
    # below are the unmodified revision — confirm which is intended.
    processor = WikiDiffMatcher([text, text]).processor()
    # Consume both (identical) revisions in sequence, as elsewhere.
    ops, a, b = processor.process(text)
    ops, a, b = processor.process(text)
    assert len(ops) == 1
    assert isinstance(ops[0], Equal)
    # Compare diff-derived tokens to the baseline text; whitespace-only
    # differences are tolerated by assert_equal_enough.
    assert_equal_enough(b, text)
    assert_equal_enough(a, text)
# slow test
def test_diff_consistency():
from mwxml import Dump