almost there. working out edge cases.

2025-07-03 21:32:44 -07:00 · 2025-07-03 21:32:44 -07:00 · 4654911533
commit 4654911533
parent cf1fb61a84
3 changed files with 345 additions and 231 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,6 +12,7 @@ dependencies = [
    "mwtypes>=0.4.0",
    "mwxml>=0.3.6",
    "pyarrow>=20.0.0",
+    "sortedcontainers>=2.4.0",
    "yamlconf>=0.2.6",
 ]

@ -22,6 +23,7 @@ deltas = { git = "https://github.com/groceryheist/deltas" }

 [dependency-groups]
 dev = [
+    "ipython>=8.18.1",
    "pandas>=2.1.0",
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
--- a/test/test_wiki_diff_matcher.py
+++ b/test/test_wiki_diff_matcher.py
@ -2,7 +2,7 @@
 import asyncio
 import subprocess
 from functools import partial
-
+import re
 import pytest
 import pytest_asyncio
 from typing import List
@ -30,11 +30,10 @@ async def start_stop_server():
 def assert_equal_enough(tokens:List[Token], rev):
    # the tokens exclude newlines
    # we allow extra whitespace at the beginning or end
-    token_doc = ''.join(str(t) for t in tokens).strip()
-    while '\n\n' in token_doc:
-        token_doc = token_doc.replace('\n\n','\n')
-    while '\n\n' in rev:
-        rev = rev.replace('\n\n','\n').strip()
+    token_doc = ''.join(str(t) for t in tokens)
+    token_doc = re.sub(r'\s+', ' ', token_doc).strip()
+    rev = re.sub(r'\s+', ' ', rev).strip()
+
    print(token_doc, file = open('token','w'))
    print(rev, file = open('rev','w'))
    assert token_doc == rev
@ -63,7 +62,6 @@ def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_token
    if first_unequal_token is None:
        first_unequal_token = ops[-1].b2
    
-    
    assert n_equal_lines == expected_equal_lines
    # check that there are no gaps and the number is as expected
    assert initial_equal_tokens == last_b2 - ops[0].b1 == first_unequal_token - ops[0].b1 == expected_equal_tokens
@ -76,9 +74,8 @@ def test_equality():
    diff_processor = matcher.processor()
    ops, a, b = diff_processor.process(rev1)
    ops, a, b = diff_processor.process(rev1 + " ")
-    assert len(ops) == 258
+    assert len(ops) == 257
    for op in ops[:-2]:
-        print(op)
        assert isinstance(op, Equal)

    # note that the whitespace token does not result in a token according to wikitext_split
@ -152,44 +149,48 @@ def test_delete():
    assert_equal_enough(b, rev2)
    assert_equal_enough(a, rev1)

-    initial_equal_tokens = 0
    first_nondelete_token = None
    n_deletes = 0
    n_deleted_tokens = 0
-    last_b2 = initial_equal_tokens
+    initial_equal_lines = 256
+    initial_equal_tokens = 9911
+    for i, op in enumerate(ops):
+        if initial_equal_lines > 0:
+            assert isinstance(op, Equal)
+        else:
+            break
+        initial_equal_lines -= 1
+
+    assert initial_equal_lines == 0
+    assert ops[i-1].a2 - ops[0].a1 == initial_equal_tokens

-    initial_equal_lines = 4
-    initial_equal_tokens = 14
-    last_b2 = assert_correct_equal_section(ops,
-                                           expected_equal_lines=initial_equal_lines,
-                                           expected_equal_tokens=initial_equal_tokens)
    first_noninsert_token = initial_equal_tokens

-
-    last_non_delete = False
+    last_delete = False
+    last_insert = False
    idx = 0
+    n_non_delete = 0
+
+    last_delete_idx = 0
    for op in ops[initial_equal_lines:]:
        idx += 1
-        # deletes are interleaved with Equal newlines.
-        if not isinstance(op, Delete):
-            if last_non_delete:
-                first_nondelete_token = op.a1
-                break
-            last_non_delete = True
-        else:
-            last_non_delete = False
-        if last_non_delete:
+        if isinstance(op, Delete):
            n_deletes += 1
-            n_deleted_tokens += op.a2 - last_b2
-            last_b2 = op.a2
+            n_deleted_tokens += op.a2 - op.a1
+            last_delete = True
+            last_delete_idx = idx
+        # we need to add back a newline when we have a delete
+        else:
+            n_non_delete += 1
+            if not last_delete and first_nondelete_token is None:
+                first_nondelete_token = op.a1

-    assert n_deletes == 2
-    assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
+        if n_non_delete:
+            last_b2 = op.b2
            
-
-    last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
-                                           expected_equal_lines=252,
-                                           expected_equal_tokens=9765)
+    assert n_deletes == 4
+    assert n_deleted_tokens == 320
+    assert idx == len(ops)


 # first lets test that we properly build the operations. 
@ -204,14 +205,8 @@ def test_addition():
    # so they reflect the state of the text according to the diff processor
    ops, a, b = diff_processor.process(rev1)

-    even = True
    for op in ops:
-        if even:
-            assert isinstance(op, Insert)
-            even = False
-        else:
-            assert isinstance(op, Equal)
-            even = True
+        assert isinstance(op, Insert)

    assert_equal_enough(b, rev1)
    
@ -221,8 +216,8 @@ def test_addition():
    assert_equal_enough(a, rev1)
    assert_equal_enough(b, rev2)
    ops = list(ops)
-    initial_equal_lines = 256
-    initial_equal_tokens = 9487
+    initial_equal_lines = 255
+    initial_equal_tokens = 9614
    last_b2 = assert_correct_equal_section(ops,
                                           expected_equal_lines=initial_equal_lines,
                                           expected_equal_tokens=initial_equal_tokens)
@ -232,16 +227,21 @@ def test_addition():
    n_inserted_tokens = 0
    last_b2 = last_insert_b2 = initial_equal_tokens
    idx = 0
-    print(ops[initial_equal_lines:])
+
+    last_insert = False
    for op in ops[initial_equal_lines:]:
        if isinstance(op, Insert):
            n_inserts += 1
            n_inserted_tokens += op.b2 - op.b1
            last_insert_b2 = op.b2
+            last_insert = True
+        elif last_insert:
+            assert isinstance(op, Equal)
+        
        last_b2 = op.b2

-    assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
-    assert n_inserts == 2
+    assert n_inserted_tokens == last_insert_b2 - initial_equal_tokens == 296
+    assert n_inserts == 4

 def test_paragraph_move():
    rev1 = open("test/test_diff_revisions/1295229484").read()
@ -269,6 +269,63 @@ def test_paragraph_move_and_change():
    assert_equal_enough(a, rev1)
    assert_equal_enough(b, rev2)

+def test_infobox():
+    rev1 = open("test/test_diff_revisions/test_infobox_from").read()
+    rev2 = open("test/test_diff_revisions/test_infobox_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+
+    # note that a and b are constructed from the diffs.
+    # so they reflect the state of the text according to the diff processor
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
+    assert_equal_enough(a, rev1)
+
+def test_leading_whitespace():
+    rev1 = open("test/test_diff_revisions/test_leading_ws_from").read()
+    rev2 = open("test/test_diff_revisions/test_leading_ws_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+
+    # note that a and b are constructed from the diffs.
+    # so they reflect the state of the text according to the diff processor
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
+    assert_equal_enough(a, rev1)
+
+# def test_whitespace_2():
+#     rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
+#     rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
+#     matcher = WikiDiffMatcher([rev1,rev2])
+#     diff_processor = matcher.processor()
+
+#     # note that a and b are constructed from the diffs.
+#     # so they reflect the state of the text according to the diff processor
+#     ops, a, b = diff_processor.process(rev1)
+#     ops, a, b = diff_processor.process(rev2)
+#     assert_equal_enough(b, rev2)
+#     assert_equal_enough(a, rev1)
+
+
+
+def test_actually_equal():
+    rev1 = open("test/test_diff_revisions/1285792388").read()
+    # no whitespace is added here: the identical text is processed twice, because exact identity reverts do not result in diffs.
+    matcher = WikiDiffMatcher([rev1,rev1])
+    diff_processor = matcher.processor()
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev1)
+    assert len(ops) == 1
+    assert isinstance(ops[0], Equal)
+
+    # note that both revisions are identical here, so no whitespace token is involved
+    # compare the tokens based on the diffs to the baseline
+    # whitespace differences are allowed
+    assert_equal_enough(b, rev1)
+    assert_equal_enough(a, rev1)
+    
 # slow test
 def test_diff_consistency():
    from mwxml import Dump
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@ -3,25 +3,37 @@ import sys
 from collections import namedtuple
 from itertools import chain
 from typing import Dict, Generator, List, Optional, Tuple
-
+from sortedcontainers import SortedDict
 import requests
-from deltas import (Delete, DiffEngine, Equal, Insert, Operation, Token,
-                    RegexTokenizer, tokenizers)
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
+                    RegexTokenizer, Token, tokenizers)

 TOKENIZER = tokenizers.wikitext_split

+# def find_greatest_le_key(target_key, data_dict):
+#     found_key = None
+#     for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
+#         if key <= target_key:
+#             found_key = (
+#                 key  # This is the largest key found so far that satisfies the condition
+#             )
+#         else:
+#             # Since the dictionary is sorted, if key > target_key,
+#             # all subsequent keys will also be > target_key.
+#             return found_key or key
+
+# def find_smallest_gt_key(target_key, data_dict):
+#     found_key = None
+#     for key in reversed(data_dict):  # Iterates over keys in insertion order (which is sorted)
+#         if key >= target_key:
+#             found_key = (
+#                 key  # This is the largest key found so far that satisfies the condition
+#             )
+#         else:
+#             # Since the dictionary is sorted, if key > target_key,
+#             # all subsequent keys will also be > target_key.
+#             return found_key or key

-def find_greatest_le_key(target_key, data_dict):
-    found_key = None
-    for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
-        if key <= target_key:
-            found_key = (
-                key  # This is the largest key found so far that satisfies the condition
-            )
-        else:
-            # Since the dictionary is sorted, if key > target_key,
-            # all subsequent keys will also be > target_key.
-            return found_key or key


 def compute_diffs(url: str, texts: list[str]) -> list:
@ -61,55 +73,94 @@ class DiffToOperationMap:
        self.diff = json.loads(diff)

        # the code below is designed to work in bytes because that's how wikidiff2 indexes
-        self.from_last_end_bytes = 0
-        self.from_last_to_bytes = 0
-        self.n_from_start_tokens = 0
-        self.n_from_end_tokens = 0
-        self.n_from_start_tokens = 0
-        self.n_to_start_tokens = 0
-        self.last_to_start_line = 0
-        self.last_from_start_line = 0
-        self.from_last_end_bytes = 0
-        self.to_last_end_bytes = 0
+        # self.from_last_end_bytes = 0
+        # self.from_last_to_bytes = 0
+        # self.n_from_start_tokens = 0
+        # self.n_from_end_tokens = 0
+        # self.n_from_start_tokens = 0
+        # self.n_to_start_tokens = 0
+        # self.from_last_end_bytes = 0
+        # self.to_last_end_bytes = 0
        # keeps track of the number of tokens seen so far
        # to avoid repeated tokenization
-        self.from_byte_token_index_map: Dict[int, int] = {}
-        self.to_byte_token_index_map: Dict[int, int] = {}
+        # self.from_byte_token_index_map: SortedDict[int, int] = SortedDict()
+        # self.to_byte_token_index_map: SortedDict[int, int] = SortedDict()
        self.par_move_dict = {}

        # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
-        self.to_linenumber_bytes_map = {}
+        self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
+        self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
+    # def get_token_offset(self, byte_offset):
+    #     from_token_start = None
+    #     to_token_start = None
+    #     from_last_end_bytes = self.from_byte_token_index_map.keys()[-1]
+    #     to_last_end_bytes = self.to_byte_token_index_map.keys()[-1]
+    #     if byte_offset['from'] is not None:
+    #         if byte_offset['from'] < self.from_byte_token_index_map.values()[0]:
+    #             from_token_start = 0
+    #         else:
+    #             key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from'])
+    #         # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below?
+    #         if key > from_last_end_bytes:
+    #             from_token_start = self.from_byte_token_index_map[from_last_end_bytes]
+    #         else:
+    #             from_token_
+    #     if byte_offset['to'] is not None:
+    #         if byte_offset['to'] < self.to_byte_token_index_map.values()[0]:
+    #             to_token_start = 0
+    #         else:
+    #             key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to'])
+    #         if key >= from
+    #         if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0:
+    #             if (
+    #                     byte_offset['from'] >= self.from_last_end_bytes
+    #             ):  # if the from paragraph is at the end
+    #                 from_token_start = next(
+    #                     reversed(self.from_byte_token_index_map.values())
+    #                 )
+    #             else:
+    #                 key = find_greatest_le_key(
+    #                 byte_offset['from'], self.from_byte_token_index_map
+    #                 )
+    #                 from_token_start = self.from_byte_token_index_map[key]
+    #         else:
+    #             from_token_start = 0
+
+    #     to_offset = None
+    #     if byte_offset['to'] is not None:
+    #         if len(self.to_byte_token_index_map) > 0:
+    #             if to_byte_start >= self.to_last_end_bytes:
+    #                 to_token_start = next(reversed(self.to_byte_token_index_map.values()))
+    #             else:
+    #                 key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map)
+    #                 to_token_start = self.to_byte_token_index_map[key]
+    #         else:
+    #             to_token_start = 0
+
+    #     return {'from': from_token_start,
+    #             'to': to_token_start}

    def tokenize(self, bytes):
        return self.tokenizer.tokenize(bytes.decode("utf-8"))

-    def newline_result(self):
-        self.n_from_end_tokens += 1
-        self.n_from_start_tokens += 1
-        self.n_to_end_tokens += 1
-        self.n_to_start_tokens +=1
-        
-        return (Equal(self.n_from_start_tokens - 1,
-                     self.n_from_end_tokens,
-                     self.n_to_start_tokens - 1,
-                     self.n_from_start_tokens),
-                [Token('\n')],
-                [Token('\n')])
-                     
-    
    def to_operations(self):
-        parmoves = []
-        [print(diff) for diff in self.diff["diff"][0:5]]
+
        for entry in self.diff["diff"]:
+
+            entry["text"] += "\n"
+            text = entry["text"]
            offset = entry["offset"]
+            if offset["from"] and entry.get("lineNumber") is not None :
+                if entry['type'] in [0, 2, 3, 4]:
+                    self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode())

            if offset["to"]:
-                self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"]
+                if entry['type'] in [0, 1, 3, 5]:
+                    self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode())
+
+
+            # add back the newline

-            text = entry["text"]
-            # ignore empty diffs. They don't have any tokens
-            if len(text) == 0:
-                continue
            # this is the first byte of the line in the 'from' revision.
            from_start_line = entry["offset"]["from"]
            # this is the first byte of the line in the 'to' revision.
@ -117,23 +168,19 @@ class DiffToOperationMap:
            
            if entry["type"] == 0:
                yield from self.doEqual(text, offset)
-                yield self.newline_result()

                # a line included in the 'to' revision, but not in the 'from' revision
            elif entry["type"] == 1:
                yield from self.doInsert(text, offset)
-                yield self.newline_result()

                # a line included in the 'from' revision, but not in the 'to' revision
            elif entry["type"] == 2:
                yield from self.doDelete(text, offset)
-                yield self.newline_result()                

            elif entry["type"] == 3:
                yield from self.doHighlightRange(
                    text, entry["highlightRanges"], offset, entry["lineNumber"]
                )
-                yield self.newline_result()

            elif entry["type"] == 4:
                self.par_move_dict[entry["moveInfo"]["id"]] = entry
@ -141,34 +188,37 @@ class DiffToOperationMap:
                linkId = entry["moveInfo"]["linkId"]
                if linkId in self.par_move_dict:
                    yield from self.doParMove(entry, self.par_move_dict[linkId])
-                    yield self.newline_result()
-                else:
-                    # we need to count the tokens in the from revision so token index is correct
-                    self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
-                    self.n_from_start_tokens += len(
-                        self.tokenize(entry["text"].encode())
-                    )
+
+                # we need to count the tokens in the from revision so token index is correct
+                # self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
+                # self.n_from_start_tokens += len(
+                #     self.tokenize(entry["text"].encode())
+                # )
 
            elif entry["type"] == 5:
                linkId = entry["moveInfo"]["linkId"]
                if linkId in self.par_move_dict:
                    yield from self.doParMove(self.par_move_dict[linkId], entry)
-                    yield self.newline_result()
                else:
                    self.par_move_dict[entry["moveInfo"]["id"]] = entry
                    # call doHighlightRange just to update the token indices
-                    offset = {
-                        "from": self.n_from_end_tokens,
-                        "to": entry["offset"]["to"],
-                    }
-                    res = self.doHighlightRange(
-                        entry["text"],
-                        entry["highlightRanges"],
-                        offset,
-                        entry["lineNumber"],
-                        update_idx="to",
-                    )
-                    list(res)
+                    # offset = {
+                    #     "from": self.n_from_end_tokens,
+                    #     "to": entry["offset"]["to"],
+                    # }
+                    # res = self.doHighlightRange(
+                    #     entry["text"],
+                    #     entry["highlightRanges"],
+                    #     offset,
+                    #     entry["lineNumber"],
+                    #     update_idx="to",
+                    # )
+                    # list(res)
+                # self.n_to_end_tokens += len(self.tokenize(entry["text"].encode()))
+                # self.n_to_start_tokens += len(
+                #     self.tokenize(entry["text"].encode())
+                # )
+
            else:
                # The 'type' isn't one of the known
                raise ValueError(d)
@ -180,99 +230,100 @@ class DiffToOperationMap:
        # strictly increasing, while the "from" segments should merely be
        # non-overlapping.

-    def doEqual(self, equal_segment, offset, update_idx="all", type=str):
-        if type is str:
+    def doEqual(self, equal_segment, offset, update_idx="all"):
+        # if from_token_start is None:
+        #     from_token_start = self.n_from_start_tokens
+        # if to_token_start is None:
+        #     to_token_start = self.n_to_start_tokens
+
+        if isinstance(equal_segment, str):
            equal_bytes = equal_segment.encode()
-        elif type is bytes:
+        elif isinstance(equal_segment, bytes):
            equal_bytes = equal_segment
        else:
            raise ValueError(equal_segment)

        tokens = self.tokenize(equal_bytes)
        n_tokens = len(tokens)
-        n_from_end_tokens = self.n_from_start_tokens + n_tokens
-        n_to_end_tokens = self.n_to_start_tokens + n_tokens
-        # we need to keep track of the to and from last end bytes
-        self.from_last_end_bytes = offset["from"] + len(equal_bytes)
-        self.to_last_end_bytes = offset["to"] + len(equal_bytes)
+
+        # token_offset = self.get_token_offset(offset)
+
+        # n_from_end_tokens = token_offset['from'] + n_tokens
+        # n_to_end_tokens = token_offset['to'] + n_tokens
+
        yield (
            Equal(
-                self.n_from_start_tokens,
-                n_from_end_tokens,
-                self.n_to_start_tokens,
-                n_to_end_tokens,
+                offset['from'],
+                None,
+                offset['to'],
+                None,
            ),
            tokens,
            tokens,
        )

-        if update_idx in ["from", "all"]:
-            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
+        # if update_idx in ["from", "all"]:
+        #     self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens

-        if update_idx in ["to", "all"]:
-            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
+        # if update_idx in ["to", "all"]:
+        #     self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens

-        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
-        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
+        # self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
+        # self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens

-    def doInsert(self, insert_segment, offset, update_idx="all", type=str):
-        if type is str:
+    def doInsert(self, insert_segment, offset, update_idx="all"):
+        if isinstance(insert_segment, str):
            insert_bytes = insert_segment.encode()
-        elif type is bytes:
+        elif isinstance(insert_segment, bytes):
            insert_bytes = insert_segment
        else:
            raise ValueError(insert_segment)
        tokens = self.tokenize(insert_bytes)
-        n_tokens = len(tokens)
-        n_to_end_tokens = self.n_to_start_tokens + n_tokens
-        self.to_last_end_bytes = offset["to"] + len(insert_bytes)
+        # n_tokens = len(tokens)
+        # token_offset = self.get_token_offset(offset)
+        # n_to_end_tokens = token_offset['to'] + n_tokens
        yield (
            Insert(
-                self.n_from_start_tokens,
-                self.n_from_start_tokens,
-                self.n_to_start_tokens,
-                n_to_end_tokens,
+                None,
+                None,
+                offset['to'],
+                None,
            ),
            [],
            tokens,
        )
        # We have now used more of the "to" tokens.
-        if update_idx in ["to", "all"]:
-            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
-
-        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
+        #self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens

    def doDelete(self, delete_segment, offset, update_idx="all", type=str):
-        if type is str:
+        if isinstance(delete_segment, str):
            delete_bytes = delete_segment.encode()
-        elif type is bytes:
+        elif isinstance(delete_segment, bytes):
            delete_bytes = delete_segment
        else:
            raise ValueError(delete_segment)
        tokens = self.tokenize(delete_bytes)
-        n_tokens = len(tokens)
+        # n_tokens = len(tokens)
+
+        # token_offset = self.get_token_offset(offset)
+        # n_from_end_tokens = token_offset['from'] + n_tokens

-        n_from_end_tokens = self.n_from_start_tokens + n_tokens
-        self.from_last_end_bytes = offset["from"] + len(delete_bytes)
        yield (
            Delete(
-                self.n_from_start_tokens,
-                n_from_end_tokens,
-                self.n_to_start_tokens,
-                self.n_to_start_tokens,
+                offset['from'],
+                None,
+                None,
+                None
            ),
            tokens,
            [],
        )
-        # We have now used more of the "from" tokens.
-        if update_idx in ["from", "all"]:
-            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens

-        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
+        #self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens

    def doHighlightRange(
-        self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"
-    ):
+            self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"):
+
        # The text field is an overlapping mix of both the from and to,
        # so we need to handle it highlight-by-highlight.
        # there can be gaps between highlight segments.
@ -283,11 +334,15 @@ class DiffToOperationMap:

        # it's possible for offset['to'] to be null.
        # we can get it from the line number?
-
+        update_linenumber_map = True
        if offset["to"] is None:
-            offset["to"] = self.from_byte_token_index_map[
-                find_greatest_le_key(lineNumber, self.from_byte_token_index_map)
-            ]
+            keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
+            if keyidx > 0:
+                print(self.to_linenumber_bytes_map)
+                key = self.to_linenumber_bytes_map.keys()[keyidx]
+                offset["to"] = self.to_linenumber_bytes_map[key]
+            else:
+                offset["to"] = 0

        highlight_offset = offset
        # note that diffs are token-level, but the indexes are byte-level
@ -299,10 +354,12 @@ class DiffToOperationMap:
                equal_bytes = highlight_bytes[highlight_end:highlight_start]
                n_equal_bytes = len(equal_bytes)
                yield from self.doEqual(
-                    equal_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                    equal_bytes, highlight_offset, update_idx=update_idx
                )
                highlight_offset["from"] += n_equal_bytes
                highlight_offset["to"] += n_equal_bytes
+                if update_linenumber_map:
+                    self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']

            # handle highlighted insert / delete
            highlight_end = highlight_start + highlightRange["length"]
@ -311,12 +368,14 @@ class DiffToOperationMap:

            if highlightRange["type"] == 0:
                yield from self.doInsert(
-                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                    range_bytes, highlight_offset, update_idx=update_idx
                )
                highlight_offset["to"] += n_range_bytes
+                if update_linenumber_map:
+                    self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
            elif highlightRange["type"] == 1:
                yield from self.doDelete(
-                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                    range_bytes, highlight_offset, update_idx=update_idx
                )
                highlight_offset["from"] += n_range_bytes
            else:
@ -325,46 +384,14 @@ class DiffToOperationMap:
        # handle the rest of the line which is equal
        if highlight_end < len(highlight_bytes):
            range_bytes = highlight_bytes[highlight_end:]
-            yield from self.doEqual(range_bytes, highlight_offset, type=bytes)
+            yield from self.doEqual(range_bytes, highlight_offset)

    def doParMove(self, from_diff, to_diff):
        # the tricky part here is to put the tokens in the right spots.
        from_byte_start = from_diff["offset"]["from"]
-        # as of python 3.7 dictionaries are in insertion order. So
-        # we can just find the first key that's greater
-
-        # since the paragraph is removed in the "from" version, the index it is removed from
-        # will be *after* the
-        if len(self.from_byte_token_index_map) > 0:
-            if (
-                from_byte_start >= self.from_last_end_bytes
-            ):  # if the from paragraph is at the end
-                from_token_start = next(
-                    reversed(self.from_byte_token_index_map.values())
-                )
-            else:
-                key = find_greatest_le_key(
-                    from_byte_start, self.from_byte_token_index_map
-                )
-                from_token_start = self.from_byte_token_index_map[key]
-        else:
-            from_token_start = 0
-
-        if len(self.to_byte_token_index_map) > 0:
-            # get the to token index
-            to_byte_start = to_diff["offset"]["to"]
-            if to_byte_start >= self.to_last_end_bytes:
-                to_token_start = next(reversed(self.to_byte_token_index_map.values()))
-            else:
-                key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
-                to_token_start = self.to_byte_token_index_map[key]
-        else:
-            to_token_start = 0
-
-        # now we set the state and apply the highlights
-        self.n_from_start_tokens = self.n_from_end_tokens = from_token_start
-        self.n_to_start_tokens = self.n_to_end_tokens = to_token_start
+        to_byte_start = to_diff["offset"]["to"]
        offset = {"from": from_byte_start, "to": to_byte_start}
+        # we need to cache the indexes; replace them; then restore
        yield from self.doHighlightRange(
            to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
        )
@ -397,35 +424,63 @@ class WikiDiffMatcher:
            diff = next(self.diffs)
            diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)

-            diffops = list(zip(*diffToOperationsMapper.to_operations()))
+            diffops = list(diffToOperationsMapper.to_operations())

-            if not diffops:
-                self.last_tokens = []
-                return [], [], []
+            # this happens when revisions are actually equal.
+            if len(diffops) == 0:
+                self.last_tokens = self.tokenizer.tokenize(text)
+                ops = [Equal(0, len(self.last_tokens),
+                             0, len(self.last_tokens))]
+                return ops, self.last_tokens, self.last_tokens

-            diffops = (
-                operations,
-                aseq,
-                bseq,
-            ) = diffops
+            # we get back the byte indices; now we transform to token indices

-            aseq = list(aseq)
+            diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1))
+            aorder_ops = []            
+            token_offset = 0
+            _, aseq, _ = list(zip( * diffops))

-            # aseq/bseq can be out of order, we need to sort it by a1/b1 index.
-            indices = list(range(len(aseq)))
-            indices.sort(key=lambda i: operations[i].a1)
-            aseq = [aseq[i] for i in indices]
+            for op, tokens, _ in diffops:
+                a1 = token_offset
+                if isinstance(op, Equal) or isinstance(op, Delete):
+                    token_offset += len(tokens)
+                    a2 = token_offset
+                    aorder_ops.append(type(op)(a1,
+                                               a2,
+                                               op.b1,
+                                               op.b1))
+                else:
+                    aorder_ops.append(Insert(a1,
+                                             a1,
+                                             op.b1,
+                                             op.b1))

-            bseq = list(bseq)
-            indices = list(range(len(bseq)))
-            indices.sort(key=lambda i: operations[i].b1)
-            bseq = [bseq[i] for i in indices]
+            _, aseq, bseq = zip(* diffops)
+            diffops = list(zip(aorder_ops, aseq, bseq))
+            diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1))
+            _, _, bseq = list(zip(* diffops))
+            border_ops = []
+            token_offset = 0
+            for op, _, tokens in diffops:
+                b1 = token_offset
+                if isinstance(op, Equal) or isinstance(op, Insert):
+                    token_offset += len(tokens)
+                    b2 = token_offset
+                    border_ops.append(type(op)(op.a1,
+                                               op.a2,
+                                               b1,
+                                               b2))
+                else:
+                    border_ops.append(type(op)(op.a1,
+                                               op.a2,
+                                               b1,
+                                               b1))
+            
+            self.previous_text = text

            self.last_tokens = list(chain.from_iterable(aseq))
            tokens = list(chain.from_iterable(bseq))
-            self.previous_text = text
-
-            return operations, self.last_tokens, tokens
+            return border_ops, self.last_tokens, tokens

    def processor(self, *args, **kwargs):
        return self.Processor(self.diffs, self.tokenizer)