From 186cb82fb8d82b4f8817676fe87b1498e88891e5 Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Fri, 27 Jun 2025 07:13:41 -0700
Subject: [PATCH] some work on wiki_diff_matcher.py

---
 wiki_diff_matcher.py | 159 ++++++++++++++++++++++++++-----------------
 1 file changed, 96 insertions(+), 63 deletions(-)

diff --git a/wiki_diff_matcher.py b/wiki_diff_matcher.py
index 77c00ae..f930b2c 100644
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@@ -4,7 +4,8 @@ import sys
 import requests
 from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete
 
-TOKENIZER = tokenizers.text_split
+TOKENIZER = tokenizers.wikitext_split
+
 
 def compute_diffs(url: str, texts: list[str]) -> list:
     response = None
@@ -36,14 +37,22 @@ def compute_diffs(url: str, texts: list[str]) -> list:
     return incremental_diffs
 
 
-def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list:
+def to_operations(from_text:str, to_text:str, diff:str, tokenizer: RegexTokenizer) -> list:
     d = json.loads(diff)
 
+    # the code below is designed to work in bytes because that's how wikidiff2 indexes
+    from_text = from_text.encode('utf-8')
+    to_text = to_text.encode('utf-8')
+
+    # convenient function for tokenizing bytes
+    def tokenize(bytes):
+        return tokenizer.tokenize(bytes.decode('utf-8'))
+
     # Keep track of the last difference we saw in order to notice unaccounted-for
     # tokens. Each token at the end of "to" which is skipped for the next diff
     # must be represented as an "Equal()" segment.
-    from_last_end = 0
-    to_last_end = 0
+    from_last_end_bytes = 0
+    to_last_end_bytes = 0
 
     result = []
     # DiffState expects differences to be represented in order from the
@@ -52,56 +61,61 @@ def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) ->
     # As a rule of thumb, the "to" segments should be non-overlapping and
     # strictly increasing, while the "from" segments should merely be
     # non-overlapping.
-    #
+
     # wikidiff2 appears to follow this same convention, but this behavior
     # is not documented.
-    for entry in d['diff']:
-        from_start_line = entry['offset']['from']
-        to_start_line = entry['offset']['to']
-        # Per above, to_start_line appears to be nondecreasing, but
-        # from_start_line may sometimes decrease for detected paragraph moves.
+    # Note that, confusingly, for Insert operations only the "to" indexes matter
+    # and for the Delete and Equal operations only the "from" indexes matter.
+    # This is clear from reading state.py in `mwpersistence` and operations.py in `deltas`
 
-        from_start_tokens = len(tokenizer.tokenize(previous_text[:from_start_line]))
-        to_start_tokens = len(tokenizer.tokenize(next_text[:to_start_line]))
+    parmove_from_dict = {} # lookup move diffs based on moveinfo id.
+    parmove_to_dict = {}
+
+    for entry in d['diff']:
+        linebytes = entry['text'].encode('utf-8')
+        from_start_line = entry['offset']['from'] # this is the first byte of the line in the 'from' revision.
+        to_start_line = entry['offset']['to'] # this is the first byte of the line in the 'to' revision.
+
+        from_start_tokens = len(tokenize(from_text[:from_start_line]))
+        to_start_tokens = len(tokenize(to_text[:to_start_line]))
         # These constant calls to tokenizer.tokenize can definitely be optimized
         # as tokenization is currently a bottleneck. Ideally tokenization would
         # happen incrementally where possible, or somehow be cached, but this
-        # would be more complex.
+        # would be more complex. N: I think it's okay. CPU is cheap.
 
-        if entry['type'] == 0:
-            # wikidiff2 doesn't appear to emit diffs of this type, but cover anyway.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+        if entry['type'] == 0: # wikidiff2 can emit this if it is called with numContextLines != 0.
+
+            line_tokens = len(tokenize(linebytes))
             from_end_tokens = from_start_tokens + line_tokens
             to_end_tokens = to_start_tokens + line_tokens
             result.append(Equal(from_start_tokens, from_end_tokens,
                                 to_start_tokens, to_end_tokens))
-            from_last_end = from_end_tokens
-            to_last_end = to_end_tokens
+            # we need to keep track of the to and from last end bytes
+            from_last_end_bytes += len(linebytes)
+            to_last_end_bytes += len(linebytes)
             continue
         else:
             # These do not appear to be generated by wikidiff2, and so must be
            # inferred.
-            equal_tokens = to_start_tokens - to_last_end
+            equal_tokens = to_start_tokens - to_last_end_bytes
 
             # If we notice that the next non-zero segment (which must be a
             # change, given that its type is non-zero), begins after the end
             # of the previous segment, we must add an Equal segment.
-            # TODO: While the "to" token ranges are correct, the "from"
+            # TODO: While the "to" token ranges are correct,
+            # the "from"
             # ranges are likely not, particularly in histories with paragraph
-            # moves.
+            # moves. they can be corrected.
             if equal_tokens > 0:
-                result.append(Equal(from_last_end, from_start_line,
-                                    to_last_end, to_start_line))
+                # only the 'from' indexes matter
+                result.append(Equal(from_last_end_bytes, from_start_line,
+                                    to_last_end_bytes, to_start_line))
 
-
-        if entry['type'] == 1 or entry['type'] == 4:
-            # TODO: Separate out type 4 to recognize this is the insertion
-            # part of a paragraph move. Note that for paragraph moves
-            # the text is not necessarily identical, just similar.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+        if entry['type'] == 1: # a line included in the 'to' revision, but not in the 'from' revision
+            line_tokens = len(tokenize(linebytes))
             to_end_tokens = to_start_tokens + line_tokens
 
             result.append(Insert(from_start_tokens, from_start_tokens,
@@ -109,64 +123,83 @@ def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) ->
                                  ))
 
             # We have now used more of the "to" tokens.
-            to_last_end = to_end_tokens
-        elif entry['type'] == 2 or entry['type'] == 5:
-            # TODO: Separate out type 5 to recognize this is the deletion
-            # part of a paragraph move. Note that for paragraph moves
-            # the text is not necessarily identical, just similar.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+            to_start_end = to_end_tokens
+
+        elif entry['type'] == 2: # a line included in the 'from' revision, but not in the 'to' revision
+            line_tokens = len(tokenize(linebytes))
             from_end_tokens = from_start_tokens + line_tokens
 
             result.append(Delete(from_start_tokens, from_end_tokens,
                                  to_start_tokens, to_start_tokens,
                                  ))
-            # We have not used more of the "from" tokens.
-            from_last_end = from_end_tokens
+            # We have now used more of the "from" tokens.
+            from_last_end_bytes = from_end_tokens
+
         elif entry['type'] == 3:
-            # The text field is an overlapping mix of both the previous and next
-            # lines, and so we can't directly tokenize it.
-
-            text = entry['text']
-
-            last_end = 0
-            previous_line = ""
-            next_line = ""
-
-            # A line will have one or more highlightRanges.
-            # It is not guaranteed that insertions/deletions are matched,
+            # The text field is an overlapping mix of both the from and to,
+            # so we need to handle it highlight-by-highlight.
+            # there can be gaps between highlight segments.
             # for instance, if a word is deleted from the middle of a line.
-            for highlightRange in entry['highlightRanges']:
-                if highlightRange['start'] > last_end:
-                    previous_line += text[last_end:highlightRange['start']]
-                    next_line += text[last_end:highlightRange['start']]
-                    # Add an Equal segment.
+            # we need to track that.
+            highlight_last_end = 0
+            # note that diffs are token-level, but the indexes are byte-level
+            for highlightRange in entry['highlightRanges']:
+                if highlightRange['start'] > highlight_last_end:
+
+                    equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
+                    equal_tokens = len(tokenize(equal_bytes))
+                    from_end_tokens = from_start_tokens + equal_tokens
+                    to_end_tokens = to_end_tokens + equal_tokens
+                    result.append(Equal(from_start_tokens, from_end_tokens,
+                                        to_start_tokens, to_end_tokens
+                                        ))
+
+                    from_start_tokens = from_end_tokens
+                    to_start_tokens = to_end_tokens
+
                 rangeStart = highlightRange['start']
                 rangeEnd = rangeStart + highlightRange['length']
-
+                range_bytes = linebytes[rangeStart:rangeEnd]
+                range_tokens = len(tokenize(range_bytes))
                 if highlightRange['type'] == 0:
                     # Insertion
-                    next_line += text[rangeStart:rangeEnd]
+                    to_end_tokens = to_start_tokens + range_tokens
+                    result.append(Insert(from_start_tokens, from_end_tokens,
+                                         to_start_tokens, to_end_tokens))
 
-                    # Add an Insert segment.
+                    to_start_tokens = to_end_tokens
                 elif highlightRange['type'] == 1:
                     # Deletion
-                    previous_line += text[rangeStart:rangeEnd]
-
-                    # Add a Delete segment.
+                    from_end_tokens = from_start_tokens + range_tokens
+                    result.append(Delete(from_start_tokens, from_end_tokens,
+                                         to_start_tokens, to_end_tokens))
+                    from_start_tokens = from_end_tokens
+
                 else:
                     raise Exception(entry)
+
+                highlight_last_end = highlightRange['start'] + highlightRange['length']
 
-            from_tokens = len(tokenizer.tokenize(previous_line))
-            to_tokens = len(tokenizer.tokenize(next_line))
+        elif entry['type'] == 4:
 
-            from_start_tokens += from_tokens
-            to_start_tokens += to_tokens
+            parmove_from_dict['moveInfo']['id'] = diff
+
+        elif entry['type'] == 5:
+
+            parmove_to_dict['moveInfo']['id'] = diff
+            # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
+            # for deletions and equality report the token indexes from the 'from' revision.
         else:
             # The 'type' isn't one of the known
             raise ValueError(d)
 
+    # now we go through the parmoves
+    for id, from_diff in parmove_from_dict.items():
+        to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']]
+        ### TODO calculate the correct token indexes.
+
     # TODO: Handle trailing tokens
     # raise Exception(result)
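
The closing loop over parmove_from_dict is still a stub: the type 4/5 branches never actually key the dicts by moveInfo id, and the final lookup reads the 'from' dict where the 'to' dict is wanted. A minimal sketch of the pairing, not part of the patch, assuming only what the lookups above already assume about wikidiff2's output (each type 4 and type 5 entry carries a moveInfo whose 'id' names the entry and whose 'linkId' names its counterpart); the helper name match_paragraph_moves is made up for the sketch:

    def match_paragraph_moves(diff_entries):
        # Collect the two halves of each detected paragraph move, keyed by moveInfo id.
        parmove_from_dict = {}  # type 4: the line at its old position in the 'from' revision
        parmove_to_dict = {}    # type 5: the line at its new position in the 'to' revision
        for entry in diff_entries:
            if entry['type'] == 4:
                parmove_from_dict[entry['moveInfo']['id']] = entry
            elif entry['type'] == 5:
                parmove_to_dict[entry['moveInfo']['id']] = entry

        # Pair each 'from' half with the 'to' half its linkId points at.
        pairs = []
        for from_entry in parmove_from_dict.values():
            to_entry = parmove_to_dict.get(from_entry['moveInfo']['linkId'])
            if to_entry is not None:
                pairs.append((from_entry, to_entry))
            # An unmatched type 4 entry could instead be reported as a plain Delete.
        return pairs

Converting each pair's byte offsets into token indexes, the "### TODO calculate the correct token indexes" above, would still be needed before emitting the corresponding Delete/Insert operations.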
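For orientation, this is roughly how the two functions are meant to compose once to_operations is finished. The service URL and the toy revision texts below are placeholders, not values from the patch, and compute_diffs is assumed to return one wikidiff2 JSON diff per consecutive pair of revisions:

    from deltas import tokenizers

    from wiki_diff_matcher import compute_diffs, to_operations

    texts = ["Hello world.\n", "Hello brave new world.\n"]  # two consecutive revisions
    diffs = compute_diffs("http://localhost:8080", texts)   # placeholder wikidiff2 service URL

    ops = to_operations(texts[0], texts[1], diffs[0], tokenizers.wikitext_split)
    for op in ops:
        print(op)  # Equal/Insert/Delete operations over token index ranges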