diff --git a/pyproject.toml b/pyproject.toml index c013ed0..4226341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "mwtypes>=0.4.0", "mwxml>=0.3.6", "pyarrow>=20.0.0", + "sortedcontainers>=2.4.0", "yamlconf>=0.2.6", ] @@ -22,6 +23,7 @@ deltas = { git = "https://github.com/groceryheist/deltas" } [dependency-groups] dev = [ + "ipython>=8.18.1", "pandas>=2.1.0", "pytest>=8.4.1", "pytest-asyncio>=1.0.0", diff --git a/test/test_wiki_diff_matcher.py b/test/test_wiki_diff_matcher.py index 8e0eabd..97d8140 100644 --- a/test/test_wiki_diff_matcher.py +++ b/test/test_wiki_diff_matcher.py @@ -2,7 +2,7 @@ import asyncio import subprocess from functools import partial - +import re import pytest import pytest_asyncio from typing import List @@ -30,11 +30,10 @@ async def start_stop_server(): def assert_equal_enough(tokens:List[Token], rev): # the tokens exclude newlines # we allow extra whitespace at the beginning or end - token_doc = ''.join(str(t) for t in tokens).strip() - while '\n\n' in token_doc: - token_doc = token_doc.replace('\n\n','\n') - while '\n\n' in rev: - rev = rev.replace('\n\n','\n').strip() + token_doc = ''.join(str(t) for t in tokens) + token_doc = re.sub(r'\s+', ' ', token_doc).strip() + rev = re.sub(r'\s+', ' ', rev).strip() + print(token_doc, file = open('token','w')) print(rev, file = open('rev','w')) assert token_doc == rev @@ -62,7 +61,6 @@ def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_token # if the last line is an equal if first_unequal_token is None: first_unequal_token = ops[-1].b2 - assert n_equal_lines == expected_equal_lines # check that there are no gaps and the number is as expected @@ -76,9 +74,8 @@ def test_equality(): diff_processor = matcher.processor() ops, a, b = diff_processor.process(rev1) ops, a, b = diff_processor.process(rev1 + " ") - assert len(ops) == 258 + assert len(ops) == 257 for op in ops[:-2]: - print(op) assert isinstance(op, Equal) # note that the whitespace token does not result in a token according to wikitext_split @@ -152,44 +149,48 @@ def test_delete(): assert_equal_enough(b, rev2) assert_equal_enough(a, rev1) - initial_equal_tokens = 0 first_nondelete_token = None n_deletes = 0 n_deleted_tokens = 0 - last_b2 = initial_equal_tokens + initial_equal_lines = 256 + initial_equal_tokens = 9911 + for i, op in enumerate(ops): + if initial_equal_lines > 0: + assert isinstance(op, Equal) + else: + break + initial_equal_lines -= 1 + + assert initial_equal_lines == 0 + assert ops[i-1].a2 - ops[0].a1 == initial_equal_tokens - initial_equal_lines = 4 - initial_equal_tokens = 14 - last_b2 = assert_correct_equal_section(ops, - expected_equal_lines=initial_equal_lines, - expected_equal_tokens=initial_equal_tokens) first_noninsert_token = initial_equal_tokens - - last_non_delete = False + last_delete = False + last_insert = False idx = 0 + n_non_delete = 0 + + last_delete_idx = 0 for op in ops[initial_equal_lines:]: idx += 1 - # deletes are interleaved with Equal newlines. 
- if not isinstance(op, Delete): - if last_non_delete: - first_nondelete_token = op.a1 - break - last_non_delete = True - else: - last_non_delete = False - if last_non_delete: + if isinstance(op, Delete): n_deletes += 1 - n_deleted_tokens += op.a2 - last_b2 - last_b2 = op.a2 - - assert n_deletes == 2 - assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317 + n_deleted_tokens += op.a2 - op.a1 + last_delete = True + last_delete_idx = idx + # we need to add back a newline when we have a delete + else: + n_non_delete += 1 + if not last_delete and first_nondelete_token is None: + first_nondelete_token = op.a1 - - last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:], - expected_equal_lines=252, - expected_equal_tokens=9765) + if n_non_delete: + last_b2 = op.b2 + + assert n_deletes == 4 + assert n_deleted_tokens == 320 + assert idx == len(ops) # first lets test that we properly build the operations. @@ -204,14 +205,8 @@ def test_addition(): # so they reflect the state of the text according to the diff processor ops, a, b = diff_processor.process(rev1) - even = True for op in ops: - if even: - assert isinstance(op, Insert) - even = False - else: - assert isinstance(op, Equal) - even = True + assert isinstance(op, Insert) assert_equal_enough(b, rev1) @@ -221,8 +216,8 @@ def test_addition(): assert_equal_enough(a, rev1) assert_equal_enough(b, rev2) ops = list(ops) - initial_equal_lines = 256 - initial_equal_tokens = 9487 + initial_equal_lines = 255 + initial_equal_tokens = 9614 last_b2 = assert_correct_equal_section(ops, expected_equal_lines=initial_equal_lines, expected_equal_tokens=initial_equal_tokens) @@ -232,16 +227,21 @@ def test_addition(): n_inserted_tokens = 0 last_b2 = last_insert_b2 = initial_equal_tokens idx = 0 - print(ops[initial_equal_lines:]) + + last_insert = False for op in ops[initial_equal_lines:]: if isinstance(op, Insert): n_inserts += 1 n_inserted_tokens += op.b2 - op.b1 last_insert_b2 = op.b2 + last_insert = True + elif last_insert: + assert isinstance(op, Equal) + last_b2 = op.b2 - assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293 - assert n_inserts == 2 + assert n_inserted_tokens == last_insert_b2 - initial_equal_tokens == 296 + assert n_inserts == 4 def test_paragraph_move(): rev1 = open("test/test_diff_revisions/1295229484").read() @@ -269,6 +269,63 @@ def test_paragraph_move_and_change(): assert_equal_enough(a, rev1) assert_equal_enough(b, rev2) +def test_infobox(): + rev1 = open("test/test_diff_revisions/test_infobox_from").read() + rev2 = open("test/test_diff_revisions/test_infobox_to").read() + matcher = WikiDiffMatcher([rev1,rev2]) + diff_processor = matcher.processor() + + # note that a and b are constructed from the diffs. + # so they reflect the state of the text according to the diff processor + ops, a, b = diff_processor.process(rev1) + ops, a, b = diff_processor.process(rev2) + assert_equal_enough(b, rev2) + assert_equal_enough(a, rev1) + +def test_leading_whitespace(): + rev1 = open("test/test_diff_revisions/test_leading_ws_from").read() + rev2 = open("test/test_diff_revisions/test_leading_ws_to").read() + matcher = WikiDiffMatcher([rev1,rev2]) + diff_processor = matcher.processor() + + # note that a and b are constructed from the diffs. 
+ # so they reflect the state of the text according to the diff processor + ops, a, b = diff_processor.process(rev1) + ops, a, b = diff_processor.process(rev2) + assert_equal_enough(b, rev2) + assert_equal_enough(a, rev1) + +# def test_whitespace_2(): +# rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read() +# rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read() +# matcher = WikiDiffMatcher([rev1,rev2]) +# diff_processor = matcher.processor() + +# # note that a and b are constructed from the diffs. +# # so they reflect the state of the text according to the diff processor +# ops, a, b = diff_processor.process(rev1) +# ops, a, b = diff_processor.process(rev2) +# assert_equal_enough(b, rev2) +# assert_equal_enough(a, rev1) + + + +def test_actually_equal(): + rev1 = open("test/test_diff_revisions/1285792388").read() + # whitespace is added because exact identity reverts do not result in diffs. + matcher = WikiDiffMatcher([rev1,rev1]) + diff_processor = matcher.processor() + ops, a, b = diff_processor.process(rev1) + ops, a, b = diff_processor.process(rev1) + assert len(ops) == 1 + assert isinstance(ops[0], Equal) + + # note that the whitespace token does not result in a token according to wikitext_split + # compare the tokens based on the diffs to the baseline + # whitespace differences are allowed + assert_equal_enough(b, rev1) + assert_equal_enough(a, rev1) + # slow test def test_diff_consistency(): from mwxml import Dump diff --git a/wiki_diff_matcher.py b/wiki_diff_matcher.py index b7812dc..8c5aa68 100644 --- a/wiki_diff_matcher.py +++ b/wiki_diff_matcher.py @@ -3,25 +3,37 @@ import sys from collections import namedtuple from itertools import chain from typing import Dict, Generator, List, Optional, Tuple - +from sortedcontainers import SortedDict import requests -from deltas import (Delete, DiffEngine, Equal, Insert, Operation, Token, - RegexTokenizer, tokenizers) +from deltas import (Delete, DiffEngine, Equal, Insert, Operation, + RegexTokenizer, Token, tokenizers) TOKENIZER = tokenizers.wikitext_split +# def find_greatest_le_key(target_key, data_dict): +# found_key = None +# for key in data_dict: # Iterates over keys in insertion order (which is sorted) +# if key <= target_key: +# found_key = ( +# key # This is the largest key found so far that satisfies the condition +# ) +# else: +# # Since the dictionary is sorted, if key > target_key, +# # all subsequent keys will also be > target_key. +# return found_key or key + +# def find_smallest_gt_key(target_key, data_dict): +# found_key = None +# for key in reversed(data_dict): # Iterates over keys in insertion order (which is sorted) +# if key >= target_key: +# found_key = ( +# key # This is the largest key found so far that satisfies the condition +# ) +# else: +# # Since the dictionary is sorted, if key > target_key, +# # all subsequent keys will also be > target_key. +# return found_key or key -def find_greatest_le_key(target_key, data_dict): - found_key = None - for key in data_dict: # Iterates over keys in insertion order (which is sorted) - if key <= target_key: - found_key = ( - key # This is the largest key found so far that satisfies the condition - ) - else: - # Since the dictionary is sorted, if key > target_key, - # all subsequent keys will also be > target_key. 
- return found_key or key def compute_diffs(url: str, texts: list[str]) -> list: @@ -61,79 +73,114 @@ class DiffToOperationMap: self.diff = json.loads(diff) # the code below is designed to work in bytes because that's how wikidiff2 indexes - self.from_last_end_bytes = 0 - self.from_last_to_bytes = 0 - self.n_from_start_tokens = 0 - self.n_from_end_tokens = 0 - self.n_from_start_tokens = 0 - self.n_to_start_tokens = 0 - self.last_to_start_line = 0 - self.last_from_start_line = 0 - self.from_last_end_bytes = 0 - self.to_last_end_bytes = 0 + # self.from_last_end_bytes = 0 + # self.from_last_to_bytes = 0 + # self.n_from_start_tokens = 0 + # self.n_from_end_tokens = 0 + # self.n_from_start_tokens = 0 + # self.n_to_start_tokens = 0 + # self.from_last_end_bytes = 0 + # self.to_last_end_bytes = 0 # keeps track of the number of tokens seen so far # to avoid repeated tokenization - self.from_byte_token_index_map: Dict[int, int] = {} - self.to_byte_token_index_map: Dict[int, int] = {} + # self.from_byte_token_index_map: SortedDict[int, int] = SortedDict() + # self.to_byte_token_index_map: SortedDict[int, int] = SortedDict() self.par_move_dict = {} # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets. - self.to_linenumber_bytes_map = {} + self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict() + self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict() + # def get_token_offset(self, byte_offset): + # from_token_start = None + # to_token_start = None + # from_last_end_bytes = self.from_byte_token_index_map.keys()[-1] + # to_last_end_bytes = self.to_byte_token_index_map.keys()[-1] + # if byte_offset['from'] is not None: + # if byte_offset['from'] < self.from_byte_token_index_map.values()[0]: + # from_token_start = 0 + # else: + # key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from']) + # # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below? 
+ # if key > from_last_end_bytes: + # from_token_start = self.from_byte_token_index_map[from_last_end_bytes] + # else: + # from_token_ + # if byte_offset['to'] is not None: + # if byte_offset['to'] < self.to_byte_token_index_map.values()[0]: + # to_token_start = 0 + # else: + # key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to']) + # if key >= from + # if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0: + # if ( + # byte_offset['from'] >= self.from_last_end_bytes + # ): # if the from paragraph is at the end + # from_token_start = next( + # reversed(self.from_byte_token_index_map.values()) + # ) + # else: + # key = find_greatest_le_key( + # byte_offset['from'], self.from_byte_token_index_map + # ) + # from_token_start = self.from_byte_token_index_map[key] + # else: + # from_token_start = 0 + + # to_offset = None + # if byte_offset['to'] is not None: + # if len(self.to_byte_token_index_map) > 0: + # if to_byte_start >= self.to_last_end_bytes: + # to_token_start = next(reversed(self.to_byte_token_index_map.values())) + # else: + # key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map) + # to_token_start = self.to_byte_token_index_map[key] + # else: + # to_token_start = 0 + + # return {'from': from_token_start, + # 'to': to_token_start} def tokenize(self, bytes): return self.tokenizer.tokenize(bytes.decode("utf-8")) - def newline_result(self): - self.n_from_end_tokens += 1 - self.n_from_start_tokens += 1 - self.n_to_end_tokens += 1 - self.n_to_start_tokens +=1 - - return (Equal(self.n_from_start_tokens - 1, - self.n_from_end_tokens, - self.n_to_start_tokens - 1, - self.n_from_start_tokens), - [Token('\n')], - [Token('\n')]) - - def to_operations(self): - parmoves = [] - [print(diff) for diff in self.diff["diff"][0:5]] + for entry in self.diff["diff"]: + + entry["text"] += "\n" + text = entry["text"] offset = entry["offset"] + if offset["from"] and entry.get("lineNumber") is not None : + if entry['type'] in [0, 2, 3, 4]: + self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode()) if offset["to"]: - self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + if entry['type'] in [0, 1, 3, 5]: + self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode()) + + + # add back the newline - text = entry["text"] - # ignore empty diffs. They don't have any tokens - if len(text) == 0: - continue # this is the first byte of the line in the 'from' revision. from_start_line = entry["offset"]["from"] # this is the first byte of the line in the 'to' revision. 
to_start_line = entry["offset"]["to"] - + if entry["type"] == 0: yield from self.doEqual(text, offset) - yield self.newline_result() # a line included in the 'to' revision, but not in the 'from' revision elif entry["type"] == 1: yield from self.doInsert(text, offset) - yield self.newline_result() # a line included in the 'from' revision, but not in the 'to' revision elif entry["type"] == 2: yield from self.doDelete(text, offset) - yield self.newline_result() elif entry["type"] == 3: yield from self.doHighlightRange( text, entry["highlightRanges"], offset, entry["lineNumber"] ) - yield self.newline_result() elif entry["type"] == 4: self.par_move_dict[entry["moveInfo"]["id"]] = entry @@ -141,34 +188,37 @@ class DiffToOperationMap: linkId = entry["moveInfo"]["linkId"] if linkId in self.par_move_dict: yield from self.doParMove(entry, self.par_move_dict[linkId]) - yield self.newline_result() - else: - # we need to count the tokens in the from revision so token index is correct - self.n_from_end_tokens += len(self.tokenize(entry["text"].encode())) - self.n_from_start_tokens += len( - self.tokenize(entry["text"].encode()) - ) + # we need to count the tokens in the from revision so token index is correct + # self.n_from_end_tokens += len(self.tokenize(entry["text"].encode())) + # self.n_from_start_tokens += len( + # self.tokenize(entry["text"].encode()) + # ) + elif entry["type"] == 5: linkId = entry["moveInfo"]["linkId"] if linkId in self.par_move_dict: yield from self.doParMove(self.par_move_dict[linkId], entry) - yield self.newline_result() else: self.par_move_dict[entry["moveInfo"]["id"]] = entry # call doHighlightRange just to update the token indices - offset = { - "from": self.n_from_end_tokens, - "to": entry["offset"]["to"], - } - res = self.doHighlightRange( - entry["text"], - entry["highlightRanges"], - offset, - entry["lineNumber"], - update_idx="to", - ) - list(res) + # offset = { + # "from": self.n_from_end_tokens, + # "to": entry["offset"]["to"], + # } + # res = self.doHighlightRange( + # entry["text"], + # entry["highlightRanges"], + # offset, + # entry["lineNumber"], + # update_idx="to", + # ) + # list(res) + # self.n_to_end_tokens += len(self.tokenize(entry["text"].encode())) + # self.n_to_start_tokens += len( + # self.tokenize(entry["text"].encode()) + # ) + else: # The 'type' isn't one of the known raise ValueError(d) @@ -180,99 +230,100 @@ class DiffToOperationMap: # strictly increasing, while the "from" segments should merely be # non-overlapping. 
- def doEqual(self, equal_segment, offset, update_idx="all", type=str): - if type is str: + def doEqual(self, equal_segment, offset, update_idx="all"): + # if from_token_start is None: + # from_token_start = self.n_from_start_tokens + # if to_token_start is None: + # to_token_start = self.n_to_start_tokens + + if isinstance(equal_segment, str): equal_bytes = equal_segment.encode() - elif type is bytes: + elif isinstance(equal_segment, bytes): equal_bytes = equal_segment else: raise ValueError(equal_segment) tokens = self.tokenize(equal_bytes) n_tokens = len(tokens) - n_from_end_tokens = self.n_from_start_tokens + n_tokens - n_to_end_tokens = self.n_to_start_tokens + n_tokens - # we need to keep track of the to and from last end bytes - self.from_last_end_bytes = offset["from"] + len(equal_bytes) - self.to_last_end_bytes = offset["to"] + len(equal_bytes) + + # token_offset = self.get_token_offset(offset) + + # n_from_end_tokens = token_offset['from'] + n_tokens + # n_to_end_tokens = token_offset['to'] + n_tokens + yield ( Equal( - self.n_from_start_tokens, - n_from_end_tokens, - self.n_to_start_tokens, - n_to_end_tokens, + offset['from'], + None, + offset['to'], + None, ), tokens, tokens, ) - if update_idx in ["from", "all"]: - self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens + # if update_idx in ["from", "all"]: + # self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens - if update_idx in ["to", "all"]: - self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens + # if update_idx in ["to", "all"]: + # self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens - self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens - self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens + # self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens + # self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens - def doInsert(self, insert_segment, offset, update_idx="all", type=str): - if type is str: + def doInsert(self, insert_segment, offset, update_idx="all"): + if isinstance(insert_segment, str): insert_bytes = insert_segment.encode() - elif type is bytes: + elif isinstance(insert_segment, bytes): insert_bytes = insert_segment else: raise ValueError(insert_segment) tokens = self.tokenize(insert_bytes) - n_tokens = len(tokens) - n_to_end_tokens = self.n_to_start_tokens + n_tokens - self.to_last_end_bytes = offset["to"] + len(insert_bytes) + # n_tokens = len(tokens) + # token_offset = self.get_token_offset(offset) + # n_to_end_tokens = token_offset['to'] + n_tokens yield ( Insert( - self.n_from_start_tokens, - self.n_from_start_tokens, - self.n_to_start_tokens, - n_to_end_tokens, + None, + None, + offset['to'], + None, ), [], tokens, ) # We have now used more of the "to" tokens. 
- if update_idx in ["to", "all"]: - self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens - - self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens + #self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens def doDelete(self, delete_segment, offset, update_idx="all", type=str): - if type is str: + if isinstance(delete_segment, str): delete_bytes = delete_segment.encode() - elif type is bytes: + elif isinstance(delete_segment, bytes): delete_bytes = delete_segment else: raise ValueError(delete_segment) tokens = self.tokenize(delete_bytes) - n_tokens = len(tokens) + # n_tokens = len(tokens) + + # token_offset = self.get_token_offset(offset) + # n_from_end_tokens = token_offset['from'] + n_tokens - n_from_end_tokens = self.n_from_start_tokens + n_tokens - self.from_last_end_bytes = offset["from"] + len(delete_bytes) yield ( Delete( - self.n_from_start_tokens, - n_from_end_tokens, - self.n_to_start_tokens, - self.n_to_start_tokens, + offset['from'], + None, + None, + None ), tokens, [], ) - # We have now used more of the "from" tokens. - if update_idx in ["from", "all"]: - self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens - self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens + #self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens def doHighlightRange( - self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all" - ): + self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"): + # The text field is an overlapping mix of both the from and to, # so we need to handle it highlight-by-highlight. # there can be gaps between highlight segments. @@ -283,11 +334,15 @@ class DiffToOperationMap: # it's possible for offset['to'] to be null. # we can get it from the line number? 
- + update_linenumber_map = True if offset["to"] is None: - offset["to"] = self.from_byte_token_index_map[ - find_greatest_le_key(lineNumber, self.from_byte_token_index_map) - ] + keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1 + if keyidx > 0: + print(self.to_linenumber_bytes_map) + key = self.to_linenumber_bytes_map.keys()[keyidx] + offset["to"] = self.to_linenumber_bytes_map[key] + else: + offset["to"] = 0 highlight_offset = offset # note that diffs are token-level, but the indexes are byte-level @@ -299,10 +354,12 @@ class DiffToOperationMap: equal_bytes = highlight_bytes[highlight_end:highlight_start] n_equal_bytes = len(equal_bytes) yield from self.doEqual( - equal_bytes, highlight_offset, update_idx=update_idx, type=bytes + equal_bytes, highlight_offset, update_idx=update_idx ) highlight_offset["from"] += n_equal_bytes highlight_offset["to"] += n_equal_bytes + if update_linenumber_map: + self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to'] # handle highlighted insert / delete highlight_end = highlight_start + highlightRange["length"] @@ -311,12 +368,14 @@ class DiffToOperationMap: if highlightRange["type"] == 0: yield from self.doInsert( - range_bytes, highlight_offset, update_idx=update_idx, type=bytes + range_bytes, highlight_offset, update_idx=update_idx ) highlight_offset["to"] += n_range_bytes + if update_linenumber_map: + self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to'] elif highlightRange["type"] == 1: yield from self.doDelete( - range_bytes, highlight_offset, update_idx=update_idx, type=bytes + range_bytes, highlight_offset, update_idx=update_idx ) highlight_offset["from"] += n_range_bytes else: @@ -325,46 +384,14 @@ class DiffToOperationMap: # handle the rest of the line which is equal if highlight_end < len(highlight_bytes): range_bytes = highlight_bytes[highlight_end:] - yield from self.doEqual(range_bytes, highlight_offset, type=bytes) + yield from self.doEqual(range_bytes, highlight_offset) def doParMove(self, from_diff, to_diff): # the tricky part here is to put the tokens in the right spots. from_byte_start = from_diff["offset"]["from"] - # as of python 3.7 dictionaries are in insertion order. 
So - # we can just find the first key that's greater - - # since the paragraph is removed in the "from" version, the index it is removed from - # will be *after* the - if len(self.from_byte_token_index_map) > 0: - if ( - from_byte_start >= self.from_last_end_bytes - ): # if the from paragraph is at the end - from_token_start = next( - reversed(self.from_byte_token_index_map.values()) - ) - else: - key = find_greatest_le_key( - from_byte_start, self.from_byte_token_index_map - ) - from_token_start = self.from_byte_token_index_map[key] - else: - from_token_start = 0 - - if len(self.to_byte_token_index_map) > 0: - # get the to token index - to_byte_start = to_diff["offset"]["to"] - if to_byte_start >= self.to_last_end_bytes: - to_token_start = next(reversed(self.to_byte_token_index_map.values())) - else: - key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map) - to_token_start = self.to_byte_token_index_map[key] - else: - to_token_start = 0 - - # now we set the state and apply the highlights - self.n_from_start_tokens = self.n_from_end_tokens = from_token_start - self.n_to_start_tokens = self.n_to_end_tokens = to_token_start + to_byte_start = to_diff["offset"]["to"] offset = {"from": from_byte_start, "to": to_byte_start} + # we need to cache the indexes; replace them; then restore yield from self.doHighlightRange( to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"] ) @@ -397,35 +424,63 @@ class WikiDiffMatcher: diff = next(self.diffs) diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer) - diffops = list(zip(*diffToOperationsMapper.to_operations())) + diffops = list(diffToOperationsMapper.to_operations()) - if not diffops: - self.last_tokens = [] - return [], [], [] + # this happens when revisions are actually equal. + if len(diffops) == 0: + self.last_tokens = self.tokenizer.tokenize(text) + ops = [Equal(0, len(self.last_tokens), + 0, len(self.last_tokens))] + return ops, self.last_tokens, self.last_tokens - diffops = ( - operations, - aseq, - bseq, - ) = diffops + # we get back the byte indices; now we transform to token indices - aseq = list(aseq) + diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1)) + aorder_ops = [] + token_offset = 0 + _, aseq, _ = list(zip( * diffops)) - # aseq/bseq can be out of order, we need to sort it by a1/b1 index. 
- indices = list(range(len(aseq))) - indices.sort(key=lambda i: operations[i].a1) - aseq = [aseq[i] for i in indices] + for op, tokens, _ in diffops: + a1 = token_offset + if isinstance(op, Equal) or isinstance(op, Delete): + token_offset += len(tokens) + a2 = token_offset + aorder_ops.append(type(op)(a1, + a2, + op.b1, + op.b1)) + else: + aorder_ops.append(Insert(a1, + a1, + op.b1, + op.b1)) - bseq = list(bseq) - indices = list(range(len(bseq))) - indices.sort(key=lambda i: operations[i].b1) - bseq = [bseq[i] for i in indices] + _, aseq, bseq = zip(* diffops) + diffops = list(zip(aorder_ops, aseq, bseq)) + diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1)) + _, _, bseq = list(zip(* diffops)) + border_ops = [] + token_offset = 0 + for op, _, tokens in diffops: + b1 = token_offset + if isinstance(op, Equal) or isinstance(op, Insert): + token_offset += len(tokens) + b2 = token_offset + border_ops.append(type(op)(op.a1, + op.a2, + b1, + b2)) + else: + border_ops.append(type(op)(op.a1, + op.a2, + b1, + b1)) + + self.previous_text = text self.last_tokens = list(chain.from_iterable(aseq)) tokens = list(chain.from_iterable(bseq)) - self.previous_text = text - - return operations, self.last_tokens, tokens + return border_ops, self.last_tokens, tokens def processor(self, *args, **kwargs): return self.Processor(self.diffs, self.tokenizer)
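
Note on the SortedDict lookup the patch introduces: when wikidiff2 omits offset["to"], the new doHighlightRange path falls back to to_linenumber_bytes_map and takes the byte offset recorded for the greatest line number at or below lineNumber. The snippet below is a minimal, self-contained sketch of that lookup pattern only; the map contents and the greatest_le helper name are illustrative and not part of the patch.

    from sortedcontainers import SortedDict

    # Illustrative data: line number -> byte offset recorded for that line in the 'to' revision.
    to_linenumber_bytes_map = SortedDict({1: 0, 4: 120, 9: 587})

    def greatest_le(sd, line_number, default=0):
        # bisect_right returns the insertion point after any equal key, so idx is the
        # position of the greatest key <= line_number; idx == -1 means no such key exists.
        idx = sd.bisect_right(line_number) - 1
        if idx < 0:
            return default
        return sd[sd.keys()[idx]]  # SortedKeysView supports positional indexing

    assert greatest_le(to_linenumber_bytes_map, 5) == 120  # nearest recorded line is 4
    assert greatest_le(to_linenumber_bytes_map, 9) == 587  # exact match uses line 9 itself
    assert greatest_le(to_linenumber_bytes_map, 0) == 0    # nothing recorded yet

A similar bisect-based lookup is what the commented-out get_token_offset sketch reaches for on the byte-to-token index maps, which is why sortedcontainers is added as a dependency in pyproject.toml.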