almost there. working out edge cases.

Nathan TeBlunthuis 2025-07-03 21:32:44 -07:00
parent cf1fb61a84
commit 4654911533
3 changed files with 345 additions and 231 deletions

View File

@@ -12,6 +12,7 @@ dependencies = [
"mwtypes>=0.4.0",
"mwxml>=0.3.6",
"pyarrow>=20.0.0",
"sortedcontainers>=2.4.0",
"yamlconf>=0.2.6",
]
@@ -22,6 +23,7 @@ deltas = { git = "https://github.com/groceryheist/deltas" }
[dependency-groups]
dev = [
"ipython>=8.18.1",
"pandas>=2.1.0",
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",

View File

@@ -2,7 +2,7 @@
import asyncio
import subprocess
from functools import partial
import re
import pytest
import pytest_asyncio
from typing import List
@@ -30,11 +30,10 @@ async def start_stop_server():
def assert_equal_enough(tokens:List[Token], rev):
# the tokens exclude newlines
# we allow extra whitespace at the beginning or end
token_doc = ''.join(str(t) for t in tokens).strip()
while '\n\n' in token_doc:
token_doc = token_doc.replace('\n\n','\n')
while '\n\n' in rev:
rev = rev.replace('\n\n','\n').strip()
token_doc = ''.join(str(t) for t in tokens)
token_doc = re.sub(r'\s+', ' ', token_doc).strip()
rev = re.sub(r'\s+', ' ', rev).strip()
print(token_doc, file = open('token','w'))
print(rev, file = open('rev','w'))
assert token_doc == rev
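# A minimal illustration of this check (hypothetical tokens, not taken from the
# test fixtures): both sides collapse runs of whitespace to single spaces before
# comparing, so
#   assert_equal_enough([Token("a"), Token(" \n "), Token("b")], "a  b")
# passes because both sides normalize to "a b".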
@@ -62,7 +61,6 @@ def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_token
# if the last line is an equal
if first_unequal_token is None:
first_unequal_token = ops[-1].b2
assert n_equal_lines == expected_equal_lines
# check that there are no gaps and the number is as expected
@@ -76,9 +74,8 @@ def test_equality():
diff_processor = matcher.processor()
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev1 + " ")
assert len(ops) == 258
assert len(ops) == 257
for op in ops[:-2]:
print(op)
assert isinstance(op, Equal)
# note that the appended whitespace does not result in a token according to wikitext_split
@@ -152,44 +149,48 @@ def test_delete():
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)
initial_equal_tokens = 0
first_nondelete_token = None
n_deletes = 0
n_deleted_tokens = 0
last_b2 = initial_equal_tokens
initial_equal_lines = 256
initial_equal_tokens = 9911
for i, op in enumerate(ops):
if initial_equal_lines > 0:
assert isinstance(op, Equal)
else:
break
initial_equal_lines -= 1
assert initial_equal_lines == 0
assert ops[i-1].a2 - ops[0].a1 == initial_equal_tokens
initial_equal_lines = 4
initial_equal_tokens = 14
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
first_noninsert_token = initial_equal_tokens
last_non_delete = False
last_delete = False
last_insert = False
idx = 0
n_non_delete = 0
last_delete_idx = 0
for op in ops[initial_equal_lines:]:
idx += 1
# deletes are interleaved with Equal newlines.
if not isinstance(op, Delete):
if last_non_delete:
first_nondelete_token = op.a1
break
last_non_delete = True
else:
last_non_delete = False
if last_non_delete:
if isinstance(op, Delete):
n_deletes += 1
n_deleted_tokens += op.a2 - last_b2
last_b2 = op.a2
assert n_deletes == 2
assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
n_deleted_tokens += op.a2 - op.a1
last_delete = True
last_delete_idx = idx
# we need to add back a newline when we have a delete
else:
n_non_delete += 1
if not last_delete and first_nondelete_token is None:
first_nondelete_token = op.a1
last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
expected_equal_lines=252,
expected_equal_tokens=9765)
if n_non_delete:
last_b2 = op.b2
assert n_deletes == 4
assert n_deleted_tokens == 320
assert idx == len(ops)
# first let's test that we properly build the operations.
@@ -204,14 +205,8 @@ def test_addition():
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
even = True
for op in ops:
if even:
assert isinstance(op, Insert)
even = False
else:
assert isinstance(op, Equal)
even = True
assert isinstance(op, Insert)
assert_equal_enough(b, rev1)
@@ -221,8 +216,8 @@ def test_addition():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
ops = list(ops)
initial_equal_lines = 256
initial_equal_tokens = 9487
initial_equal_lines = 255
initial_equal_tokens = 9614
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
@@ -232,16 +227,21 @@ def test_addition():
n_inserted_tokens = 0
last_b2 = last_insert_b2 = initial_equal_tokens
idx = 0
print(ops[initial_equal_lines:])
last_insert = False
for op in ops[initial_equal_lines:]:
if isinstance(op, Insert):
n_inserts += 1
n_inserted_tokens += op.b2 - op.b1
last_insert_b2 = op.b2
last_insert = True
elif last_insert:
assert isinstance(op, Equal)
last_b2 = op.b2
assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
assert n_inserts == 2
assert n_inserted_tokens == last_insert_b2 - initial_equal_tokens == 296
assert n_inserts == 4
def test_paragraph_move():
rev1 = open("test/test_diff_revisions/1295229484").read()
@@ -269,6 +269,63 @@ def test_paragraph_move_and_change():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
def test_infobox():
rev1 = open("test/test_diff_revisions/test_infobox_from").read()
rev2 = open("test/test_diff_revisions/test_infobox_to").read()
matcher = WikiDiffMatcher([rev1,rev2])
diff_processor = matcher.processor()
# note that a and b are constructed from the diffs.
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)
def test_leading_whitespace():
rev1 = open("test/test_diff_revisions/test_leading_ws_from").read()
rev2 = open("test/test_diff_revisions/test_leading_ws_to").read()
matcher = WikiDiffMatcher([rev1,rev2])
diff_processor = matcher.processor()
# note that a and b are constructed from the diffs.
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)
# def test_whitespace_2():
# rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
# rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
# matcher = WikiDiffMatcher([rev1,rev2])
# diff_processor = matcher.processor()
# # note that a and b are constructed from the diffs.
# # so they reflect the state of the text according to the diff processor
# ops, a, b = diff_processor.process(rev1)
# ops, a, b = diff_processor.process(rev2)
# assert_equal_enough(b, rev2)
# assert_equal_enough(a, rev1)
def test_actually_equal():
rev1 = open("test/test_diff_revisions/1285792388").read()
# whitespace is added because exact identity reverts do not result in diffs.
matcher = WikiDiffMatcher([rev1,rev1])
diff_processor = matcher.processor()
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev1)
assert len(ops) == 1
assert isinstance(ops[0], Equal)
# note that the added whitespace does not result in a token according to wikitext_split
# compare the tokens based on the diffs to the baseline
# whitespace differences are allowed
assert_equal_enough(b, rev1)
assert_equal_enough(a, rev1)
# slow test
def test_diff_consistency():
from mwxml import Dump

View File

@@ -3,25 +3,37 @@ import sys
from collections import namedtuple
from itertools import chain
from typing import Dict, Generator, List, Optional, Tuple
from sortedcontainers import SortedDict
import requests
from deltas import (Delete, DiffEngine, Equal, Insert, Operation, Token,
RegexTokenizer, tokenizers)
from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
RegexTokenizer, Token, tokenizers)
TOKENIZER = tokenizers.wikitext_split
# def find_greatest_le_key(target_key, data_dict):
# found_key = None
# for key in data_dict: # Iterates over keys in insertion order (which is sorted)
# if key <= target_key:
# found_key = (
# key # This is the largest key found so far that satisfies the condition
# )
# else:
# # Since the dictionary is sorted, if key > target_key,
# # all subsequent keys will also be > target_key.
# return found_key or key
# def find_smallest_gt_key(target_key, data_dict):
# found_key = None
# for key in reversed(data_dict): # Iterates over keys in insertion order (which is sorted)
# if key >= target_key:
# found_key = (
# key # This is the largest key found so far that satisfies the condition
# )
# else:
# # Since the dictionary is sorted, if key > target_key,
# # all subsequent keys will also be > target_key.
# return found_key or key
def find_greatest_le_key(target_key, data_dict):
found_key = None
for key in data_dict: # Iterates over keys in insertion order (which is sorted)
if key <= target_key:
found_key = (
key # This is the largest key found so far that satisfies the condition
)
else:
# Since the dictionary is sorted, if key > target_key,
# all subsequent keys will also be > target_key.
return found_key or key
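# A sketch of the same "greatest key <= target" lookup using the SortedDict
# imported above (assumes a non-empty SortedDict); bisect_right returns the
# insertion point, so the preceding key is the greatest key <= target_key.
def find_greatest_le_key_sorted(target_key, data_dict):
    idx = data_dict.bisect_right(target_key) - 1
    # fall back to the smallest key when every key is > target_key,
    # mirroring the "found_key or key" fallback above
    return data_dict.keys()[max(idx, 0)]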
def compute_diffs(url: str, texts: list[str]) -> list:
@@ -61,79 +73,114 @@ class DiffToOperationMap:
self.diff = json.loads(diff)
# the code below is designed to work in bytes because wikidiff2 offsets are byte indexes
self.from_last_end_bytes = 0
self.from_last_to_bytes = 0
self.n_from_start_tokens = 0
self.n_from_end_tokens = 0
self.n_from_start_tokens = 0
self.n_to_start_tokens = 0
self.last_to_start_line = 0
self.last_from_start_line = 0
self.from_last_end_bytes = 0
self.to_last_end_bytes = 0
# self.from_last_end_bytes = 0
# self.from_last_to_bytes = 0
# self.n_from_start_tokens = 0
# self.n_from_end_tokens = 0
# self.n_from_start_tokens = 0
# self.n_to_start_tokens = 0
# self.from_last_end_bytes = 0
# self.to_last_end_bytes = 0
# keeps track of the number of tokens seen so far
# to avoid repeated tokenization
self.from_byte_token_index_map: Dict[int, int] = {}
self.to_byte_token_index_map: Dict[int, int] = {}
# self.from_byte_token_index_map: SortedDict[int, int] = SortedDict()
# self.to_byte_token_index_map: SortedDict[int, int] = SortedDict()
self.par_move_dict = {}
# we need to keep track of the byte offsets of line numbers so we can recover them when wikidiff2 omits offsets.
self.to_linenumber_bytes_map = {}
self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
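# Why byte offsets: wikidiff2 counts UTF-8 bytes rather than characters, so
# the two diverge on any non-ASCII text. An illustrative example (not from the
# test data):
#   len("Zürich") == 6            # characters
#   len("Zürich".encode()) == 7   # UTF-8 bytes; "ü" encodes as two bytes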
# def get_token_offset(self, byte_offset):
# from_token_start = None
# to_token_start = None
# from_last_end_bytes = self.from_byte_token_index_map.keys()[-1]
# to_last_end_bytes = self.to_byte_token_index_map.keys()[-1]
# if byte_offset['from'] is not None:
# if byte_offset['from'] < self.from_byte_token_index_map.values()[0]:
# from_token_start = 0
# else:
# key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from'])
# # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below?
# if key > from_last_end_bytes:
# from_token_start = self.from_byte_token_index_map[from_last_end_bytes]
# else:
# from_token_
# if byte_offset['to'] is not None:
# if byte_offset['to'] < self.to_byte_token_index_map.values()[0]:
# to_token_start = 0
# else:
# key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to'])
# if key >= from
# if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0:
# if (
# byte_offset['from'] >= self.from_last_end_bytes
# ): # if the from paragraph is at the end
# from_token_start = next(
# reversed(self.from_byte_token_index_map.values())
# )
# else:
# key = find_greatest_le_key(
# byte_offset['from'], self.from_byte_token_index_map
# )
# from_token_start = self.from_byte_token_index_map[key]
# else:
# from_token_start = 0
# to_offset = None
# if byte_offset['to'] is not None:
# if len(self.to_byte_token_index_map) > 0:
# if to_byte_start >= self.to_last_end_bytes:
# to_token_start = next(reversed(self.to_byte_token_index_map.values()))
# else:
# key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map)
# to_token_start = self.to_byte_token_index_map[key]
# else:
# to_token_start = 0
# return {'from': from_token_start,
# 'to': to_token_start}
def tokenize(self, bytes):
return self.tokenizer.tokenize(bytes.decode("utf-8"))
def newline_result(self):
self.n_from_end_tokens += 1
self.n_from_start_tokens += 1
self.n_to_end_tokens += 1
self.n_to_start_tokens +=1
return (Equal(self.n_from_start_tokens - 1,
self.n_from_end_tokens,
self.n_to_start_tokens - 1,
self.n_from_start_tokens),
[Token('\n')],
[Token('\n')])
def to_operations(self):
parmoves = []
[print(diff) for diff in self.diff["diff"][0:5]]
for entry in self.diff["diff"]:
entry["text"] += "\n"
text = entry["text"]
offset = entry["offset"]
if offset["from"] and entry.get("lineNumber") is not None :
if entry['type'] in [0, 2, 3, 4]:
self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode())
if offset["to"]:
self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"]
if entry['type'] in [0, 1, 3, 5]:
self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode())
# add back the newline
text = entry["text"]
# ignore empty diffs. They don't have any tokens
if len(text) == 0:
continue
# this is the first byte of the line in the 'from' revision.
from_start_line = entry["offset"]["from"]
# this is the first byte of the line in the 'to' revision.
to_start_line = entry["offset"]["to"]
if entry["type"] == 0:
yield from self.doEqual(text, offset)
yield self.newline_result()
# a line included in the 'to' revision, but not in the 'from' revision
elif entry["type"] == 1:
yield from self.doInsert(text, offset)
yield self.newline_result()
# a line included in the 'from' revision, but not in the 'to' revision
elif entry["type"] == 2:
yield from self.doDelete(text, offset)
yield self.newline_result()
elif entry["type"] == 3:
yield from self.doHighlightRange(
text, entry["highlightRanges"], offset, entry["lineNumber"]
)
yield self.newline_result()
elif entry["type"] == 4:
self.par_move_dict[entry["moveInfo"]["id"]] = entry
@@ -141,34 +188,37 @@ class DiffToOperationMap:
linkId = entry["moveInfo"]["linkId"]
if linkId in self.par_move_dict:
yield from self.doParMove(entry, self.par_move_dict[linkId])
yield self.newline_result()
else:
# we need to count the tokens in the from revision so token index is correct
self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
self.n_from_start_tokens += len(
self.tokenize(entry["text"].encode())
)
# we need to count the tokens in the from revision so token index is correct
# self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
# self.n_from_start_tokens += len(
# self.tokenize(entry["text"].encode())
# )
elif entry["type"] == 5:
linkId = entry["moveInfo"]["linkId"]
if linkId in self.par_move_dict:
yield from self.doParMove(self.par_move_dict[linkId], entry)
yield self.newline_result()
else:
self.par_move_dict[entry["moveInfo"]["id"]] = entry
# call doHighlightRange just to update the token indices
offset = {
"from": self.n_from_end_tokens,
"to": entry["offset"]["to"],
}
res = self.doHighlightRange(
entry["text"],
entry["highlightRanges"],
offset,
entry["lineNumber"],
update_idx="to",
)
list(res)
# offset = {
# "from": self.n_from_end_tokens,
# "to": entry["offset"]["to"],
# }
# res = self.doHighlightRange(
# entry["text"],
# entry["highlightRanges"],
# offset,
# entry["lineNumber"],
# update_idx="to",
# )
# list(res)
# self.n_to_end_tokens += len(self.tokenize(entry["text"].encode()))
# self.n_to_start_tokens += len(
# self.tokenize(entry["text"].encode())
# )
else:
# The 'type' isn't one of the known types
raise ValueError(entry)
@@ -180,99 +230,100 @@ class DiffToOperationMap:
# strictly increasing, while the "from" segments should merely be
# non-overlapping.
def doEqual(self, equal_segment, offset, update_idx="all", type=str):
if type is str:
def doEqual(self, equal_segment, offset, update_idx="all"):
# if from_token_start is None:
# from_token_start = self.n_from_start_tokens
# if to_token_start is None:
# to_token_start = self.n_to_start_tokens
if isinstance(equal_segment, str):
equal_bytes = equal_segment.encode()
elif type is bytes:
elif isinstance(equal_segment, bytes):
equal_bytes = equal_segment
else:
raise ValueError(equal_segment)
tokens = self.tokenize(equal_bytes)
n_tokens = len(tokens)
n_from_end_tokens = self.n_from_start_tokens + n_tokens
n_to_end_tokens = self.n_to_start_tokens + n_tokens
# we need to keep track of the to and from last end bytes
self.from_last_end_bytes = offset["from"] + len(equal_bytes)
self.to_last_end_bytes = offset["to"] + len(equal_bytes)
# token_offset = self.get_token_offset(offset)
# n_from_end_tokens = token_offset['from'] + n_tokens
# n_to_end_tokens = token_offset['to'] + n_tokens
yield (
Equal(
self.n_from_start_tokens,
n_from_end_tokens,
self.n_to_start_tokens,
n_to_end_tokens,
offset['from'],
None,
offset['to'],
None,
),
tokens,
tokens,
)
if update_idx in ["from", "all"]:
self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
# if update_idx in ["from", "all"]:
# self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
if update_idx in ["to", "all"]:
self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
# if update_idx in ["to", "all"]:
# self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
# self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
# self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
def doInsert(self, insert_segment, offset, update_idx="all", type=str):
if type is str:
def doInsert(self, insert_segment, offset, update_idx="all"):
if isinstance(insert_segment, str):
insert_bytes = insert_segment.encode()
elif type is bytes:
elif isinstance(insert_segment, bytes):
insert_bytes = insert_segment
else:
raise ValueError(insert_segment)
tokens = self.tokenize(insert_bytes)
n_tokens = len(tokens)
n_to_end_tokens = self.n_to_start_tokens + n_tokens
self.to_last_end_bytes = offset["to"] + len(insert_bytes)
# n_tokens = len(tokens)
# token_offset = self.get_token_offset(offset)
# n_to_end_tokens = token_offset['to'] + n_tokens
yield (
Insert(
self.n_from_start_tokens,
self.n_from_start_tokens,
self.n_to_start_tokens,
n_to_end_tokens,
None,
None,
offset['to'],
None,
),
[],
tokens,
)
# We have now used more of the "to" tokens.
if update_idx in ["to", "all"]:
self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
#self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
def doDelete(self, delete_segment, offset, update_idx="all", type=str):
if type is str:
if isinstance(delete_segment, str):
delete_bytes = delete_segment.encode()
elif type is bytes:
elif isinstance(delete_segment, bytes):
delete_bytes = delete_segment
else:
raise ValueError(delete_segment)
tokens = self.tokenize(delete_bytes)
n_tokens = len(tokens)
# n_tokens = len(tokens)
# token_offset = self.get_token_offset(offset)
# n_from_end_tokens = token_offset['from'] + n_tokens
n_from_end_tokens = self.n_from_start_tokens + n_tokens
self.from_last_end_bytes = offset["from"] + len(delete_bytes)
yield (
Delete(
self.n_from_start_tokens,
n_from_end_tokens,
self.n_to_start_tokens,
self.n_to_start_tokens,
offset['from'],
None,
None,
None
),
tokens,
[],
)
# We have now used more of the "from" tokens.
if update_idx in ["from", "all"]:
self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
#self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
def doHighlightRange(
self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"
):
self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"):
# The text field is an overlapping mix of both the from and to,
# so we need to handle it highlight-by-highlight.
# there can be gaps between highlight segments.
@@ -283,11 +334,15 @@ class DiffToOperationMap:
# it's possible for offset['to'] to be null.
# we can get it from the line number?
update_linenumber_map = True
if offset["to"] is None:
offset["to"] = self.from_byte_token_index_map[
find_greatest_le_key(lineNumber, self.from_byte_token_index_map)
]
keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
if keyidx > 0:
print(self.to_linenumber_bytes_map)
key = self.to_linenumber_bytes_map.keys()[keyidx]
offset["to"] = self.to_linenumber_bytes_map[key]
else:
offset["to"] = 0
highlight_offset = offset
# note that diffs are token-level, but the indexes are byte-level
@@ -299,10 +354,12 @@ class DiffToOperationMap:
equal_bytes = highlight_bytes[highlight_end:highlight_start]
n_equal_bytes = len(equal_bytes)
yield from self.doEqual(
equal_bytes, highlight_offset, update_idx=update_idx, type=bytes
equal_bytes, highlight_offset, update_idx=update_idx
)
highlight_offset["from"] += n_equal_bytes
highlight_offset["to"] += n_equal_bytes
if update_linenumber_map:
self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
# handle highlighted insert / delete
highlight_end = highlight_start + highlightRange["length"]
@@ -311,12 +368,14 @@ class DiffToOperationMap:
if highlightRange["type"] == 0:
yield from self.doInsert(
range_bytes, highlight_offset, update_idx=update_idx, type=bytes
range_bytes, highlight_offset, update_idx=update_idx
)
highlight_offset["to"] += n_range_bytes
if update_linenumber_map:
self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
elif highlightRange["type"] == 1:
yield from self.doDelete(
range_bytes, highlight_offset, update_idx=update_idx, type=bytes
range_bytes, highlight_offset, update_idx=update_idx
)
highlight_offset["from"] += n_range_bytes
else:
@@ -325,46 +384,14 @@ class DiffToOperationMap:
# handle the rest of the line which is equal
if highlight_end < len(highlight_bytes):
range_bytes = highlight_bytes[highlight_end:]
yield from self.doEqual(range_bytes, highlight_offset, type=bytes)
yield from self.doEqual(range_bytes, highlight_offset)
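# Shape of a wikidiff2 change entry as consumed above, limited to the fields
# this class reads (line number and offsets are made up for illustration):
#   {"type": 3, "lineNumber": 12,
#    "text": "The quick brown fox",
#    "offset": {"from": 1042, "to": 1063},
#    "highlightRanges": [{"start": 4, "length": 5, "type": 0}]}
# Inside highlightRanges, type 0 is treated as an insert and type 1 as a
# delete; entry-level types 4 and 5 are the two halves of a paragraph move,
# paired through moveInfo["id"] and moveInfo["linkId"].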
def doParMove(self, from_diff, to_diff):
# the tricky part here is to put the tokens in the right spots.
from_byte_start = from_diff["offset"]["from"]
# as of python 3.7 dictionaries are in insertion order. So
# we can just find the first key that's greater
# since the paragraph is removed in the "from" version, the index it is removed from
# will be *after* the
if len(self.from_byte_token_index_map) > 0:
if (
from_byte_start >= self.from_last_end_bytes
): # if the from paragraph is at the end
from_token_start = next(
reversed(self.from_byte_token_index_map.values())
)
else:
key = find_greatest_le_key(
from_byte_start, self.from_byte_token_index_map
)
from_token_start = self.from_byte_token_index_map[key]
else:
from_token_start = 0
if len(self.to_byte_token_index_map) > 0:
# get the to token index
to_byte_start = to_diff["offset"]["to"]
if to_byte_start >= self.to_last_end_bytes:
to_token_start = next(reversed(self.to_byte_token_index_map.values()))
else:
key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
to_token_start = self.to_byte_token_index_map[key]
else:
to_token_start = 0
# now we set the state and apply the highlights
self.n_from_start_tokens = self.n_from_end_tokens = from_token_start
self.n_to_start_tokens = self.n_to_end_tokens = to_token_start
to_byte_start = to_diff["offset"]["to"]
offset = {"from": from_byte_start, "to": to_byte_start}
# we need to cache the indexes; replace them; then restore
yield from self.doHighlightRange(
to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
)
@@ -397,35 +424,63 @@ class WikiDiffMatcher:
diff = next(self.diffs)
diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)
diffops = list(zip(*diffToOperationsMapper.to_operations()))
diffops = list(diffToOperationsMapper.to_operations())
if not diffops:
self.last_tokens = []
return [], [], []
# this happens when revisions are actually equal.
if len(diffops) == 0:
self.last_tokens = self.tokenizer.tokenize(text)
ops = [Equal(0, len(self.last_tokens),
0, len(self.last_tokens))]
return ops, self.last_tokens, self.last_tokens
diffops = (
operations,
aseq,
bseq,
) = diffops
# we get back the byte indices; now we transform to token indices
aseq = list(aseq)
diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1))
aorder_ops = []
token_offset = 0
_, aseq, _ = list(zip( * diffops))
# aseq/bseq can be out of order; we need to sort them by a1/b1 index.
indices = list(range(len(aseq)))
indices.sort(key=lambda i: operations[i].a1)
aseq = [aseq[i] for i in indices]
for op, tokens, _ in diffops:
a1 = token_offset
if isinstance(op, Equal) or isinstance(op, Delete):
token_offset += len(tokens)
a2 = token_offset
aorder_ops.append(type(op)(a1,
a2,
op.b1,
op.b1))
else:
aorder_ops.append(Insert(a1,
a1,
op.b1,
op.b1))
bseq = list(bseq)
indices = list(range(len(bseq)))
indices.sort(key=lambda i: operations[i].b1)
bseq = [bseq[i] for i in indices]
_, aseq, bseq = zip(* diffops)
diffops = list(zip(aorder_ops, aseq, bseq))
diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1))
_, _, bseq = list(zip(* diffops))
border_ops = []
token_offset = 0
for op, _, tokens in diffops:
b1 = token_offset
if isinstance(op, Equal) or isinstance(op, Insert):
token_offset += len(tokens)
b2 = token_offset
border_ops.append(type(op)(op.a1,
op.a2,
b1,
b2))
else:
border_ops.append(type(op)(op.a1,
op.a2,
b1,
b1))
self.previous_text = text
self.last_tokens = list(chain.from_iterable(aseq))
tokens = list(chain.from_iterable(bseq))
self.previous_text = text
return operations, self.last_tokens, tokens
return border_ops, self.last_tokens, tokens
def processor(self, *args, **kwargs):
return self.Processor(self.diffs, self.tokenizer)
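# Usage, as exercised by the tests (rev1/rev2 are revision texts; a running
# wikidiff2 service is assumed):
#   matcher = WikiDiffMatcher([rev1, rev2])
#   diff_processor = matcher.processor()
#   ops, a, b = diff_processor.process(rev1)
#   ops, a, b = diff_processor.process(rev2)
# ops is a list of Equal/Insert/Delete operations with token-level a1/a2/b1/b2
# ranges; a and b are the token sequences reconstructed from the diffs.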