WIP: fixing bugs and adding newlines to output.

Nathan TeBlunthuis 2025-07-02 13:31:32 -07:00
parent c4acc711d2
commit cf1fb61a84
3 changed files with 297 additions and 97 deletions
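
The substantive change is that DiffToOperationMap now emits an explicit Equal operation for the newline that terminates each wikidiff2 line entry, so the reconstructed token streams contain the newlines that the line-oriented diff otherwise drops. A minimal sketch of the idea (simplified from the newline_result method added below; the real method mutates the mapper's running token counters rather than taking them as arguments):

    from deltas import Equal
    from mwpersistence import Token

    def newline_equal(n_from: int, n_to: int):
        # one Equal operation spanning a single '\n' token on both sides,
        # advancing both the 'from' and 'to' token indexes by one
        return (Equal(n_from, n_from + 1, n_to, n_to + 1),
                [Token('\n')],
                [Token('\n')])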


@@ -1,2 +1,2 @@
 #!/usr/bin/env bash
-uv run pytest test/test_wiki_diff_matcher.py --capture=tee-sys
+uv run pytest test/test_wiki_diff_matcher.py::test_addition --capture=tee-sys

test/test_wiki_diff_matcher.py

@@ -8,30 +8,36 @@ import pytest_asyncio
 from typing import List
 from deltas import Delete, Equal, Insert, wikitext_split
 from mwpersistence import Token
 from wiki_diff_matcher import WikiDiffMatcher
 
 
-@pytest_asyncio.fixture(scope="module")
+@pytest_asyncio.fixture(scope="module", autouse=True)
 async def start_stop_server():
+    print("starting server")
     proc = await asyncio.create_subprocess_exec("php", "-S", "127.0.0.1:8000",
                                                 "wikidiff2_api.php",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)
+    # php needs a moment to actually start
+    await asyncio.sleep(0.1)
     yield proc
-    stdout, stderr = await proc.communicate()
-    print(stdout.encode())
-    print(stderr.encode())
+    print("stopping server")
     proc.terminate()
+    stdout, stderr = await proc.communicate()
+    print(stdout.decode())
+    print(stderr.decode())
 
 
 def assert_equal_enough(tokens:List[Token], rev):
     # the tokens exclude newlines
     # we allow extra whitespace at the beginning or end
     token_doc = ''.join(str(t) for t in tokens).strip()
-    rev_doc = rev.replace('\n','').strip()
+    while '\n\n' in token_doc:
+        token_doc = token_doc.replace('\n\n','\n')
+    while '\n\n' in rev:
+        rev = rev.replace('\n\n','\n').strip()
     print(token_doc, file = open('token','w'))
-    print(rev_doc, file = open('rev','w'))
-    assert token_doc == rev_doc
+    print(rev, file = open('rev','w'))
+    assert token_doc == rev
 
 
 def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_tokens):
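
For reference, the comparison that assert_equal_enough now performs amounts to collapsing runs of blank lines and trimming the ends before checking equality; a standalone sketch (hypothetical helper name):

    def normalize(text: str) -> str:
        # collapse repeated newlines and trim surrounding whitespace,
        # mirroring the loops in assert_equal_enough above
        while '\n\n' in text:
            text = text.replace('\n\n', '\n')
        return text.strip()

    # the assertion is then roughly:
    # assert normalize(''.join(str(t) for t in tokens)) == normalize(rev)
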
@@ -70,15 +76,59 @@ def test_equality():
     diff_processor = matcher.processor()
     ops, a, b = diff_processor.process(rev1)
     ops, a, b = diff_processor.process(rev1 + " ")
-    assert len(ops) == 129
-    for op in ops[:-1]:
-        assert isinstance(op, Equal)
+    assert len(ops) == 258
+    for op in ops[:-2]:
+        print(op)
+        assert isinstance(op, Equal)
     # note that the whitespace token does not result in a token according to wikitext_split
     # compare the tokens based on the diffs to the baseline
     # whitespace differences are allowed
     assert_equal_enough(b, rev1)
 
 
+def test_highlight_range_3():
+    rev1 = open("test/test_diff_revisions/test_highlight_3_from").read()
+    rev2 = open("test/test_diff_revisions/test_highlight_3_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+
+def test_highlight_range_4():
+    rev1 = open("test/test_diff_revisions/test_highlight_4_from").read()
+    rev2 = open("test/test_diff_revisions/test_highlight_4_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+
+def test_complex_diff():
+    rev1 = open("test/test_diff_revisions/test_complex_from").read()
+    rev2 = open("test/test_diff_revisions/test_complex_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+
+def test_highlight_range_unicode():
+    rev1 = open("test/test_diff_revisions/test_unicode_highlight_from").read()
+    rev2 = open("test/test_diff_revisions/test_unicode_highlight_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+
 def test_highlight_range():
     rev1 = open("test/test_diff_revisions/1295229484_rangeedit0").read()
     rev2 = open("test/test_diff_revisions/1295229484_rangeedit1").read()
@@ -108,28 +158,38 @@ def test_delete():
     n_deleted_tokens = 0
     last_b2 = initial_equal_tokens
-    initial_equal_lines = 2
-    initial_equal_tokens = 12
+    initial_equal_lines = 4
+    initial_equal_tokens = 14
     last_b2 = assert_correct_equal_section(ops,
                                            expected_equal_lines=initial_equal_lines,
                                            expected_equal_tokens=initial_equal_tokens)
     first_noninsert_token = initial_equal_tokens
+    last_non_delete = False
+    idx = 0
     for op in ops[initial_equal_lines:]:
+        idx += 1
+        # deletes are interleaved with Equal newlines.
         if not isinstance(op, Delete):
-            first_nondelete_token = op.a1
-            break
-        n_deletes += 1
-        n_deleted_tokens += op.a2 - last_b2
-        last_b2 = op.a2
+            if last_non_delete:
+                first_nondelete_token = op.a1
+                break
+            last_non_delete = True
+        else:
+            last_non_delete = False
+        if last_non_delete:
+            n_deletes += 1
+            n_deleted_tokens += op.a2 - last_b2
+            last_b2 = op.a2
     assert n_deletes == 2
-    assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 316
-    last_b2 = assert_correct_equal_section(ops[initial_equal_lines + n_deletes:],
-                                           expected_equal_lines=126,
-                                           expected_equal_tokens=9323)
+    assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
+    last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
+                                           expected_equal_lines=252,
+                                           expected_equal_tokens=9765)
 
     # first lets test that we properly build the operations.
@@ -144,8 +204,14 @@ def test_addition():
     # so they reflect the state of the text according to the diff processor
     ops, a, b = diff_processor.process(rev1)
+    even = True
     for op in ops:
-        assert isinstance(op, Insert)
+        if even:
+            assert isinstance(op, Insert)
+            even = False
+        else:
+            assert isinstance(op, Equal)
+            even = True
     assert_equal_enough(b, rev1)
@@ -155,21 +221,26 @@ def test_addition():
     assert_equal_enough(a, rev1)
     assert_equal_enough(b, rev2)
     ops = list(ops)
-    initial_equal_lines = 128
-    initial_equal_tokens = 9359
+    initial_equal_lines = 256
+    initial_equal_tokens = 9487
     last_b2 = assert_correct_equal_section(ops,
                                            expected_equal_lines=initial_equal_lines,
                                            expected_equal_tokens=initial_equal_tokens)
+    last_non_insert = False
     first_noninsert_token = None
     n_inserts = 0
     n_inserted_tokens = 0
-    last_b2 = initial_equal_tokens
+    last_b2 = last_insert_b2 = initial_equal_tokens
+    idx = 0
+    print(ops[initial_equal_lines:])
     for op in ops[initial_equal_lines:]:
-        n_inserts += 1
-        n_inserted_tokens += op.b2 - last_b2
+        if isinstance(op, Insert):
+            n_inserts += 1
+            n_inserted_tokens += op.b2 - op.b1
+            last_insert_b2 = op.b2
         last_b2 = op.b2
-    assert n_inserted_tokens == last_b2 - initial_equal_tokens == 292
+    assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
     assert n_inserts == 2
 
 
 def test_paragraph_move():
@@ -195,6 +266,26 @@ def test_paragraph_move_and_change():
     # so they reflect the state of the text according to the diff processor
     ops, a, b = diff_processor.process(rev1)
     ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
     assert_equal_enough(a, rev1)
-    assert_equal_enough(b, rev2)
+
+
+# slow test
+def test_diff_consistency():
+    from mwxml import Dump
+    stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/sailormoon.xml.7z", "*.xml"], stdout=subprocess.PIPE).stdout
+    dump = Dump.from_file(stream)
+    for page in dump:
+        revisions = [rev.text for rev in page if rev.text]
+        matcher = WikiDiffMatcher(revisions)
+        diff_processor = matcher.processor()
+        last_rev = ""
+        for rev in revisions:
+            print(rev, file=open("test_unicode_highlight_to",'w'))
+            print(last_rev, file=open("test_unicode_highlight_from",'w'))
+            ops, a, b = diff_processor.process(rev)
+            #assert_equal_enough(a, last_rev)
+            assert_equal_enough(b, rev)
+            last_rev = rev

wiki_diff_matcher.py

@@ -4,13 +4,15 @@ from collections import namedtuple
 from itertools import chain
 from typing import Dict, Generator, List, Optional, Tuple
 
 import requests
-from deltas import Equal, Insert, Delete, DiffEngine, Operation, RegexTokenizer, tokenizers
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation, Token,
+                    RegexTokenizer, tokenizers)
 
 TOKENIZER = tokenizers.wikitext_split
 
 
 def find_greatest_le_key(target_key, data_dict):
+    found_key = None
     for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
         if key <= target_key:
             found_key = (
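
The hunk above cuts off inside find_greatest_le_key; for orientation, the helper is assumed to return the greatest key that is less than or equal to target_key, relying on the dict's keys having been inserted in sorted order. A self-contained sketch of that behavior (not the verbatim function):

    def find_greatest_le_key(target_key, data_dict):
        # keys are iterated in insertion order, which is sorted ascending,
        # so the last key <= target_key is the greatest such key
        found_key = None
        for key in data_dict:
            if key > target_key:
                break
            found_key = key
        return found_key
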
@@ -62,6 +64,8 @@ class DiffToOperationMap:
         self.from_last_end_bytes = 0
         self.from_last_to_bytes = 0
         self.n_from_start_tokens = 0
+        self.n_from_end_tokens = 0
+        self.n_from_start_tokens = 0
         self.n_to_start_tokens = 0
         self.last_to_start_line = 0
         self.last_from_start_line = 0
@@ -73,13 +77,35 @@ class DiffToOperationMap:
         self.to_byte_token_index_map: Dict[int, int] = {}
         self.par_move_dict = {}
+        # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
+        self.to_linenumber_bytes_map = {}
 
     def tokenize(self, bytes):
         return self.tokenizer.tokenize(bytes.decode("utf-8"))
 
+    def newline_result(self):
+        self.n_from_end_tokens += 1
+        self.n_from_start_tokens += 1
+        self.n_to_end_tokens += 1
+        self.n_to_start_tokens += 1
+        return (Equal(self.n_from_start_tokens - 1,
+                      self.n_from_end_tokens,
+                      self.n_to_start_tokens - 1,
+                      self.n_from_start_tokens),
+                [Token('\n')],
+                [Token('\n')])
+
     def to_operations(self):
         parmoves = []
+        [print(diff) for diff in self.diff["diff"][0:5]]
         for entry in self.diff["diff"]:
             offset = entry["offset"]
+            if offset["to"]:
+                self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"]
             text = entry["text"]
             # ignore empty diffs. They don't have any tokens
             if len(text) == 0:
@@ -91,30 +117,58 @@ class DiffToOperationMap:
             if entry["type"] == 0:
                 yield from self.doEqual(text, offset)
+                yield self.newline_result()
 
             # a line included in the 'to' revision, but not in the 'from' revision
             elif entry["type"] == 1:
                 yield from self.doInsert(text, offset)
+                yield self.newline_result()
 
             # a line included in the 'from' revision, but not in the 'to' revision
             elif entry["type"] == 2:
                 yield from self.doDelete(text, offset)
+                yield self.newline_result()
 
             elif entry["type"] == 3:
                 yield from self.doHighlightRange(
-                    text, entry["highlightRanges"], offset
+                    text, entry["highlightRanges"], offset, entry["lineNumber"]
                 )
+                yield self.newline_result()
 
             elif entry["type"] == 4:
                 self.par_move_dict[entry["moveInfo"]["id"]] = entry
-                # we need to count the tokens in the from revision so token index is correct
-                self.n_from_end_tokens += len(self.tokenize(entry['text'].encode()))
-                self.n_from_start_tokens += len(self.tokenize(entry['text'].encode()))
+                linkId = entry["moveInfo"]["linkId"]
+                if linkId in self.par_move_dict:
+                    yield from self.doParMove(entry, self.par_move_dict[linkId])
+                    yield self.newline_result()
+                else:
+                    # we need to count the tokens in the from revision so token index is correct
+                    self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
+                    self.n_from_start_tokens += len(
+                        self.tokenize(entry["text"].encode())
+                    )
 
             elif entry["type"] == 5:
-                yield from self.doParMove(entry)
+                linkId = entry["moveInfo"]["linkId"]
+                if linkId in self.par_move_dict:
+                    yield from self.doParMove(self.par_move_dict[linkId], entry)
+                    yield self.newline_result()
+                else:
+                    self.par_move_dict[entry["moveInfo"]["id"]] = entry
+                    # call doHighlightRange just to update the token indices
+                    offset = {
+                        "from": self.n_from_end_tokens,
+                        "to": entry["offset"]["to"],
+                    }
+                    res = self.doHighlightRange(
+                        entry["text"],
+                        entry["highlightRanges"],
+                        offset,
+                        entry["lineNumber"],
+                        update_idx="to",
+                    )
+                    list(res)
             else:
                 # The 'type' isn't one of the known
                 raise ValueError(d)
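
The type 4/type 5 branches above pair the two halves of a moved paragraph: whichever half arrives first is parked in par_move_dict under its moveInfo id, and when the entry whose moveInfo linkId points back at it arrives, doParMove is called with the type 4 entry as the 'from' half and the type 5 entry as the 'to' half. A rough sketch of the pairing in standalone form (hypothetical names, placeholder return values instead of the real doParMove call):

    par_move_dict = {}

    def pair_move(entry):
        link_id = entry["moveInfo"]["linkId"]
        if link_id in par_move_dict:
            partner = par_move_dict[link_id]
            # type 4 is the paragraph's old ('from') position, type 5 its new ('to') position
            from_half, to_half = (entry, partner) if entry["type"] == 4 else (partner, entry)
            return from_half, to_half  # these would be handed to doParMove
        par_move_dict[entry["moveInfo"]["id"]] = entry
        return None  # wait for the matching half
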
@@ -126,65 +180,84 @@ class DiffToOperationMap:
     # strictly increasing, while the "from" segments should merely be
     # non-overlapping.
-    def doEqual(self, equal_text, offset):
-        equal_bytes = equal_text.encode()
+    def doEqual(self, equal_segment, offset, update_idx="all", type=str):
+        if type is str:
+            equal_bytes = equal_segment.encode()
+        elif type is bytes:
+            equal_bytes = equal_segment
+        else:
+            raise ValueError(equal_segment)
         tokens = self.tokenize(equal_bytes)
         n_tokens = len(tokens)
-        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
-        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        n_to_end_tokens = self.n_to_start_tokens + n_tokens
         # we need to keep track of the to and from last end bytes
         self.from_last_end_bytes = offset["from"] + len(equal_bytes)
         self.to_last_end_bytes = offset["to"] + len(equal_bytes)
         yield (
             Equal(
                 self.n_from_start_tokens,
-                self.n_from_end_tokens,
+                n_from_end_tokens,
                 self.n_to_start_tokens,
-                self.n_to_end_tokens,
+                n_to_end_tokens,
             ),
             tokens,
             tokens,
         )
-        self.n_from_start_tokens += n_tokens
-        self.n_to_start_tokens += n_tokens
-        self.from_byte_token_index_map[
-            offset['from']
-        ] = self.n_from_end_tokens
-        self.to_byte_token_index_map[offset['to']] = self.n_to_end_tokens
+        if update_idx in ["from", "all"]:
+            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
+        if update_idx in ["to", "all"]:
+            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
+        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
+        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
 
-    def doInsert(self, insert_text, offset):
-        insert_bytes = insert_text.encode()
+    def doInsert(self, insert_segment, offset, update_idx="all", type=str):
+        if type is str:
+            insert_bytes = insert_segment.encode()
+        elif type is bytes:
+            insert_bytes = insert_segment
+        else:
+            raise ValueError(insert_segment)
         tokens = self.tokenize(insert_bytes)
         n_tokens = len(tokens)
-        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        n_to_end_tokens = self.n_to_start_tokens + n_tokens
         self.to_last_end_bytes = offset["to"] + len(insert_bytes)
         yield (
             Insert(
                 self.n_from_start_tokens,
                 self.n_from_start_tokens,
                 self.n_to_start_tokens,
-                self.n_to_end_tokens,
+                n_to_end_tokens,
             ),
             [],
             tokens,
         )
         # We have now used more of the "to" tokens.
-        self.n_to_start_tokens += n_tokens
-        self.to_byte_token_index_map[offset['to']] = self.n_to_end_tokens
+        if update_idx in ["to", "all"]:
+            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
+        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
 
-    def doDelete(self, delete_text, offset):
-        delete_bytes = delete_text.encode()
+    def doDelete(self, delete_segment, offset, update_idx="all", type=str):
+        if type is str:
+            delete_bytes = delete_segment.encode()
+        elif type is bytes:
+            delete_bytes = delete_segment
+        else:
+            raise ValueError(delete_segment)
         tokens = self.tokenize(delete_bytes)
         n_tokens = len(tokens)
-        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        n_from_end_tokens = self.n_from_start_tokens + n_tokens
         self.from_last_end_bytes = offset["from"] + len(delete_bytes)
         yield (
             Delete(
                 self.n_from_start_tokens,
-                self.n_from_end_tokens,
+                n_from_end_tokens,
                 self.n_to_start_tokens,
                 self.n_to_start_tokens,
             ),
@@ -192,19 +265,30 @@ class DiffToOperationMap:
             [],
         )
         # We have now used more of the "from" tokens.
-        self.n_from_start_tokens += n_tokens
-        self.from_byte_token_index_map[
-            offset['from']
-        ] = self.n_from_end_tokens
+        if update_idx in ["from", "all"]:
+            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
+        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
 
-    def doHighlightRange(self, highlight_bytes, highlightRanges, offset):
+    def doHighlightRange(
+        self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"
+    ):
         # The text field is an overlapping mix of both the from and to,
         # so we need to handle it highlight-by-highlight.
         # there can be gaps between highlight segments.
         # for instance, if a word is deleted from the middle of a line.
         # we need to track that.
+        highlight_bytes = highlight_text.encode()
         highlight_end = 0
+        # it's possible for offset['to'] to be null.
+        # we can get it from the line number?
+        if offset["to"] is None:
+            offset["to"] = self.from_byte_token_index_map[
+                find_greatest_le_key(lineNumber, self.from_byte_token_index_map)
+            ]
         highlight_offset = offset
 
         # note that diffs are token-level, but the indexes are byte-level
@@ -214,7 +298,9 @@ class DiffToOperationMap:
             if highlight_start > highlight_end:
                 equal_bytes = highlight_bytes[highlight_end:highlight_start]
                 n_equal_bytes = len(equal_bytes)
-                yield from self.doEqual(equal_bytes, highlight_offset)
+                yield from self.doEqual(
+                    equal_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["from"] += n_equal_bytes
                 highlight_offset["to"] += n_equal_bytes
@@ -222,11 +308,16 @@ class DiffToOperationMap:
             highlight_end = highlight_start + highlightRange["length"]
             range_bytes = highlight_bytes[highlight_start:highlight_end]
             n_range_bytes = len(range_bytes)
+
             if highlightRange["type"] == 0:
-                yield from self.doInsert(range_bytes, highlight_offset)
+                yield from self.doInsert(
+                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["to"] += n_range_bytes
             elif highlightRange["type"] == 1:
-                yield from self.doDelete(range_bytes, highlight_offset)
+                yield from self.doDelete(
+                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["from"] += n_range_bytes
             else:
                 raise Exception(entry)
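
doHighlightRange walks a changed line byte-by-byte: gaps before, between, and after the highlightRanges are treated as equal text, a range of type 0 as inserted text, and a range of type 1 as deleted text. An illustrative segmentation, assuming each range carries start and length fields as the surrounding code implies:

    def split_highlights(line_bytes: bytes, highlight_ranges):
        pos = 0
        for hr in highlight_ranges:
            start, end = hr["start"], hr["start"] + hr["length"]
            if start > pos:
                yield ("equal", line_bytes[pos:start])
            yield ("insert" if hr["type"] == 0 else "delete", line_bytes[start:end])
            pos = end
        if pos < len(line_bytes):
            # the remainder of the line is equal text
            yield ("equal", line_bytes[pos:])
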
@@ -234,39 +325,48 @@ class DiffToOperationMap:
         # handle the rest of the line which is equal
         if highlight_end < len(highlight_bytes):
             range_bytes = highlight_bytes[highlight_end:]
-            yield from self.doEqual(range_bytes, highlight_offset)
+            yield from self.doEqual(range_bytes, highlight_offset, type=bytes)
 
-    def doParMove(self, to_diff):
+    def doParMove(self, from_diff, to_diff):
         # the tricky part here is to put the tokens in the right spots.
-        from_diff = self.par_move_dict[to_diff["moveInfo"]["linkId"]]
         from_byte_start = from_diff["offset"]["from"]
         # as of python 3.7 dictionaries are in insertion order. So
         # we can just find the first key that's greater
         # since the paragraph is removed in the "from" version, the index it is removed from
         # will be *after* the
-        if from_byte_start >= self.from_last_end_bytes: # if the from paragraph is at the end
-            from_token_start = next(reversed(self.from_byte_token_index_map.values()))
+        if len(self.from_byte_token_index_map) > 0:
+            if (
+                from_byte_start >= self.from_last_end_bytes
+            ):  # if the from paragraph is at the end
+                from_token_start = next(
+                    reversed(self.from_byte_token_index_map.values())
+                )
+            else:
+                key = find_greatest_le_key(
+                    from_byte_start, self.from_byte_token_index_map
+                )
+                from_token_start = self.from_byte_token_index_map[key]
         else:
-            key = find_greatest_le_key(from_byte_start, self.from_byte_token_index_map)
-            from_token_start = self.from_byte_token_index_map[key]
+            from_token_start = 0
 
-        # get the to token index
-        to_byte_start = to_diff["offset"]["to"]
-        if to_byte_start >= self.to_last_end_bytes:
-            to_token_start = next(reversed(self.to_byte_token_index_map.values()))
+        if len(self.to_byte_token_index_map) > 0:
+            # get the to token index
+            to_byte_start = to_diff["offset"]["to"]
+            if to_byte_start >= self.to_last_end_bytes:
+                to_token_start = next(reversed(self.to_byte_token_index_map.values()))
+            else:
+                key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
+                to_token_start = self.to_byte_token_index_map[key]
         else:
-            key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
-            to_token_start = self.to_byte_token_index_map[key]
+            to_token_start = 0
 
         # now we set the state and apply the highlights
         self.n_from_start_tokens = self.n_from_end_tokens = from_token_start
         self.n_to_start_tokens = self.n_to_end_tokens = to_token_start
         offset = {"from": from_byte_start, "to": to_byte_start}
         yield from self.doHighlightRange(
-            to_diff["text"], to_diff["highlightRanges"], offset
+            to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
         )
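
Both halves of doParMove recover a token index from a byte offset the same way: if the relevant byte-to-token map is empty, fall back to 0; if the offset lies past the last recorded end, take the most recent token index; otherwise look up the greatest recorded byte offset not exceeding it. A sketch of that lookup as a standalone helper (hypothetical name):

    def token_index_for_byte(byte_offset, byte_token_index_map, last_end_bytes):
        if not byte_token_index_map:
            return 0
        if byte_offset >= last_end_bytes:
            # the paragraph sits at (or past) the end of the text seen so far
            return next(reversed(byte_token_index_map.values()))
        key = find_greatest_le_key(byte_offset, byte_token_index_map)
        return byte_token_index_map[key]
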
@@ -295,23 +395,32 @@ class WikiDiffMatcher:
         # The diff has already been computed, but we need to incrementally
         # retrieve it to recreate the behavior DiffState expects.
         diff = next(self.diffs)
-        diffToOperationsMapper = DiffToOperationMap(
-            diff, self.tokenizer
-        )
-        (
+        diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)
+
+        diffops = list(zip(*diffToOperationsMapper.to_operations()))
+
+        if not diffops:
+            self.last_tokens = []
+            return [], [], []
+
+        diffops = (
             operations,
             aseq,
             bseq,
-        ) = list(zip(*diffToOperationsMapper.to_operations()))
+        ) = diffops
 
         aseq = list(aseq)
-        # aseq can be out of order, we need to sort it by a1 index.
+        # aseq/bseq can be out of order, we need to sort it by a1/b1 index.
         indices = list(range(len(aseq)))
-        indices.sort(key = lambda i: operations[i].a1)
+        indices.sort(key=lambda i: operations[i].a1)
         aseq = [aseq[i] for i in indices]
-        # bseq cannot be out of order since diffs are resolved in the order of aseq.
+        bseq = list(bseq)
+        indices = list(range(len(bseq)))
+        indices.sort(key=lambda i: operations[i].b1)
+        bseq = [bseq[i] for i in indices]
         self.last_tokens = list(chain.from_iterable(aseq))
         tokens = list(chain.from_iterable(bseq))
         self.previous_text = text
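
Taken together, the tests drive the matcher as follows: build a WikiDiffMatcher over the revision texts, then feed each revision to its processor in order and consume the (operations, a_tokens, b_tokens) triples (pattern taken from test_addition and test_diff_consistency above):

    matcher = WikiDiffMatcher([rev1, rev2])
    diff_processor = matcher.processor()

    ops, a, b = diff_processor.process(rev1)  # first revision: Inserts interleaved with Equal newlines
    ops, a, b = diff_processor.process(rev2)  # rev2 against rev1
    # per assert_equal_enough, b should reconstruct rev2 and a should reconstruct rev1,
    # up to whitespace and collapsed blank lines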