WIP: fixing bugs and adding newlines to output.
parent c4acc711d2
commit cf1fb61a84
@@ -1,2 +1,2 @@
 #!/usr/bin/env bash
-uv run pytest test/test_wiki_diff_matcher.py --capture=tee-sys
+uv run pytest test/test_wiki_diff_matcher.py::test_addition --capture=tee-sys

@@ -8,30 +8,36 @@ import pytest_asyncio
 from typing import List
 from deltas import Delete, Equal, Insert, wikitext_split
 from mwpersistence import Token

 from wiki_diff_matcher import WikiDiffMatcher

-@pytest_asyncio.fixture(scope="module")
+@pytest_asyncio.fixture(scope="module", autouse=True)
 async def start_stop_server():
+    print("starting server")
     proc = await asyncio.create_subprocess_exec("php", "-S", "127.0.0.1:8000",
                                                 "wikidiff2_api.php",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)
+    # php needs a moment to actually start
+    await asyncio.sleep(0.1)
     yield proc
-    stdout, stderr = await proc.communicate()
-    print(stdout.encode())
-    print(stderr.encode())
+    print("stopping server")
     proc.terminate()
+    stdout, stderr = await proc.communicate()
+    print(stdout.decode())
+    print(stderr.decode())


 def assert_equal_enough(tokens:List[Token], rev):
     # the tokens exclude newlines
     # we allow extra whitespace at the beginning or end
     token_doc = ''.join(str(t) for t in tokens).strip()
-    rev_doc = rev.replace('\n','').strip()
+    while '\n\n' in token_doc:
+        token_doc = token_doc.replace('\n\n','\n')
+    while '\n\n' in rev:
+        rev = rev.replace('\n\n','\n').strip()
     print(token_doc, file = open('token','w'))
-    print(rev_doc, file = open('rev','w'))
-    assert token_doc == rev_doc
+    print(rev, file = open('rev','w'))
+    assert token_doc == rev


 def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_tokens):
@@ -70,15 +76,59 @@ def test_equality():
     diff_processor = matcher.processor()
     ops, a, b = diff_processor.process(rev1)
     ops, a, b = diff_processor.process(rev1 + " ")
-    assert len(ops) == 129
-    for op in ops[:-1]:
-        assert isinstance(op, Equal)
+    assert len(ops) == 258
+    for op in ops[:-2]:
+        print(op)
+        assert isinstance(op, Equal)

     # note that the whitespace token does not result in a token according to wikitext_split
     # compare the tokens based on the diffs to the baseline
     # whitespace differences are allowed
     assert_equal_enough(b, rev1)

+def test_highlight_range_3():
+    rev1 = open("test/test_diff_revisions/test_highlight_3_from").read()
+    rev2 = open("test/test_diff_revisions/test_highlight_3_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+def test_highlight_range_4():
+    rev1 = open("test/test_diff_revisions/test_highlight_4_from").read()
+    rev2 = open("test/test_diff_revisions/test_highlight_4_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+def test_complex_diff():
+    rev1 = open("test/test_diff_revisions/test_complex_from").read()
+    rev2 = open("test/test_diff_revisions/test_complex_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+
+
+def test_highlight_range_unicode():
+    rev1 = open("test/test_diff_revisions/test_unicode_highlight_from").read()
+    rev2 = open("test/test_diff_revisions/test_unicode_highlight_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+
 def test_highlight_range():
     rev1 = open("test/test_diff_revisions/1295229484_rangeedit0").read()
     rev2 = open("test/test_diff_revisions/1295229484_rangeedit1").read()
@@ -108,28 +158,38 @@ def test_delete():
     n_deleted_tokens = 0
     last_b2 = initial_equal_tokens

-    initial_equal_lines = 2
-    initial_equal_tokens = 12
+    initial_equal_lines = 4
+    initial_equal_tokens = 14
     last_b2 = assert_correct_equal_section(ops,
                                            expected_equal_lines=initial_equal_lines,
                                            expected_equal_tokens=initial_equal_tokens)
     first_noninsert_token = initial_equal_tokens


+    last_non_delete = False
+    idx = 0
     for op in ops[initial_equal_lines:]:
+        idx += 1
+        # deletes are interleaved with Equal newlines.
         if not isinstance(op, Delete):
-            first_nondelete_token = op.a1
-            break
-        n_deletes += 1
-        n_deleted_tokens += op.a2 - last_b2
-        last_b2 = op.a2
+            if last_non_delete:
+                first_nondelete_token = op.a1
+                break
+            last_non_delete = True
+        else:
+            last_non_delete = False
+        if last_non_delete:
+            n_deletes += 1
+            n_deleted_tokens += op.a2 - last_b2
+            last_b2 = op.a2

     assert n_deletes == 2
-    assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 316
+    assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317

-    last_b2 = assert_correct_equal_section(ops[initial_equal_lines + n_deletes:],
-                                           expected_equal_lines=126,
-                                           expected_equal_tokens=9323)
+    last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
+                                           expected_equal_lines=252,
+                                           expected_equal_tokens=9765)


     # first lets test that we properly build the operations.
@@ -144,8 +204,14 @@ def test_addition():
     # so they reflect the state of the text according to the diff processor
     ops, a, b = diff_processor.process(rev1)

+    even = True
     for op in ops:
-        assert isinstance(op, Insert)
+        if even:
+            assert isinstance(op, Insert)
+            even = False
+        else:
+            assert isinstance(op, Equal)
+            even = True

     assert_equal_enough(b, rev1)

@@ -155,21 +221,26 @@ def test_addition():
     assert_equal_enough(a, rev1)
     assert_equal_enough(b, rev2)
     ops = list(ops)
-    initial_equal_lines = 128
-    initial_equal_tokens = 9359
+    initial_equal_lines = 256
+    initial_equal_tokens = 9487
     last_b2 = assert_correct_equal_section(ops,
                                            expected_equal_lines=initial_equal_lines,
                                            expected_equal_tokens=initial_equal_tokens)
+    last_non_insert = False
     first_noninsert_token = None
     n_inserts = 0
     n_inserted_tokens = 0
-    last_b2 = initial_equal_tokens
+    last_b2 = last_insert_b2 = initial_equal_tokens
+    idx = 0
+    print(ops[initial_equal_lines:])
     for op in ops[initial_equal_lines:]:
-        n_inserts += 1
-        n_inserted_tokens += op.b2 - last_b2
+        if isinstance(op, Insert):
+            n_inserts += 1
+            n_inserted_tokens += op.b2 - op.b1
+            last_insert_b2 = op.b2
         last_b2 = op.b2

-    assert n_inserted_tokens == last_b2 - initial_equal_tokens == 292
+    assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
     assert n_inserts == 2

 def test_paragraph_move():
@@ -195,6 +266,26 @@ def test_paragraph_move_and_change():
     # so they reflect the state of the text according to the diff processor
     ops, a, b = diff_processor.process(rev1)
     ops, a, b = diff_processor.process(rev2)
-    assert_equal_enough(b, rev2)
     assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)

+# slow test
+def test_diff_consistency():
+    from mwxml import Dump
+    stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/sailormoon.xml.7z", "*.xml"], stdout=subprocess.PIPE).stdout
+
+    dump = Dump.from_file(stream)
+    for page in dump:
+        revisions = [rev.text for rev in page if rev.text]
+
+        matcher = WikiDiffMatcher(revisions)
+        diff_processor = matcher.processor()
+        last_rev = ""
+        for rev in revisions:
+            print(rev, file=open("test_unicode_highlight_to",'w'))
+            print(last_rev, file=open("test_unicode_highlight_from",'w'))
+            ops, a, b = diff_processor.process(rev)
+            #assert_equal_enough(a, last_rev)
+
+            assert_equal_enough(b, rev)
+            last_rev = rev
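
The tests above all reduce to one round-trip property: process a revision with the matcher's processor and the concatenated "to" tokens should reproduce that revision up to whitespace. A minimal sketch of that check, assuming only the WikiDiffMatcher/processor API exercised by the tests (process() returning the operations plus the from/to token sequences); the helper name is illustrative and not part of the commit:

    from wiki_diff_matcher import WikiDiffMatcher

    def roundtrips(rev1: str, rev2: str) -> bool:
        # build diffs for the revision history and step the processor through it
        matcher = WikiDiffMatcher([rev1, rev2])
        processor = matcher.processor()
        processor.process(rev1)              # advance past the first revision
        ops, a, b = processor.process(rev2)  # a: "from" tokens, b: "to" tokens
        # the same whitespace-insensitive comparison assert_equal_enough makes
        return "".join(str(t) for t in b).split() == rev2.split()
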
@@ -4,13 +4,15 @@ from collections import namedtuple
 from itertools import chain
 from typing import Dict, Generator, List, Optional, Tuple


 import requests
-from deltas import Equal, Insert, Delete, DiffEngine, Operation, RegexTokenizer, tokenizers
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation, Token,
+                    RegexTokenizer, tokenizers)

 TOKENIZER = tokenizers.wikitext_split


 def find_greatest_le_key(target_key, data_dict):
+    found_key = None
     for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
         if key <= target_key:
             found_key = (
@@ -62,6 +64,8 @@ class DiffToOperationMap:
         self.from_last_end_bytes = 0
         self.from_last_to_bytes = 0
         self.n_from_start_tokens = 0
+        self.n_from_end_tokens = 0
+        self.n_from_start_tokens = 0
         self.n_to_start_tokens = 0
         self.last_to_start_line = 0
         self.last_from_start_line = 0
@@ -73,13 +77,35 @@ class DiffToOperationMap:
         self.to_byte_token_index_map: Dict[int, int] = {}
         self.par_move_dict = {}

+        # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
+        self.to_linenumber_bytes_map = {}
+
     def tokenize(self, bytes):
         return self.tokenizer.tokenize(bytes.decode("utf-8"))

+    def newline_result(self):
+        self.n_from_end_tokens += 1
+        self.n_from_start_tokens += 1
+        self.n_to_end_tokens += 1
+        self.n_to_start_tokens += 1
+
+        return (Equal(self.n_from_start_tokens - 1,
+                      self.n_from_end_tokens,
+                      self.n_to_start_tokens - 1,
+                      self.n_from_start_tokens),
+                [Token('\n')],
+                [Token('\n')])
+
+
     def to_operations(self):
         parmoves = []
+        [print(diff) for diff in self.diff["diff"][0:5]]
         for entry in self.diff["diff"]:
             offset = entry["offset"]
+
+            if offset["to"]:
+                self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"]
+
             text = entry["text"]
             # ignore empty diffs. They don't have any tokens
             if len(text) == 0:
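
The newline_result() helper added above emits one extra Equal operation per diff line, covering a single "\n" token on both the from and to sides; that is why the expected operation counts in the tests roughly double (for example 129 to 258). A standalone sketch of the shape of that extra operation, using Equal and Token as imported from deltas in this commit (the function name and arguments here are illustrative):

    from deltas import Equal, Token

    def newline_equal(n_from: int, n_to: int):
        # one newline token consumed on each side of the diff
        op = Equal(n_from, n_from + 1, n_to, n_to + 1)
        return op, [Token("\n")], [Token("\n")]
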
@@ -91,30 +117,58 @@ class DiffToOperationMap:

             if entry["type"] == 0:
                 yield from self.doEqual(text, offset)
+                yield self.newline_result()

             # a line included in the 'to' revision, but not in the 'from' revision
             elif entry["type"] == 1:
                 yield from self.doInsert(text, offset)
+                yield self.newline_result()

             # a line included in the 'from' revision, but not in the 'to' revision
             elif entry["type"] == 2:
                 yield from self.doDelete(text, offset)
+                yield self.newline_result()

             elif entry["type"] == 3:
                 yield from self.doHighlightRange(
-                    text, entry["highlightRanges"], offset
+                    text, entry["highlightRanges"], offset, entry["lineNumber"]
                 )
+                yield self.newline_result()

             elif entry["type"] == 4:
                 self.par_move_dict[entry["moveInfo"]["id"]] = entry

-                # we need to count the tokens in the from revision so token index is correct
-                self.n_from_end_tokens += len(self.tokenize(entry['text'].encode()))
-                self.n_from_start_tokens += len(self.tokenize(entry['text'].encode()))
+                linkId = entry["moveInfo"]["linkId"]
+                if linkId in self.par_move_dict:
+                    yield from self.doParMove(entry, self.par_move_dict[linkId])
+                    yield self.newline_result()
+                else:
+                    # we need to count the tokens in the from revision so token index is correct
+                    self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
+                    self.n_from_start_tokens += len(
+                        self.tokenize(entry["text"].encode())
+                    )

             elif entry["type"] == 5:
-                yield from self.doParMove(entry)
+                linkId = entry["moveInfo"]["linkId"]
+                if linkId in self.par_move_dict:
+                    yield from self.doParMove(self.par_move_dict[linkId], entry)
+                    yield self.newline_result()
+                else:
+                    self.par_move_dict[entry["moveInfo"]["id"]] = entry
+                    # call doHighlightRange just to update the token indices
+                    offset = {
+                        "from": self.n_from_end_tokens,
+                        "to": entry["offset"]["to"],
+                    }
+                    res = self.doHighlightRange(
+                        entry["text"],
+                        entry["highlightRanges"],
+                        offset,
+                        entry["lineNumber"],
+                        update_idx="to",
+                    )
+                    list(res)
             else:
                 # The 'type' isn't one of the known
                 raise ValueError(d)
@@ -126,65 +180,84 @@ class DiffToOperationMap:
     # strictly increasing, while the "from" segments should merely be
     # non-overlapping.

-    def doEqual(self, equal_text, offset):
-        equal_bytes = equal_text.encode()
+    def doEqual(self, equal_segment, offset, update_idx="all", type=str):
+        if type is str:
+            equal_bytes = equal_segment.encode()
+        elif type is bytes:
+            equal_bytes = equal_segment
+        else:
+            raise ValueError(equal_segment)
+
         tokens = self.tokenize(equal_bytes)
         n_tokens = len(tokens)
-        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
-        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        n_to_end_tokens = self.n_to_start_tokens + n_tokens
         # we need to keep track of the to and from last end bytes
         self.from_last_end_bytes = offset["from"] + len(equal_bytes)
         self.to_last_end_bytes = offset["to"] + len(equal_bytes)
         yield (
             Equal(
                 self.n_from_start_tokens,
-                self.n_from_end_tokens,
+                n_from_end_tokens,
                 self.n_to_start_tokens,
-                self.n_to_end_tokens,
+                n_to_end_tokens,
             ),
             tokens,
             tokens,
         )

-        self.n_from_start_tokens += n_tokens
-        self.n_to_start_tokens += n_tokens
-        self.from_byte_token_index_map[
-            offset['from']
-        ] = self.n_from_end_tokens
-        self.to_byte_token_index_map[offset['to']] = self.n_to_end_tokens
+        if update_idx in ["from", "all"]:
+            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens

-    def doInsert(self, insert_text, offset):
-        insert_bytes = insert_text.encode()
+        if update_idx in ["to", "all"]:
+            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
+
+        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
+        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
+
+    def doInsert(self, insert_segment, offset, update_idx="all", type=str):
+        if type is str:
+            insert_bytes = insert_segment.encode()
+        elif type is bytes:
+            insert_bytes = insert_segment
+        else:
+            raise ValueError(insert_segment)
         tokens = self.tokenize(insert_bytes)
         n_tokens = len(tokens)
-        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        n_to_end_tokens = self.n_to_start_tokens + n_tokens
         self.to_last_end_bytes = offset["to"] + len(insert_bytes)
         yield (
             Insert(
                 self.n_from_start_tokens,
                 self.n_from_start_tokens,
                 self.n_to_start_tokens,
-                self.n_to_end_tokens,
+                n_to_end_tokens,
             ),
             [],
             tokens,
         )
         # We have now used more of the "to" tokens.
-        self.n_to_start_tokens += n_tokens
+        if update_idx in ["to", "all"]:
+            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens

-        self.to_byte_token_index_map[offset['to']] = self.n_to_end_tokens
+        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens

-    def doDelete(self, delete_text, offset):
-        delete_bytes = delete_text.encode()
+    def doDelete(self, delete_segment, offset, update_idx="all", type=str):
+        if type is str:
+            delete_bytes = delete_segment.encode()
+        elif type is bytes:
+            delete_bytes = delete_segment
+        else:
+            raise ValueError(delete_segment)
         tokens = self.tokenize(delete_bytes)
         n_tokens = len(tokens)
-        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        n_from_end_tokens = self.n_from_start_tokens + n_tokens
         self.from_last_end_bytes = offset["from"] + len(delete_bytes)
         yield (
             Delete(
                 self.n_from_start_tokens,
-                self.n_from_end_tokens,
+                n_from_end_tokens,
                 self.n_to_start_tokens,
                 self.n_to_start_tokens,
             ),
@@ -192,19 +265,30 @@ class DiffToOperationMap:
             [],
         )
         # We have now used more of the "from" tokens.
-        self.n_from_start_tokens += n_tokens
-        self.from_byte_token_index_map[
-            offset['from']
-        ] = self.n_from_end_tokens
+        if update_idx in ["from", "all"]:
+            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
+
+        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens

-    def doHighlightRange(self, highlight_bytes, highlightRanges, offset):
+    def doHighlightRange(
+        self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"
+    ):
         # The text field is an overlapping mix of both the from and to,
         # so we need to handle it highlight-by-highlight.
         # there can be gaps between highlight segments.
         # for instance, if a word is deleted from the middle of a line.
         # we need to track that.
+        highlight_bytes = highlight_text.encode()
         highlight_end = 0

+        # it's possible for offset['to'] to be null.
+        # we can get it from the line number?
+
+        if offset["to"] is None:
+            offset["to"] = self.from_byte_token_index_map[
+                find_greatest_le_key(lineNumber, self.from_byte_token_index_map)
+            ]
+
         highlight_offset = offset
         # note that diffs are token-level, but the indexes are byte-level

@@ -214,7 +298,9 @@ class DiffToOperationMap:
             if highlight_start > highlight_end:
                 equal_bytes = highlight_bytes[highlight_end:highlight_start]
                 n_equal_bytes = len(equal_bytes)
-                yield from self.doEqual(equal_bytes, highlight_offset)
+                yield from self.doEqual(
+                    equal_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["from"] += n_equal_bytes
                 highlight_offset["to"] += n_equal_bytes

@@ -222,11 +308,16 @@ class DiffToOperationMap:
             highlight_end = highlight_start + highlightRange["length"]
             range_bytes = highlight_bytes[highlight_start:highlight_end]
             n_range_bytes = len(range_bytes)

             if highlightRange["type"] == 0:
-                yield from self.doInsert(range_bytes, highlight_offset)
+                yield from self.doInsert(
+                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["to"] += n_range_bytes
             elif highlightRange["type"] == 1:
-                yield from self.doDelete(range_bytes, highlight_offset)
+                yield from self.doDelete(
+                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["from"] += n_range_bytes
             else:
                 raise Exception(entry)
@@ -234,39 +325,48 @@ class DiffToOperationMap:
         # handle the rest of the line which is equal
         if highlight_end < len(highlight_bytes):
             range_bytes = highlight_bytes[highlight_end:]
-            yield from self.doEqual(range_bytes, highlight_offset)
+            yield from self.doEqual(range_bytes, highlight_offset, type=bytes)

-    def doParMove(self, to_diff):
+
+    def doParMove(self, from_diff, to_diff):
         # the tricky part here is to put the tokens in the right spots.
-        from_diff = self.par_move_dict[to_diff["moveInfo"]["linkId"]]

         from_byte_start = from_diff["offset"]["from"]
         # as of python 3.7 dictionaries are in insertion order. So
         # we can just find the first key that's greater

         # since the paragraph is removed in the "from" version, the index it is removed from
         # will be *after* the
-        if from_byte_start >= self.from_last_end_bytes: # if the from paragraph is at the end
-            from_token_start = next(reversed(self.from_byte_token_index_map.values()))
+        if len(self.from_byte_token_index_map) > 0:
+            if (
+                from_byte_start >= self.from_last_end_bytes
+            ):  # if the from paragraph is at the end
+                from_token_start = next(
+                    reversed(self.from_byte_token_index_map.values())
+                )
+            else:
+                key = find_greatest_le_key(
+                    from_byte_start, self.from_byte_token_index_map
+                )
+                from_token_start = self.from_byte_token_index_map[key]
         else:
-            key = find_greatest_le_key(from_byte_start, self.from_byte_token_index_map)
-            from_token_start = self.from_byte_token_index_map[key]
+            from_token_start = 0

-        # get the to token index
-        to_byte_start = to_diff["offset"]["to"]
-        if to_byte_start >= self.to_last_end_bytes:
-            to_token_start = next(reversed(self.to_byte_token_index_map.values()))
+        if len(self.to_byte_token_index_map) > 0:
+            # get the to token index
+            to_byte_start = to_diff["offset"]["to"]
+            if to_byte_start >= self.to_last_end_bytes:
+                to_token_start = next(reversed(self.to_byte_token_index_map.values()))
+            else:
+                key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
+                to_token_start = self.to_byte_token_index_map[key]
         else:
-            key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
-            to_token_start = self.to_byte_token_index_map[key]
+            to_token_start = 0

         # now we set the state and apply the highlights
         self.n_from_start_tokens = self.n_from_end_tokens = from_token_start
         self.n_to_start_tokens = self.n_to_end_tokens = to_token_start
         offset = {"from": from_byte_start, "to": to_byte_start}
         yield from self.doHighlightRange(
-            to_diff["text"], to_diff["highlightRanges"], offset
+            to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
         )

@@ -295,23 +395,32 @@ class WikiDiffMatcher:
         # The diff has already been computed, but we need to incrementally
         # retrieve it to recreate the behavior DiffState expects.
         diff = next(self.diffs)
-        diffToOperationsMapper = DiffToOperationMap(
-            diff, self.tokenizer
-        )
-        (
+        diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)
+        diffops = list(zip(*diffToOperationsMapper.to_operations()))
+
+        if not diffops:
+            self.last_tokens = []
+            return [], [], []
+
+        diffops = (
             operations,
             aseq,
             bseq,
-        ) = list(zip(*diffToOperationsMapper.to_operations()))
+        ) = diffops
+
         aseq = list(aseq)

-        # aseq can be out of order, we need to sort it by a1 index.
+        # aseq/bseq can be out of order, we need to sort it by a1/b1 index.
         indices = list(range(len(aseq)))
-        indices.sort(key = lambda i: operations[i].a1)
+        indices.sort(key=lambda i: operations[i].a1)
         aseq = [aseq[i] for i in indices]

-        # bseq cannot be out of order since diffs are resolved in the order of aseq.
+        bseq = list(bseq)
+        indices = list(range(len(bseq)))
+        indices.sort(key=lambda i: operations[i].b1)
+        bseq = [bseq[i] for i in indices]
+
         self.last_tokens = list(chain.from_iterable(aseq))
         tokens = list(chain.from_iterable(bseq))
         self.previous_text = text
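
The final hunk sorts bseq by each operation's b1 index in addition to sorting aseq by a1 before the token sequences are concatenated. A self-contained sketch of that reordering, under the assumption that operations expose a1/b1 start indices as deltas operations do (the function name is illustrative):

    def reorder_by_index(operations, aseq, bseq):
        # align aseq with the operations ordered by their "from" start (a1)
        order_a = sorted(range(len(operations)), key=lambda i: operations[i].a1)
        # align bseq with the operations ordered by their "to" start (b1)
        order_b = sorted(range(len(operations)), key=lambda i: operations[i].b1)
        return [aseq[i] for i in order_a], [bseq[i] for i in order_b]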