almost there. working out edge cases.
parent cf1fb61a84
commit 4654911533
@@ -12,6 +12,7 @@ dependencies = [
"mwtypes>=0.4.0",
"mwxml>=0.3.6",
"pyarrow>=20.0.0",
"sortedcontainers>=2.4.0",
"yamlconf>=0.2.6",
]

@@ -22,6 +23,7 @@ deltas = { git = "https://github.com/groceryheist/deltas" }

[dependency-groups]
dev = [
"ipython>=8.18.1",
"pandas>=2.1.0",
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",

@@ -2,7 +2,7 @@
import asyncio
import subprocess
from functools import partial

import re
import pytest
import pytest_asyncio
from typing import List

@@ -30,11 +30,10 @@ async def start_stop_server():
def assert_equal_enough(tokens:List[Token], rev):
# the tokens exclude newlines
# we allow extra whitespace at the beginning or end
token_doc = ''.join(str(t) for t in tokens).strip()
while '\n\n' in token_doc:
token_doc = token_doc.replace('\n\n','\n')
while '\n\n' in rev:
rev = rev.replace('\n\n','\n').strip()
token_doc = ''.join(str(t) for t in tokens)
token_doc = re.sub(r'\s+', ' ', token_doc).strip()
rev = re.sub(r'\s+', ' ', rev).strip()

print(token_doc, file = open('token','w'))
print(rev, file = open('rev','w'))
assert token_doc == rev

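For context, the updated helper above compares a token stream against raw revision text while ignoring whitespace differences. A minimal standalone sketch of that comparison, assuming deltas-style tokens that stringify to their text (the function name is illustrative, not from the commit):

import re

def whitespace_insensitive_equal(tokens, rev):
    # collapse every run of whitespace to a single space before comparing,
    # since the diff round-trip may drop or merge newlines
    token_doc = ''.join(str(t) for t in tokens)
    return re.sub(r'\s+', ' ', token_doc).strip() == re.sub(r'\s+', ' ', rev).strip()
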
@@ -63,7 +62,6 @@ def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_token
if first_unequal_token is None:
first_unequal_token = ops[-1].b2

assert n_equal_lines == expected_equal_lines
# check that there are no gaps and the number is as expected
assert initial_equal_tokens == last_b2 - ops[0].b1 == first_unequal_token - ops[0].b1 == expected_equal_tokens

@@ -76,9 +74,8 @@ def test_equality():
diff_processor = matcher.processor()
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev1 + " ")
assert len(ops) == 258
assert len(ops) == 257
for op in ops[:-2]:
print(op)
assert isinstance(op, Equal)

# note that the whitespace token does not result in a token according to wikitext_split

@@ -152,44 +149,48 @@ def test_delete():
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)

initial_equal_tokens = 0
first_nondelete_token = None
n_deletes = 0
n_deleted_tokens = 0
last_b2 = initial_equal_tokens
initial_equal_lines = 256
initial_equal_tokens = 9911
for i, op in enumerate(ops):
if initial_equal_lines > 0:
assert isinstance(op, Equal)
else:
break
initial_equal_lines -= 1

assert initial_equal_lines == 0
assert ops[i-1].a2 - ops[0].a1 == initial_equal_tokens

initial_equal_lines = 4
initial_equal_tokens = 14
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)
first_noninsert_token = initial_equal_tokens

last_non_delete = False
last_delete = False
last_insert = False
idx = 0
n_non_delete = 0

last_delete_idx = 0
for op in ops[initial_equal_lines:]:
idx += 1
# deletes are interleaved with Equal newlines.
if not isinstance(op, Delete):
if last_non_delete:
first_nondelete_token = op.a1
break
last_non_delete = True
else:
last_non_delete = False
if last_non_delete:
if isinstance(op, Delete):
n_deletes += 1
n_deleted_tokens += op.a2 - last_b2
last_b2 = op.a2
n_deleted_tokens += op.a2 - op.a1
last_delete = True
last_delete_idx = idx
# we need to add back a newline when we have a delete
else:
n_non_delete += 1
if not last_delete and first_nondelete_token is None:
first_nondelete_token = op.a1

assert n_deletes == 2
assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
if n_non_delete:
last_b2 = op.b2

last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
expected_equal_lines=252,
expected_equal_tokens=9765)
assert n_deletes == 4
assert n_deleted_tokens == 320
assert idx == len(ops)

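To illustrate the pattern the loop above walks, a sketch with made-up operations and indices (not data from the test): a deleted paragraph arrives as Delete operations interleaved with the one-token Equal newlines the matcher emits per line.

from deltas import Delete, Equal

sample_ops = [Equal(10, 12, 10, 12),
              Delete(12, 40, 12, 12), Equal(40, 41, 12, 13),
              Delete(41, 45, 13, 13), Equal(45, 46, 13, 14)]
n_deletes = sum(isinstance(op, Delete) for op in sample_ops)
n_deleted_tokens = sum(op.a2 - op.a1 for op in sample_ops if isinstance(op, Delete))
# here n_deletes == 2 and n_deleted_tokens == 32
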
# first let's test that we properly build the operations.
@@ -204,14 +205,8 @@ def test_addition():
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)

even = True
for op in ops:
if even:
assert isinstance(op, Insert)
even = False
else:
assert isinstance(op, Equal)
even = True
assert isinstance(op, Insert)

assert_equal_enough(b, rev1)

@@ -221,8 +216,8 @@ def test_addition():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
ops = list(ops)
initial_equal_lines = 256
initial_equal_tokens = 9487
initial_equal_lines = 255
initial_equal_tokens = 9614
last_b2 = assert_correct_equal_section(ops,
expected_equal_lines=initial_equal_lines,
expected_equal_tokens=initial_equal_tokens)

@@ -232,16 +227,21 @@ def test_addition():
n_inserted_tokens = 0
last_b2 = last_insert_b2 = initial_equal_tokens
idx = 0
print(ops[initial_equal_lines:])

last_insert = False
for op in ops[initial_equal_lines:]:
if isinstance(op, Insert):
n_inserts += 1
n_inserted_tokens += op.b2 - op.b1
last_insert_b2 = op.b2
last_insert = True
elif last_insert:
assert isinstance(op, Equal)

last_b2 = op.b2

assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
assert n_inserts == 2
assert n_inserted_tokens == last_insert_b2 - initial_equal_tokens == 296
assert n_inserts == 4

def test_paragraph_move():
rev1 = open("test/test_diff_revisions/1295229484").read()

@@ -269,6 +269,63 @@ def test_paragraph_move_and_change():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)

def test_infobox():
rev1 = open("test/test_diff_revisions/test_infobox_from").read()
rev2 = open("test/test_diff_revisions/test_infobox_to").read()
matcher = WikiDiffMatcher([rev1,rev2])
diff_processor = matcher.processor()

# note that a and b are constructed from the diffs.
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)

def test_leading_whitespace():
rev1 = open("test/test_diff_revisions/test_leading_ws_from").read()
rev2 = open("test/test_diff_revisions/test_leading_ws_to").read()
matcher = WikiDiffMatcher([rev1,rev2])
diff_processor = matcher.processor()

# note that a and b are constructed from the diffs.
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)

# def test_whitespace_2():
# rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
# rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
# matcher = WikiDiffMatcher([rev1,rev2])
# diff_processor = matcher.processor()

# # note that a and b are constructed from the diffs.
# # so they reflect the state of the text according to the diff processor
# ops, a, b = diff_processor.process(rev1)
# ops, a, b = diff_processor.process(rev2)
# assert_equal_enough(b, rev2)
# assert_equal_enough(a, rev1)

def test_actually_equal():
rev1 = open("test/test_diff_revisions/1285792388").read()
# whitespace is added because exact identity reverts do not result in diffs.
matcher = WikiDiffMatcher([rev1,rev1])
diff_processor = matcher.processor()
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev1)
assert len(ops) == 1
assert isinstance(ops[0], Equal)

# note that the whitespace token does not result in a token according to wikitext_split
# compare the tokens based on the diffs to the baseline
# whitespace differences are allowed
assert_equal_enough(b, rev1)
assert_equal_enough(a, rev1)

# slow test
def test_diff_consistency():
from mwxml import Dump

@@ -3,25 +3,37 @@ import sys
from collections import namedtuple
from itertools import chain
from typing import Dict, Generator, List, Optional, Tuple

from sortedcontainers import SortedDict
import requests
from deltas import (Delete, DiffEngine, Equal, Insert, Operation, Token,
RegexTokenizer, tokenizers)
from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
RegexTokenizer, Token, tokenizers)

TOKENIZER = tokenizers.wikitext_split

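As a point of reference (not part of the commit), wikitext_split is a deltas RegexTokenizer whose tokens stringify back to the original text, which is what the whitespace-insensitive test helper relies on. A rough usage sketch:

sample = TOKENIZER.tokenize("Hello [[world]]!")
# the tokens cover the whole input, so concatenating them is lossless
assert ''.join(str(t) for t in sample) == "Hello [[world]]!"
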
# def find_greatest_le_key(target_key, data_dict):
# found_key = None
# for key in data_dict: # Iterates over keys in insertion order (which is sorted)
# if key <= target_key:
# found_key = (
# key # This is the largest key found so far that satisfies the condition
# )
# else:
# # Since the dictionary is sorted, if key > target_key,
# # all subsequent keys will also be > target_key.
# return found_key or key

# def find_smallest_gt_key(target_key, data_dict):
# found_key = None
# for key in reversed(data_dict): # Iterates over keys in insertion order (which is sorted)
# if key >= target_key:
# found_key = (
# key # This is the largest key found so far that satisfies the condition
# )
# else:
# # Since the dictionary is sorted, if key > target_key,
# # all subsequent keys will also be > target_key.
# return found_key or key

def find_greatest_le_key(target_key, data_dict):
found_key = None
for key in data_dict: # Iterates over keys in insertion order (which is sorted)
if key <= target_key:
found_key = (
key # This is the largest key found so far that satisfies the condition
)
else:
# Since the dictionary is sorted, if key > target_key,
# all subsequent keys will also be > target_key.
return found_key or key

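A usage sketch for the helper above, with made-up values: the byte-to-token maps are filled in increasing byte order, so a plain scan finds the greatest key not exceeding the target. Note that when the target is at or beyond the largest key the loop falls through and returns None, which callers such as doParMove avoid by checking from_last_end_bytes first.

byte_to_token = {0: 0, 120: 35, 300: 90}   # byte offset -> token index, illustrative only
assert find_greatest_le_key(150, byte_to_token) == 120
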
def compute_diffs(url: str, texts: list[str]) -> list:
@@ -61,55 +73,94 @@ class DiffToOperationMap:
self.diff = json.loads(diff)

# the code below is designed to work in bytes because that's how wikidiff2 indexes
self.from_last_end_bytes = 0
self.from_last_to_bytes = 0
self.n_from_start_tokens = 0
self.n_from_end_tokens = 0
self.n_from_start_tokens = 0
self.n_to_start_tokens = 0
self.last_to_start_line = 0
self.last_from_start_line = 0
self.from_last_end_bytes = 0
self.to_last_end_bytes = 0
# self.from_last_end_bytes = 0
# self.from_last_to_bytes = 0
# self.n_from_start_tokens = 0
# self.n_from_end_tokens = 0
# self.n_from_start_tokens = 0
# self.n_to_start_tokens = 0
# self.from_last_end_bytes = 0
# self.to_last_end_bytes = 0
# keeps track of the number of tokens seen so far
# to avoid repeated tokenization
self.from_byte_token_index_map: Dict[int, int] = {}
self.to_byte_token_index_map: Dict[int, int] = {}
# self.from_byte_token_index_map: SortedDict[int, int] = SortedDict()
# self.to_byte_token_index_map: SortedDict[int, int] = SortedDict()
self.par_move_dict = {}

# we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
self.to_linenumber_bytes_map = {}
self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
# def get_token_offset(self, byte_offset):
# from_token_start = None
# to_token_start = None
# from_last_end_bytes = self.from_byte_token_index_map.keys()[-1]
# to_last_end_bytes = self.to_byte_token_index_map.keys()[-1]
# if byte_offset['from'] is not None:
# if byte_offset['from'] < self.from_byte_token_index_map.values()[0]:
# from_token_start = 0
# else:
# key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from'])
# # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below?
# if key > from_last_end_bytes:
# from_token_start = self.from_byte_token_index_map[from_last_end_bytes]
# else:
# from_token_
# if byte_offset['to'] is not None:
# if byte_offset['to'] < self.to_byte_token_index_map.values()[0]:
# to_token_start = 0
# else:
# key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to'])
# if key >= from
# if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0:
# if (
# byte_offset['from'] >= self.from_last_end_bytes
# ): # if the from paragraph is at the end
# from_token_start = next(
# reversed(self.from_byte_token_index_map.values())
# )
# else:
# key = find_greatest_le_key(
# byte_offset['from'], self.from_byte_token_index_map
# )
# from_token_start = self.from_byte_token_index_map[key]
# else:
# from_token_start = 0

# to_offset = None
# if byte_offset['to'] is not None:
# if len(self.to_byte_token_index_map) > 0:
# if to_byte_start >= self.to_last_end_bytes:
# to_token_start = next(reversed(self.to_byte_token_index_map.values()))
# else:
# key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map)
# to_token_start = self.to_byte_token_index_map[key]
# else:
# to_token_start = 0

# return {'from': from_token_start,
# 'to': to_token_start}

def tokenize(self, bytes):
return self.tokenizer.tokenize(bytes.decode("utf-8"))

def newline_result(self):
self.n_from_end_tokens += 1
self.n_from_start_tokens += 1
self.n_to_end_tokens += 1
self.n_to_start_tokens +=1

return (Equal(self.n_from_start_tokens - 1,
self.n_from_end_tokens,
self.n_to_start_tokens - 1,
self.n_from_start_tokens),
[Token('\n')],
[Token('\n')])

def to_operations(self):
parmoves = []
[print(diff) for diff in self.diff["diff"][0:5]]

for entry in self.diff["diff"]:

entry["text"] += "\n"
text = entry["text"]
offset = entry["offset"]
if offset["from"] and entry.get("lineNumber") is not None :
if entry['type'] in [0, 2, 3, 4]:
self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode())

if offset["to"]:
self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"]
if entry['type'] in [0, 1, 3, 5]:
self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode())

# add back the newline

text = entry["text"]
# ignore empty diffs. They don't have any tokens
if len(text) == 0:
continue
# this is the first byte of the line in the 'from' revision.
from_start_line = entry["offset"]["from"]
# this is the first byte of the line in the 'to' revision.
@@ -117,23 +168,19 @@ class DiffToOperationMap:

if entry["type"] == 0:
yield from self.doEqual(text, offset)
yield self.newline_result()

# a line included in the 'to' revision, but not in the 'from' revision
elif entry["type"] == 1:
yield from self.doInsert(text, offset)
yield self.newline_result()

# a line included in the 'from' revision, but not in the 'to' revision
elif entry["type"] == 2:
yield from self.doDelete(text, offset)
yield self.newline_result()

elif entry["type"] == 3:
yield from self.doHighlightRange(
text, entry["highlightRanges"], offset, entry["lineNumber"]
)
yield self.newline_result()

elif entry["type"] == 4:
self.par_move_dict[entry["moveInfo"]["id"]] = entry

@@ -141,34 +188,37 @@ class DiffToOperationMap:
linkId = entry["moveInfo"]["linkId"]
if linkId in self.par_move_dict:
yield from self.doParMove(entry, self.par_move_dict[linkId])
yield self.newline_result()
else:
# we need to count the tokens in the from revision so token index is correct
self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
self.n_from_start_tokens += len(
self.tokenize(entry["text"].encode())
)

# we need to count the tokens in the from revision so token index is correct
# self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
# self.n_from_start_tokens += len(
# self.tokenize(entry["text"].encode())
# )

elif entry["type"] == 5:
linkId = entry["moveInfo"]["linkId"]
if linkId in self.par_move_dict:
yield from self.doParMove(self.par_move_dict[linkId], entry)
yield self.newline_result()
else:
self.par_move_dict[entry["moveInfo"]["id"]] = entry
# call doHighlightRange just to update the token indices
offset = {
"from": self.n_from_end_tokens,
"to": entry["offset"]["to"],
}
res = self.doHighlightRange(
entry["text"],
entry["highlightRanges"],
offset,
entry["lineNumber"],
update_idx="to",
)
list(res)
# offset = {
# "from": self.n_from_end_tokens,
# "to": entry["offset"]["to"],
# }
# res = self.doHighlightRange(
# entry["text"],
# entry["highlightRanges"],
# offset,
# entry["lineNumber"],
# update_idx="to",
# )
# list(res)
# self.n_to_end_tokens += len(self.tokenize(entry["text"].encode()))
# self.n_to_start_tokens += len(
# self.tokenize(entry["text"].encode())
# )

else:
# The 'type' isn't one of the known types
raise ValueError(d)
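For orientation, the entries dispatched above follow wikidiff2's inline JSON format. A rough, hand-written example of the shapes involved (field values are invented and some optional fields are omitted):

example_entries = [
    {"type": 0, "lineNumber": 1, "text": "An unchanged line",
     "offset": {"from": 0, "to": 0}},                              # context line
    {"type": 1, "lineNumber": 2, "text": "A brand new line",
     "offset": {"from": None, "to": 18}},                          # insert
    {"type": 3, "lineNumber": 3, "text": "A changed line",
     "offset": {"from": 18, "to": 36},
     "highlightRanges": [{"start": 2, "length": 7, "type": 0}]},   # in-line change
    {"type": 4, "text": "A moved paragraph",
     "offset": {"from": 60, "to": None},
     "moveInfo": {"id": "movedpara_1_1", "linkId": "movedpara_1_2"}},  # move source
]
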
@@ -180,99 +230,100 @@ class DiffToOperationMap:
# strictly increasing, while the "from" segments should merely be
# non-overlapping.

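A brief refresher (not from the commit) on the operation objects produced below: deltas operations carry half-open token ranges, (a1, a2) into the "from" token sequence and (b1, b2) into the "to" sequence; the forked deltas used in this repo additionally appears to accept trailing byte-offset arguments, which this sketch omits.

from deltas import Delete, Equal, Insert

# replace the fourth token of the old text with two new tokens
ops = [Equal(0, 3, 0, 3),      # tokens 0..2 are shared
       Delete(3, 4, 3, 3),     # old token 3 is removed
       Insert(4, 4, 3, 5)]     # two new tokens appear at position 3 of the new text
assert ops[1].a2 - ops[1].a1 == 1 and ops[2].b2 - ops[2].b1 == 2
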
def doEqual(self, equal_segment, offset, update_idx="all", type=str):
if type is str:
def doEqual(self, equal_segment, offset, update_idx="all"):
# if from_token_start is None:
# from_token_start = self.n_from_start_tokens
# if to_token_start is None:
# to_token_start = self.n_to_start_tokens

if isinstance(equal_segment, str):
equal_bytes = equal_segment.encode()
elif type is bytes:
elif isinstance(equal_segment, bytes):
equal_bytes = equal_segment
else:
raise ValueError(equal_segment)

tokens = self.tokenize(equal_bytes)
n_tokens = len(tokens)
n_from_end_tokens = self.n_from_start_tokens + n_tokens
n_to_end_tokens = self.n_to_start_tokens + n_tokens
# we need to keep track of the to and from last end bytes
self.from_last_end_bytes = offset["from"] + len(equal_bytes)
self.to_last_end_bytes = offset["to"] + len(equal_bytes)

# token_offset = self.get_token_offset(offset)

# n_from_end_tokens = token_offset['from'] + n_tokens
# n_to_end_tokens = token_offset['to'] + n_tokens

yield (
Equal(
self.n_from_start_tokens,
n_from_end_tokens,
self.n_to_start_tokens,
n_to_end_tokens,
offset['from'],
None,
offset['to'],
None,
),
tokens,
tokens,
)

if update_idx in ["from", "all"]:
self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
# if update_idx in ["from", "all"]:
# self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens

if update_idx in ["to", "all"]:
self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
# if update_idx in ["to", "all"]:
# self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens

self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
# self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
# self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens

def doInsert(self, insert_segment, offset, update_idx="all", type=str):
if type is str:
def doInsert(self, insert_segment, offset, update_idx="all"):
if isinstance(insert_segment, str):
insert_bytes = insert_segment.encode()
elif type is bytes:
elif isinstance(insert_segment, bytes):
insert_bytes = insert_segment
else:
raise ValueError(insert_segment)
tokens = self.tokenize(insert_bytes)
n_tokens = len(tokens)
n_to_end_tokens = self.n_to_start_tokens + n_tokens
self.to_last_end_bytes = offset["to"] + len(insert_bytes)
# n_tokens = len(tokens)
# token_offset = self.get_token_offset(offset)
# n_to_end_tokens = token_offset['to'] + n_tokens
yield (
Insert(
self.n_from_start_tokens,
self.n_from_start_tokens,
self.n_to_start_tokens,
n_to_end_tokens,
None,
None,
offset['to'],
None,
),
[],
tokens,
)
# We have now used more of the "to" tokens.
if update_idx in ["to", "all"]:
self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens

self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
#self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens

def doDelete(self, delete_segment, offset, update_idx="all", type=str):
if type is str:
if isinstance(delete_segment, str):
delete_bytes = delete_segment.encode()
elif type is bytes:
elif isinstance(delete_segment, bytes):
delete_bytes = delete_segment
else:
raise ValueError(delete_segment)
tokens = self.tokenize(delete_bytes)
n_tokens = len(tokens)
# n_tokens = len(tokens)

# token_offset = self.get_token_offset(offset)
# n_from_end_tokens = token_offset['from'] + n_tokens

n_from_end_tokens = self.n_from_start_tokens + n_tokens
self.from_last_end_bytes = offset["from"] + len(delete_bytes)
yield (
Delete(
self.n_from_start_tokens,
n_from_end_tokens,
self.n_to_start_tokens,
self.n_to_start_tokens,
offset['from'],
None,
None,
None
),
tokens,
[],
)
# We have now used more of the "from" tokens.
if update_idx in ["from", "all"]:
self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens

self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
#self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens

def doHighlightRange(
self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"
):
self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"):

# The text field is an overlapping mix of both the from and to,
# so we need to handle it highlight-by-highlight.
# there can be gaps between highlight segments.

@@ -283,11 +334,15 @@ class DiffToOperationMap:

# it's possible for offset['to'] to be null.
# we can get it from the line number?

update_linenumber_map = True
if offset["to"] is None:
offset["to"] = self.from_byte_token_index_map[
find_greatest_le_key(lineNumber, self.from_byte_token_index_map)
]
keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
if keyidx > 0:
print(self.to_linenumber_bytes_map)
key = self.to_linenumber_bytes_map.keys()[keyidx]
offset["to"] = self.to_linenumber_bytes_map[key]
else:
offset["to"] = 0

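A small sketch of the recovery step above, using sortedcontainers directly (line numbers and byte offsets are invented): bisect_right locates the nearest recorded line at or before the current one and reuses its end-of-line byte offset.

from sortedcontainers import SortedDict

to_linenumber_bytes_map = SortedDict({3: 120, 7: 480, 12: 910})
lineNumber = 10
keyidx = to_linenumber_bytes_map.bisect_right(lineNumber) - 1
recovered = to_linenumber_bytes_map[to_linenumber_bytes_map.keys()[keyidx]] if keyidx >= 0 else 0
assert recovered == 480
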
highlight_offset = offset
# note that diffs are token-level, but the indexes are byte-level

@@ -299,10 +354,12 @@ class DiffToOperationMap:
equal_bytes = highlight_bytes[highlight_end:highlight_start]
n_equal_bytes = len(equal_bytes)
yield from self.doEqual(
equal_bytes, highlight_offset, update_idx=update_idx, type=bytes
equal_bytes, highlight_offset, update_idx=update_idx
)
highlight_offset["from"] += n_equal_bytes
highlight_offset["to"] += n_equal_bytes
if update_linenumber_map:
self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']

# handle highlighted insert / delete
highlight_end = highlight_start + highlightRange["length"]

@@ -311,12 +368,14 @@ class DiffToOperationMap:

if highlightRange["type"] == 0:
yield from self.doInsert(
range_bytes, highlight_offset, update_idx=update_idx, type=bytes
range_bytes, highlight_offset, update_idx=update_idx
)
highlight_offset["to"] += n_range_bytes
if update_linenumber_map:
self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
elif highlightRange["type"] == 1:
yield from self.doDelete(
range_bytes, highlight_offset, update_idx=update_idx, type=bytes
range_bytes, highlight_offset, update_idx=update_idx
)
highlight_offset["from"] += n_range_bytes
else:

@@ -325,46 +384,14 @@ class DiffToOperationMap:
# handle the rest of the line which is equal
if highlight_end < len(highlight_bytes):
range_bytes = highlight_bytes[highlight_end:]
yield from self.doEqual(range_bytes, highlight_offset, type=bytes)
yield from self.doEqual(range_bytes, highlight_offset)

def doParMove(self, from_diff, to_diff):
# the tricky part here is to put the tokens in the right spots.
from_byte_start = from_diff["offset"]["from"]
# as of python 3.7 dictionaries are in insertion order. So
# we can just find the first key that's greater

# since the paragraph is removed in the "from" version, the index it is removed from
# will be *after* the
if len(self.from_byte_token_index_map) > 0:
if (
from_byte_start >= self.from_last_end_bytes
): # if the from paragraph is at the end
from_token_start = next(
reversed(self.from_byte_token_index_map.values())
)
else:
key = find_greatest_le_key(
from_byte_start, self.from_byte_token_index_map
)
from_token_start = self.from_byte_token_index_map[key]
else:
from_token_start = 0

if len(self.to_byte_token_index_map) > 0:
# get the to token index
to_byte_start = to_diff["offset"]["to"]
if to_byte_start >= self.to_last_end_bytes:
to_token_start = next(reversed(self.to_byte_token_index_map.values()))
else:
key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
to_token_start = self.to_byte_token_index_map[key]
else:
to_token_start = 0

# now we set the state and apply the highlights
self.n_from_start_tokens = self.n_from_end_tokens = from_token_start
self.n_to_start_tokens = self.n_to_end_tokens = to_token_start
to_byte_start = to_diff["offset"]["to"]
offset = {"from": from_byte_start, "to": to_byte_start}
# we need to cache the indexes; replace them; then restore
yield from self.doHighlightRange(
to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
)

@@ -397,35 +424,63 @@ class WikiDiffMatcher:
diff = next(self.diffs)
diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)

diffops = list(zip(*diffToOperationsMapper.to_operations()))
diffops = list(diffToOperationsMapper.to_operations())

if not diffops:
self.last_tokens = []
return [], [], []
# this happens when revisions are actually equal.
if len(diffops) == 0:
self.last_tokens = self.tokenizer.tokenize(text)
ops = [Equal(0, len(self.last_tokens),
0, len(self.last_tokens))]
return ops, self.last_tokens, self.last_tokens

diffops = (
operations,
aseq,
bseq,
) = diffops
# we get back the byte indices; now we transform to token indices

aseq = list(aseq)
diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1))
aorder_ops = []
token_offset = 0
_, aseq, _ = list(zip( * diffops))

# aseq/bseq can be out of order, we need to sort it by a1/b1 index.
indices = list(range(len(aseq)))
indices.sort(key=lambda i: operations[i].a1)
aseq = [aseq[i] for i in indices]
for op, tokens, _ in diffops:
a1 = token_offset
if isinstance(op, Equal) or isinstance(op, Delete):
token_offset += len(tokens)
a2 = token_offset
aorder_ops.append(type(op)(a1,
a2,
op.b1,
op.b1))
else:
aorder_ops.append(Insert(a1,
a1,
op.b1,
op.b1))

bseq = list(bseq)
indices = list(range(len(bseq)))
indices.sort(key=lambda i: operations[i].b1)
bseq = [bseq[i] for i in indices]
_, aseq, bseq = zip(* diffops)
diffops = list(zip(aorder_ops, aseq, bseq))
diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1))
_, _, bseq = list(zip(* diffops))
border_ops = []
token_offset = 0
for op, _, tokens in diffops:
b1 = token_offset
if isinstance(op, Equal) or isinstance(op, Insert):
token_offset += len(tokens)
b2 = token_offset
border_ops.append(type(op)(op.a1,
op.a2,
b1,
b2))
else:
border_ops.append(type(op)(op.a1,
op.a2,
b1,
b1))

self.previous_text = text

self.last_tokens = list(chain.from_iterable(aseq))
tokens = list(chain.from_iterable(bseq))
self.previous_text = text

return operations, self.last_tokens, tokens
return border_ops, self.last_tokens, tokens

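To make the re-indexing above concrete, a toy sketch (operations and token lists are invented): after sorting by b1, the b-side indices are re-based so they advance only by the number of "to" tokens each operation actually contributes.

from deltas import Equal, Insert

ops_with_tokens = [(Equal(0, 5, 2, 7), ["w"] * 5, ["w"] * 5),
                   (Insert(5, 5, 0, 2), [], ["new", " "])]
token_offset = 0
rebased = []
for op, _, to_tokens in sorted(ops_with_tokens, key=lambda t: t[0].b1):
    b1 = token_offset
    token_offset += len(to_tokens)          # Delete ops would contribute zero "to" tokens
    rebased.append(type(op)(op.a1, op.a2, b1, token_offset))
# rebased now holds Insert(5, 5, 0, 2) followed by Equal(0, 5, 2, 7)
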
def processor(self, *args, **kwargs):
return self.Processor(self.diffs, self.tokenizer)