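"""Convert wikidiff2 inline JSON diffs into ``deltas`` operations.

WikiDiffMatcher wraps pre-computed wikidiff2 diffs (from the pywikidiff2
bindings or a local HTTP diff service) behind the ``deltas`` DiffEngine
processor interface so they can drive DiffState-style token persistence
tracking.
"""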
import json
import sys
from collections import namedtuple
from itertools import chain
from typing import Dict, Generator, List, Optional, Tuple

import requests
from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
                    RegexTokenizer, Token, tokenizers)
from sortedcontainers import SortedDict

TOKENIZER = tokenizers.wikitext_split

import pywikidiff2

differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
                                 moved_paragraph_detection_cutoff=200000)


def compute_diffs_server(texts, url="http://127.0.0.1:8000"):
    response = None
    try:
        response = requests.post(url, json=texts)
        response.raise_for_status()
        incremental_diffs = response.json()
    except requests.exceptions.ConnectionError as e:
        print(
            f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running."
        )
        print(e)
        raise e
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        if response is not None:
            print(f"Response Body: {response.text}")
        raise e
    except requests.exceptions.JSONDecodeError as e:
        # Must come before RequestException as JSONDecodeError is
        # a subclass.
        print(f"JSON Decode Error: {e}", file=sys.stderr)
        if response is not None:
            print(f"Response Body: {response.text}", file=sys.stderr)
        raise e
    except requests.exceptions.RequestException as e:
        print(f"An unexpected error occurred: {e}")
        raise e
    return incremental_diffs


def compute_diffs(texts: list[str]) -> list:
    return differ.inline_json_diff_sequence(texts)


class DiffToOperationMap:
    def __init__(self, diff, tokenizer):
        self.tokenizer = tokenizer
        self.diff = json.loads(diff)
        self.from_par_move_dict = {}
        self.to_par_move_dict = {}
        self.highlights_without_offset = []
        # Track the byte offset reached at each line number so we can recover
        # positions when wikidiff2 loses offsets.
        self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
        self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()

    def tokenize(self, bytes):
        return self.tokenizer.tokenize(bytes.decode("utf-8"))

    def to_operations(self):
        for entry in self.diff["diff"]:
            # add back the newline
            entry["text"] += "\n"
            text = entry["text"]
            offset = entry["offset"]
            # this is the first byte of the line in the 'from' revision.
            from_start_line = entry["offset"]["from"]
            # this is the first byte of the line in the 'to' revision.
            to_start_line = entry["offset"]["to"]

            # a line common to both revisions
            if entry["type"] == 0:
                yield from self.doEqual(entry)

            # a line included in the 'to' revision, but not in the 'from' revision
            elif entry["type"] == 1:
                yield from self.doInsert(entry)

            # a line included in the 'from' revision, but not in the 'to' revision
            elif entry["type"] == 2:
                yield from self.doDelete(entry)

            # a line with changes within it (highlight ranges)
            elif entry["type"] == 3:
                # sometimes, for some reason, we don't have a 'to' index here;
                # save these entries for later.
                if entry["offset"]["to"] is None:
                    self.highlights_without_offset.append(entry)
                else:
                    yield from self.doHighlightRange(entry)

            # the 'from' side of a moved paragraph
            elif entry["type"] == 4:
                linkId = entry["moveInfo"]["linkId"]

                if linkId in self.to_par_move_dict:
                    yield from self.doParMove(entry, self.to_par_move_dict.pop(linkId))
                else:
                    self.from_par_move_dict[entry["moveInfo"]["id"]] = entry

            # the 'to' side of a moved paragraph
            elif entry["type"] == 5:
                linkId = entry["moveInfo"]["linkId"]
                if linkId in self.from_par_move_dict:
                    yield from self.doParMove(
                        self.from_par_move_dict.pop(linkId), entry
                    )
                else:
                    self.to_par_move_dict[entry["moveInfo"]["id"]] = entry
            else:
                # The 'type' isn't one of the known wikidiff2 entry types.
                raise ValueError(entry)

        # now we should be able to apply the highlights we saved for later
        for entry in self.highlights_without_offset:
            yield from self.doHighlightRange(entry)

        if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
            print("PROBLEM! Unmatched parmoves!")
            print(self.from_par_move_dict)
            print(self.to_par_move_dict)
            # We can try to match them (iterate over copies so we can delete
            # matched entries as we go):
            for lkey in list(self.from_par_move_dict.keys()):
                for rkey in list(self.to_par_move_dict.keys()):
                    from_diff = self.from_par_move_dict[lkey]
                    to_diff = self.to_par_move_dict[rkey]
                    if self.match_parmoves_exact(from_diff, to_diff):
                        yield from self.doParMove(from_diff, to_diff)
                        del self.from_par_move_dict[lkey]
                        del self.to_par_move_dict[rkey]
                        break

        # if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
        #     print("Couldn't find exact matches for all parmoves!")
        #     # we couldn't find all the matches via exact match
        #     # let's try matching based on line number instead
        #     lkeys_to_remove = []
        #     for lkey, from_diff in self.from_par_move_dict.items():
        #         from_linenum = from_diff["moveInfo"]["linkId"].split("_")[2]
        #         rkey_to_remove = None
        #         for rkey, to_diff in self.to_par_move_dict.items():
        #             to_linenum = rkey.split("_")[2]
        #             if from_linenum == to_linenum:
        #                 print("Matching on line number")
        #                 yield from self.doParMove(from_diff, to_diff)
        #                 rkey_to_remove = rkey
        #                 lkeys_to_remove.append(lkey)
        #                 break
        #         if rkey_to_remove is not None:
        #             del self.to_par_move_dict[rkey_to_remove]
        #     for lkey in lkeys_to_remove:
        #         del self.from_par_move_dict[lkey]

        # if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
        #     print("Couldn't find exact matches for all parmoves!")
        #     # we couldn't find all the matches via exact match or line number
        #     # let's try matching based on opIndex instead
        #     lkeys_to_remove = []
        #     for lkey, from_diff in self.from_par_move_dict.items():
        #         rkey_to_remove = None
        #         from_idx = from_diff["moveInfo"]["linkId"].split("_")[1]
        #         for rkey, to_diff in self.to_par_move_dict.items():
        #             to_idx = rkey.split("_")[1]
        #             print(from_idx)
        #             print(to_idx)
        #             if from_idx == to_idx:
        #                 yield from self.doParMove(from_diff, to_diff)
        #                 rkey_to_remove = rkey
        #                 lkeys_to_remove.append(lkey)
        #         if rkey_to_remove is not None:
        #             del self.to_par_move_dict[rkey_to_remove]
        #     for lkey in lkeys_to_remove:
        #         del self.from_par_move_dict[lkey]

        # We couldn't find matches. Treat type 4 (from-side) entries as
        # removals and type 5 (to-side) entries as highlights.
        for from_diff in self.from_par_move_dict.values():
            yield from self.doDelete(from_diff)

        # Only we don't know the 'from' index; we assume it's already handled.
        for to_diff in self.to_par_move_dict.values():
            offset = {"from": 0, "to": None}
            diffops = self.doHighlightRange(
                {
                    "text": to_diff["text"],
                    "highlightRanges": to_diff["highlightRanges"],
                    "offset": offset,
                    "lineNumber": to_diff["lineNumber"],
                }
            )
            diffops = [
                (type(op)(None, None, op.b1, op.b2), [], bseq)
                for op, _, bseq in diffops
                if isinstance(op, Insert) or isinstance(op, Equal)
            ]
            yield from diffops

    def match_parmoves_exact(self, from_diff, to_diff):
        ops, from_tokens, to_tokens = list(zip(*self.doParMove(from_diff, to_diff)))
        from_text = "".join(chain.from_iterable(from_tokens))
        # They match if, after applying the highlight ranges, the reconstructed
        # 'from' text equals the text of the 'from'-side entry.
        if from_text == from_diff["text"]:
            print("MATCH FOUND")
            return True
        else:
            print("NO MATCH")
            print(len(from_text))
            print(len(from_diff["text"]))
            return False

    # mwpersistence expects differences to be represented in order from the
    # result's perspective ("to"), not the previous text. Thus, if a line
    # is moved earlier then its insertion should appear before its deletion.
    # As a rule of thumb, the "to" segments should be non-overlapping and
    # strictly increasing, while the "from" segments should merely be
    # non-overlapping.
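    # For example, if a paragraph moves toward the start of the page, the
    # operations describing its new (earlier) location are emitted before the
    # Delete covering its old (later) location. Processor.process below
    # enforces this ordering by re-sorting the collected operations by their
    # "to" start offset before assigning token ranges.
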
    def doEqual(self, entry):
        equal_segment, offset, lineNumber = (
            entry["text"],
            entry["offset"],
            entry["lineNumber"],
        )
        if isinstance(equal_segment, str):
            equal_bytes = equal_segment.encode()
        elif isinstance(equal_segment, bytes):
            equal_bytes = equal_segment
        else:
            raise ValueError(equal_segment)

        self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(equal_bytes)
        self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(equal_bytes)

        tokens = self.tokenize(equal_bytes)
        n_tokens = len(tokens)
        yield (
            Equal(
                offset["from"],
                None,
                offset["to"],
                None,
            ),
            tokens,
            tokens,
        )

    def doInsert(self, entry):
        insert_segment, offset, lineNumber = (
            entry["text"],
            entry["offset"],
            entry["lineNumber"],
        )
        if isinstance(insert_segment, str):
            insert_bytes = insert_segment.encode()
        elif isinstance(insert_segment, bytes):
            insert_bytes = insert_segment
        else:
            raise ValueError(insert_segment)
        tokens = self.tokenize(insert_bytes)
        self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(insert_bytes)
        yield (
            Insert(
                None,
                None,
                offset["to"],
                None,
            ),
            [],
            tokens,
        )

    def doDelete(self, entry):
        delete_segment, offset, lineNumber = (
            entry["text"],
            entry["offset"],
            entry.get("lineNumber", None),
        )
        if isinstance(delete_segment, str):
            delete_bytes = delete_segment.encode()
        elif isinstance(delete_segment, bytes):
            delete_bytes = delete_segment
        else:
            raise ValueError(delete_segment)
        tokens = self.tokenize(delete_bytes)
        if lineNumber is not None:
            self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes)

        yield (
            Delete(offset["from"], None, None, None),
            tokens,
            [],
        )

    def doHighlightRange(self, entry):
        highlight_text, highlightRanges, offset, lineNumber = (
            entry["text"],
            entry["highlightRanges"],
            entry["offset"],
            entry["lineNumber"],
        )

        # The text field is an overlapping mix of both the from and to
        # revisions, so we need to handle it highlight-by-highlight.
        # There can be gaps between highlight segments, for instance when a
        # word is deleted from the middle of a line; we need to track that.
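        # Hypothetical illustration (not taken from real diff output): for a
        # line whose text is "foo bar baz" with
        #     highlightRanges = [{"start": 4, "length": 4, "type": 0}]
        # the bytes at [4, 8) ("bar ") were inserted in the 'to' revision,
        # while "foo " and "baz" are equal context handled by the gap logic
        # below.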
        highlight_bytes = highlight_text.encode()
        highlight_end = 0

        # It's possible for offset['to'] to be null. We can recover it from
        # the line number. This bit is a little hacky, as it deals with
        # idiosyncratic wikidiff2 behavior.
        if offset["to"] is None:
            # if the line already exists, we insert before it.
            if lineNumber in self.to_linenumber_bytes_map:
                keyidx = self.to_linenumber_bytes_map.bisect_left(lineNumber) - 1
            else:
                keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
            key = None
            if keyidx == -1:
                offset["to"] = 0
            elif len(self.to_linenumber_bytes_map.keys()) > 0:
                key = self.to_linenumber_bytes_map.keys()[keyidx]
            else:
                key = 0
            if key is not None:
                offset["to"] = self.to_linenumber_bytes_map.get(key, 0)

        highlight_offset = offset
        # note that diffs are token-level, but the indexes are byte-level

        for highlightRange in highlightRanges:
            highlight_start = highlightRange["start"]
            # equal bytes in between highlights
            if highlight_start > highlight_end:
                equal_bytes = highlight_bytes[highlight_end:highlight_start]
                n_equal_bytes = len(equal_bytes)

                yield from self.doEqual(
                    {
                        "text": equal_bytes,
                        "offset": highlight_offset,
                        "lineNumber": lineNumber,
                    }
                )
                highlight_offset["from"] += n_equal_bytes
                highlight_offset["to"] += n_equal_bytes

            # handle highlighted insert / delete
            highlight_end = highlight_start + highlightRange["length"]
            range_bytes = highlight_bytes[highlight_start:highlight_end]
            n_range_bytes = len(range_bytes)

            if highlightRange["type"] == 0:
                yield from self.doInsert(
                    {
                        "text": range_bytes,
                        "offset": highlight_offset,
                        "lineNumber": lineNumber,
                    }
                )
                highlight_offset["to"] += n_range_bytes
            elif highlightRange["type"] == 1:
                yield from self.doDelete(
                    {
                        "text": range_bytes,
                        "offset": highlight_offset,
                        "lineNumber": lineNumber,
                    }
                )
                highlight_offset["from"] += n_range_bytes
            else:
                raise Exception(entry)

        # handle the rest of the line, which is equal
        if highlight_end < len(highlight_bytes):
            range_bytes = highlight_bytes[highlight_end:]
            yield from self.doEqual(
                {
                    "text": range_bytes,
                    "offset": highlight_offset,
                    "lineNumber": lineNumber,
                }
            )

    def doParMove(self, from_diff, to_diff):
        from_byte_start = from_diff["offset"]["from"]
        to_byte_start = to_diff["offset"]["to"]
        offset = {"from": from_byte_start, "to": to_byte_start}
        yield from self.doHighlightRange(
            {
                "text": to_diff["text"],
                "highlightRanges": to_diff["highlightRanges"],
                "offset": offset,
                "lineNumber": to_diff["lineNumber"],
            }
        )


class WikiDiffMatcher:
    def __init__(
        self,
        texts: Optional[list[str]] = None,
        tokenizer: Optional[RegexTokenizer] = None,
        url: Optional[str] = "http://127.0.0.1:8000",
        server: bool = False,
    ):
        # Pre-compute diffs to reduce traffic overhead.
        if server is True:
            self.diffs = list(compute_diffs_server(list(texts), url))
        else:
            self.diffs = list(compute_diffs(list(texts)))
        self.tokenizer = tokenizer or TOKENIZER

    class Processor(DiffEngine.Processor):
        def __init__(self, diffs, tokenizer=None):
            self.diffs = iter(diffs)
            self.tokenizer = tokenizer or TOKENIZER
            self.last_tokens = []
            self.previous_text = ""

        def update(self, last_tokens):
            self.last_tokens = last_tokens

        def process(self, text, token_class=None):
            # The diff has already been computed, but we need to incrementally
            # retrieve it to recreate the behavior DiffState expects.
            diff = next(self.diffs)
            diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)

            diffops = list(diffToOperationsMapper.to_operations())

            # this happens when revisions are actually equal.
            if len(diffops) == 0:
                self.last_tokens = self.tokenizer.tokenize(text)
                ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
                return ops, self.last_tokens, self.last_tokens

            # We get back byte indices; now we transform them to token indices.
            # First pass: sort by the "from" side and assign token ranges to a1/a2.
            diffops.sort(
                key=lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1)
            )
            aorder_ops = []
            token_offset = 0
            _, aseq, _ = list(zip(*diffops))

            for op, tokens, _ in diffops:
                a1 = token_offset
                if isinstance(op, Equal) or isinstance(op, Delete):
                    token_offset += len(tokens)
                    a2 = token_offset
                    aorder_ops.append(type(op)(a1, a2, op.b1, op.b1))
                else:
                    aorder_ops.append(Insert(a1, a1, op.b1, op.b1))

            # Second pass: sort by the "to" side and assign token ranges to b1/b2.
            _, aseq, bseq = zip(*diffops)
            diffops = list(zip(aorder_ops, aseq, bseq))
            diffops.sort(
                key=lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1)
            )
            _, _, bseq = list(zip(*diffops))
            border_ops = []
            token_offset = 0
            for op, _, tokens in diffops:
                b1 = token_offset
                if isinstance(op, Equal) or isinstance(op, Insert):
                    token_offset += len(tokens)
                    b2 = token_offset
                    border_ops.append(type(op)(op.a1, op.a2, b1, b2))
                else:
                    border_ops.append(type(op)(op.a1, op.a2, b1, b1))

            self.previous_text = text

            self.last_tokens = list(chain.from_iterable(aseq))
            tokens = list(chain.from_iterable(bseq))
            return border_ops, self.last_tokens, tokens

    def processor(self, *args, **kwargs):
        return self.Processor(self.diffs, self.tokenizer)

    def process(self):
        # DiffState checks for this method even though it is not called.
        raise Exception("Unnecessary implementation")
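

# Illustrative usage sketch (not part of the library surface). It assumes a
# local pywikidiff2 build is importable and that inline_json_diff_sequence
# yields one diff per revision, mirroring how a DiffState-style driver would
# call the processor once per revision.
if __name__ == "__main__":
    revisions = [
        "Hello world.\n",
        "Hello brave new world.\n",
    ]
    matcher = WikiDiffMatcher(revisions)
    processor = matcher.processor()
    for revision in revisions:
        operations, previous_tokens, current_tokens = processor.process(revision)
        print(operations)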