# mediawiki_dump_tools/wiki_diff_matcher.py

import json
import sys

import requests
from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete

TOKENIZER = tokenizers.wikitext_split


def compute_diffs(url: str, texts: list[str]) -> list:
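    """POST the list of revision texts to a wikidiff2 server and return its diffs.

    Illustrative sketch (the URL is hypothetical; any endpoint that speaks the
    expected wikidiff2 JSON protocol should work):

        diffs = compute_diffs("http://localhost:8000", ["first revision", "second revision"])

    Each element of the returned list is expected to be a JSON string
    consumable by to_operations(), one diff per input text.
    """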
    response = None
    try:
        response = requests.post(url, json=texts)
        response.raise_for_status()
        incremental_diffs = response.json()
    except requests.exceptions.ConnectionError as e:
        print(f"Connection Error: Could not connect to the server at {url}. "
              "Make sure your local server is running.")
        print(e)
        raise e
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        if response is not None:
            print(f"Response Body: {response.text}")
        raise e
    except requests.exceptions.JSONDecodeError as e:
        # Must come before RequestException because JSONDecodeError is a subclass.
        print(f"JSON Decode Error: {e}", file=sys.stderr)
        if response is not None:
            print(f"Response Body: {response.text}", file=sys.stderr)
        raise e
    except requests.exceptions.RequestException as e:
        print(f"An unexpected error occurred: {e}")
        raise e
    return incremental_diffs


def to_operations(from_text: str, to_text: str, diff: str, tokenizer: RegexTokenizer) -> list:
    d = json.loads(diff)
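
    # Expected shape of the parsed wikidiff2 output, based only on the fields
    # this function reads (wikidiff2 may emit additional fields):
    #
    #     {"diff": [{"type": 0,                       # 0=context, 1=insert, 2=delete,
    #                                                 # 3=change, 4/5=paragraph move (from/to)
    #                "text": "line content",
    #                "offset": {"from": 0, "to": 0},  # byte offsets of the line
    #                "highlightRanges": [             # present for type 3 entries
    #                    {"start": 0, "length": 4, "type": 0}],
    #                "moveInfo": {"id": "...", "linkId": "..."}}]}  # types 4 and 5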

    # The code below is designed to work in bytes because that's how wikidiff2 indexes.
    from_text = from_text.encode('utf-8')
    to_text = to_text.encode('utf-8')

    # Convenient helper for tokenizing bytes.
    def tokenize(bytes):
        return tokenizer.tokenize(bytes.decode('utf-8'))

    # Keep track of the last difference we saw in order to notice unaccounted-for
    # tokens. Each token at the end of "to" which is skipped for the next diff
    # must be represented as an Equal() segment.
    from_last_end_bytes = 0
    to_last_end_bytes = 0
    result = []

    # DiffState expects differences to be represented in order from the
    # result's perspective ("to"), not the previous text's. Thus, if a line
    # is moved earlier, its insertion should appear before its deletion.
    # As a rule of thumb, the "to" segments should be non-overlapping and
    # strictly increasing, while the "from" segments should merely be
    # non-overlapping.
    # wikidiff2 appears to follow this same convention, but the behavior
    # is not documented.
    # Note that, confusingly, for Insert operations only the "to" indexes
    # matter, and for Delete and Equal operations only the "from" indexes
    # matter. This is clear from reading state.py in `mwpersistence` and
    # operations.py in `deltas`.
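    # Concrete illustration (hypothetical indexes): a 5-token paragraph moved
    # from the end of a 100-token page to the very top would be reported as
    # Insert(0, 0, 0, 5) near the start of the operation list and
    # Delete(95, 100, ...) later on, so the "to" ranges stay strictly
    # increasing while the "from" ranges are merely non-overlapping.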

    parmove_from_dict = {}  # look up move diffs based on moveInfo id
    parmove_to_dict = {}
    for entry in d['diff']:
        linebytes = entry['text'].encode('utf-8')
        from_start_line = entry['offset']['from']  # first byte of the line in the 'from' revision
        to_start_line = entry['offset']['to']  # first byte of the line in the 'to' revision
        from_start_tokens = len(tokenize(from_text[:from_start_line]))
        to_start_tokens = len(tokenize(to_text[:to_start_line]))
        # These constant calls to tokenizer.tokenize can definitely be optimized,
        # as tokenization is currently a bottleneck. Ideally tokenization would
        # happen incrementally where possible, or somehow be cached, but this
        # would be more complex. N: I think it's okay. CPU is cheap.
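        # One possible caching sketch (not implemented here): tokenize each full
        # revision once up front and record the byte offset at which each token
        # starts; from_start_tokens/to_start_tokens then become binary searches
        # over those offsets instead of repeated prefix tokenizations.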
        if entry['type'] == 0:  # wikidiff2 can emit this if it is called with numContextLines != 0
            line_tokens = len(tokenize(linebytes))
            from_end_tokens = from_start_tokens + line_tokens
            to_end_tokens = to_start_tokens + line_tokens
            result.append(Equal(from_start_tokens, from_end_tokens,
                                to_start_tokens, to_end_tokens))
            # We need to keep track of the 'to' and 'from' last end bytes.
            from_last_end_bytes += len(linebytes)
            to_last_end_bytes += len(linebytes)
            continue
        else:
            # These Equal segments do not appear to be generated by wikidiff2,
            # and so must be inferred.
            equal_tokens = to_start_tokens - to_last_end_bytes
            # If we notice that the next non-zero segment (which must be a
            # change, given that its type is non-zero) begins after the end
            # of the previous segment, we must add an Equal segment.
            # TODO: While the "to" token ranges are correct, the "from"
            # ranges are likely not, particularly in histories with paragraph
            # moves. They can be corrected.
            if equal_tokens > 0:
                # Only the 'from' indexes matter.
                result.append(Equal(from_last_end_bytes, from_start_line,
                                    to_last_end_bytes, to_start_line))

        if entry['type'] == 1:  # a line included in the 'to' revision, but not in the 'from' revision
            line_tokens = len(tokenize(linebytes))
            to_end_tokens = to_start_tokens + line_tokens
            result.append(Insert(from_start_tokens, from_start_tokens,
                                 to_start_tokens, to_end_tokens,
                                 ))
            # We have now used more of the "to" tokens.
            to_last_end_bytes = to_end_tokens
        elif entry['type'] == 2:  # a line included in the 'from' revision, but not in the 'to' revision
            line_tokens = len(tokenize(linebytes))
            from_end_tokens = from_start_tokens + line_tokens
            result.append(Delete(from_start_tokens, from_end_tokens,
                                 to_start_tokens, to_start_tokens,
                                 ))
            # We have now used more of the "from" tokens.
            from_last_end_bytes = from_end_tokens
        elif entry['type'] == 3:
            # The text field is an overlapping mix of both 'from' and 'to',
            # so we need to handle it highlight-by-highlight. There can be
            # gaps between highlight segments, for instance when a word is
            # deleted from the middle of a line, and we need to track that.
            highlight_last_end = 0
            # Note that diffs are token-level, but the indexes are byte-level.
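            # Illustrative example (hypothetical values): a single word deleted
            # from the middle of the line might be reported as
            #     {"start": 12, "length": 6, "type": 1}
            # where the bytes before offset 12 form an implicit Equal gap that
            # the check below accounts for.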
            for highlightRange in entry['highlightRanges']:
                if highlightRange['start'] > highlight_last_end:
                    # Bytes between the previous highlight and this one are
                    # unchanged, so emit an Equal segment for them.
                    equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
                    equal_tokens = len(tokenize(equal_bytes))
                    from_end_tokens = from_start_tokens + equal_tokens
                    to_end_tokens = to_start_tokens + equal_tokens
                    result.append(Equal(from_start_tokens, from_end_tokens,
                                        to_start_tokens, to_end_tokens))
                    from_start_tokens = from_end_tokens
                    to_start_tokens = to_end_tokens
                rangeStart = highlightRange['start']
                rangeEnd = rangeStart + highlightRange['length']
                range_bytes = linebytes[rangeStart:rangeEnd]
                range_tokens = len(tokenize(range_bytes))
                if highlightRange['type'] == 0:
                    # Insertion: only the 'to' range advances.
                    to_end_tokens = to_start_tokens + range_tokens
                    result.append(Insert(from_start_tokens, from_start_tokens,
                                         to_start_tokens, to_end_tokens))
                    to_start_tokens = to_end_tokens
                elif highlightRange['type'] == 1:
                    # Deletion: only the 'from' range advances.
                    from_end_tokens = from_start_tokens + range_tokens
                    result.append(Delete(from_start_tokens, from_end_tokens,
                                         to_start_tokens, to_start_tokens))
                    from_start_tokens = from_end_tokens
                else:
                    raise ValueError(entry)
                highlight_last_end = highlightRange['start'] + highlightRange['length']
        elif entry['type'] == 4:
            # A paragraph moved in the 'from' revision. We need to find the
            # matching type 5 diff; stash this entry by its moveInfo id.
            parmove_from_dict[entry['moveInfo']['id']] = entry
        elif entry['type'] == 5:
            # A paragraph moved in the 'to' revision.
            parmove_to_dict[entry['moveInfo']['id']] = entry
            # For deletions and equality, report the token indexes from the
            # 'from' revision.
        else:
            # The 'type' isn't one of the known wikidiff2 types.
            raise ValueError(d)

    # Now we go through the paragraph moves, pairing each type 4 entry with its
    # matching type 5 entry via moveInfo.linkId.
    for id, from_diff in parmove_from_dict.items():
        to_diff = parmove_to_dict[from_diff['moveInfo']['linkId']]
        # TODO: calculate the correct token indexes for paragraph moves.

    # TODO: Handle trailing tokens.
    # raise Exception(result)
    return result


class WikiDiffMatcher:
    def __init__(self,
                 url: str,
                 texts: list[str],
                 tokenizer: RegexTokenizer = None,
                 ):
        # Pre-compute diffs to reduce traffic overhead.
        self.diffs = compute_diffs(url, texts)
        self.tokenizer = tokenizer or TOKENIZER

    class Processor(DiffEngine.Processor):
        def __init__(self,
                     diffs,
                     tokenizer=None
                     ):
            self.diffs = iter(diffs)
            self.tokenizer = tokenizer or TOKENIZER
            self.last_tokens = []
            self.previous_text = ""

        def update(self, last_tokens):
            self.last_tokens = last_tokens

        def process(self, text, token_class=None):
            # IDEs will report the method signature as incorrect, but this is
            # expected. The DiffEngine.Processor class must be inherited from,
            # and its process definition incorrectly excludes a "self" argument.
            # The diff has already been computed, but we need to retrieve it
            # incrementally to recreate the behavior DiffState expects.
            diff = next(self.diffs)
            tokens = self.tokenizer.tokenize(text, token_class=token_class)
            operations = to_operations(self.previous_text, text, diff, self.tokenizer)
            a = self.last_tokens
            b = tokens
            self.last_tokens = tokens
            self.previous_text = text
            return operations, a, b

    def processor(self, *args, **kwargs):
        return self.Processor(self.diffs, self.tokenizer)

    def process(self):
        # DiffState checks for this method even though it is not called.
        raise Exception("Unnecessary implementation")
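

# Minimal usage sketch: WikiDiffMatcher is meant to be plugged into
# mwpersistence's DiffState in place of a deltas diff engine. The URL below is
# hypothetical; any server speaking the wikidiff2 JSON protocol expected by
# compute_diffs() should work.
#
#     import mwpersistence
#
#     matcher = WikiDiffMatcher("http://localhost:8000", revision_texts)
#     state = mwpersistence.DiffState(matcher, revert_radius=15)
#     for text in revision_texts:
#         state.update(text)  # yields token persistence data built from our operations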