import json
import sys

import requests
from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete

TOKENIZER = tokenizers.wikitext_split


def compute_diffs(url: str, texts: list[str]) -> list:
    response = None
    try:
        response = requests.post(url, json=texts)
        response.raise_for_status()
        incremental_diffs = response.json()
    except requests.exceptions.ConnectionError as e:
        print(f"Connection Error: Could not connect to the server at {url}. "
              "Make sure your local server is running.")
        print(e)
        raise e
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        if response is not None:
            print(f"Response Body: {response.text}")
        raise e
    except requests.exceptions.JSONDecodeError as e:
        # Must come before RequestException as JSONDecodeError is a subclass.
        print(f"JSON Decode Error: {e}", file=sys.stderr)
        if response is not None:
            print(f"Response Body: {response.text}", file=sys.stderr)
        raise e
    except requests.exceptions.RequestException as e:
        print(f"An unexpected error occurred: {e}")
        raise e

    return incremental_diffs


def to_operations(from_text: str, to_text: str, diff: str, tokenizer: RegexTokenizer) -> list:
    d = json.loads(diff)

    # The code below is designed to work in bytes because that's how wikidiff2 indexes.
    from_text = from_text.encode('utf-8')
    to_text = to_text.encode('utf-8')

    # Convenient helper for tokenizing bytes.
    def tokenize(data):
        return tokenizer.tokenize(data.decode('utf-8'))

    # Keep track of the last difference we saw in order to notice unaccounted-for
    # tokens. Each token at the end of "to" which is skipped for the next diff
    # must be represented as an "Equal()" segment.
    from_last_end_bytes = 0
    to_last_end_bytes = 0

    result = []

    # DiffState expects differences to be represented in order from the
    # result's perspective ("to"), not the previous text. Thus, if a line
    # is moved earlier, its insertion should appear before its deletion.
    # As a rule of thumb, the "to" segments should be non-overlapping and
    # strictly increasing, while the "from" segments should merely be
    # non-overlapping.
    # wikidiff2 appears to follow this same convention, but the behavior
    # is not documented.

    # Note that, confusingly, for Insert operations only the "to" indexes matter,
    # and for Delete and Equal operations only the "from" indexes matter.
    # This is clear from reading state.py in `mwpersistence` and operations.py
    # in `deltas`.

    parmove_from_dict = {}  # Look up move diffs by moveInfo id.
    parmove_to_dict = {}

    for entry in d['diff']:
        linebytes = entry['text'].encode('utf-8')
        from_start_line = entry['offset']['from']  # first byte of the line in the 'from' revision
        to_start_line = entry['offset']['to']      # first byte of the line in the 'to' revision

        from_start_tokens = len(tokenize(from_text[:from_start_line]))
        to_start_tokens = len(tokenize(to_text[:to_start_line]))

        # These constant calls to tokenizer.tokenize can definitely be optimized,
        # as tokenization is currently a bottleneck. Ideally tokenization would
        # happen incrementally where possible, or somehow be cached, but this
        # would be more complex. N: I think it's okay. CPU is cheap.

        if entry['type'] == 0:
            # An unchanged (context) line. wikidiff2 can emit this if it is
            # called with numContextLines != 0.
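            # Illustrative shape of a context (type 0) entry, limited to the
            # fields this function reads (the values shown are hypothetical):
            #   {"type": 0, "text": "An unchanged line",
            #    "offset": {"from": 120, "to": 140}}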
            line_tokens = len(tokenize(linebytes))
            from_end_tokens = from_start_tokens + line_tokens
            to_end_tokens = to_start_tokens + line_tokens
            result.append(Equal(from_start_tokens, from_end_tokens,
                                to_start_tokens, to_end_tokens))
            # We need to keep track of the 'to' and 'from' last end bytes.
            from_last_end_bytes += len(linebytes)
            to_last_end_bytes += len(linebytes)
            continue
        else:
            # Equal segments do not appear to be generated by wikidiff2, and so
            # must be inferred.
            equal_tokens = to_start_tokens - to_last_end_bytes
            # If we notice that the next non-zero segment (which must be a
            # change, given that its type is non-zero) begins after the end
            # of the previous segment, we must add an Equal segment.
            # TODO: While the "to" token ranges are correct, the "from"
            # ranges are likely not, particularly in histories with paragraph
            # moves. They can be corrected.
            if equal_tokens > 0:
                # Only the 'from' indexes matter here.
                result.append(Equal(from_last_end_bytes, from_start_line,
                                    to_last_end_bytes, to_start_line))

        if entry['type'] == 1:
            # A line included in the 'to' revision, but not in the 'from' revision.
            line_tokens = len(tokenize(linebytes))
            to_end_tokens = to_start_tokens + line_tokens
            result.append(Insert(from_start_tokens, from_start_tokens,
                                 to_start_tokens, to_end_tokens))
            # We have now used more of the "to" tokens.
            to_last_end_bytes = to_end_tokens
        elif entry['type'] == 2:
            # A line included in the 'from' revision, but not in the 'to' revision.
            line_tokens = len(tokenize(linebytes))
            from_end_tokens = from_start_tokens + line_tokens
            result.append(Delete(from_start_tokens, from_end_tokens,
                                 to_start_tokens, to_start_tokens))
            # We have now used more of the "from" tokens.
            from_last_end_bytes = from_end_tokens
        elif entry['type'] == 3:
            # The text field is an overlapping mix of both the from and to,
            # so we need to handle it highlight-by-highlight.
            # There can be gaps between highlight segments, for instance when a
            # word is deleted from the middle of a line; we need to track that.
            highlight_last_end = 0
            # Note that diffs are token-level, but the indexes are byte-level.
            for highlightRange in entry['highlightRanges']:
                if highlightRange['start'] > highlight_last_end:
                    # Emit an Equal segment for the unhighlighted gap.
                    equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
                    equal_tokens = len(tokenize(equal_bytes))
                    from_end_tokens = from_start_tokens + equal_tokens
                    to_end_tokens = to_start_tokens + equal_tokens
                    result.append(Equal(from_start_tokens, from_end_tokens,
                                        to_start_tokens, to_end_tokens))
                    from_start_tokens = from_end_tokens
                    to_start_tokens = to_end_tokens
                rangeStart = highlightRange['start']
                rangeEnd = rangeStart + highlightRange['length']
                range_bytes = linebytes[rangeStart:rangeEnd]
                range_tokens = len(tokenize(range_bytes))
                if highlightRange['type'] == 0:
                    # Insertion.
                    to_end_tokens = to_start_tokens + range_tokens
                    result.append(Insert(from_start_tokens, from_start_tokens,
                                         to_start_tokens, to_end_tokens))
                    to_start_tokens = to_end_tokens
                elif highlightRange['type'] == 1:
                    # Deletion.
                    from_end_tokens = from_start_tokens + range_tokens
                    result.append(Delete(from_start_tokens, from_end_tokens,
                                         to_start_tokens, to_start_tokens))
                    from_start_tokens = from_end_tokens
                else:
                    raise Exception(entry)
                highlight_last_end = highlightRange['start'] + highlightRange['length']
        elif entry['type'] == 4:
            # A paragraph moved away from this position in the 'from' revision.
            parmove_from_dict[entry['moveInfo']['id']] = entry
        elif entry['type'] == 5:
            # A paragraph moved to this position in the 'to' revision.
            parmove_to_dict[entry['moveInfo']['id']] = entry
            # For type 4 diffs (paragraph moved in the from revision) we need to
            # find a matching type 5 diff.
            # For deletions and equality, report the token indexes from the
            # 'from' revision.
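            # Illustrative shape of the paragraph-move entries handled above,
            # limited to the moveInfo fields this function uses (the values
            # shown are hypothetical): a type 4 entry's moveInfo.linkId names
            # the id of its matching type 5 entry, e.g.
            #   {"type": 4, "text": "Moved paragraph", "offset": {...},
            #    "moveInfo": {"id": "movedpara_1_0", "linkId": "movedpara_1_1"}}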
        else:
            # The 'type' isn't one of the known wikidiff2 entry types.
            raise ValueError(d)

    # Now we go through the paragraph moves: match each type 4 entry to its
    # type 5 counterpart via moveInfo.linkId.
    for move_id, from_diff in parmove_from_dict.items():
        to_diff = parmove_to_dict[from_diff['moveInfo']['linkId']]
        # TODO: calculate the correct token indexes.

    # TODO: Handle trailing tokens.
    # raise Exception(result)

    return result


class WikiDiffMatcher:
    def __init__(self,
                 url: str,
                 texts: list[str],
                 tokenizer: RegexTokenizer = None,
                 ):
        # Pre-compute diffs to reduce traffic overhead.
        self.diffs = compute_diffs(url, texts)
        self.tokenizer = tokenizer or TOKENIZER

    class Processor(DiffEngine.Processor):
        def __init__(self, diffs, tokenizer=None):
            self.diffs = iter(diffs)
            self.tokenizer = tokenizer or TOKENIZER
            self.last_tokens = []
            self.previous_text = ""

        def update(self, last_tokens):
            self.last_tokens = last_tokens

        def process(self, text, token_class=None):
            # IDEs will report the method signature as incorrect, but this is
            # expected. The DiffEngine.Processor class must be inherited from,
            # and its process definition incorrectly excludes a "self" argument.

            # The diff has already been computed, but we need to incrementally
            # retrieve it to recreate the behavior DiffState expects.
            diff = next(self.diffs)
            tokens = self.tokenizer.tokenize(text, token_class=token_class)
            operations = to_operations(self.previous_text, text, diff, self.tokenizer)

            a = self.last_tokens
            b = tokens
            self.last_tokens = tokens
            self.previous_text = text
            return operations, a, b

    def processor(self, *args, **kwargs):
        return self.Processor(self.diffs, self.tokenizer)

    def process(self):
        # DiffState checks for this method even though it is not called.
        raise Exception("Unnecessary implementation")
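

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module proper. It assumes a
    # wikidiff2-compatible diff service listening at the URL below (the URL is
    # an assumption; substitute your own endpoint) and drives the processor()
    # interface above through mwpersistence.DiffState to track token
    # persistence across revisions. The DiffState constructor and update()
    # signature follow mwpersistence's documented usage pattern; adjust them
    # if your version differs.
    import mwpersistence

    revisions = [
        "Apples are red.",
        "Apples are red. Bananas are yellow.",
        "Apples are green. Bananas are yellow.",
    ]
    matcher = WikiDiffMatcher("http://localhost:8080", revisions)
    state = mwpersistence.DiffState(matcher, revert_radius=15)
    for text in revisions:
        tokens, tokens_added, tokens_removed = state.update(text)
        print(len(tokens), len(tokens_added), len(tokens_removed))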