# mediawiki_dump_tools/wiki_diff_matcher.py

import json
import sys

import requests
from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete

TOKENIZER = tokenizers.wikitext_split


def compute_diffs(url: str, texts: list[str]) -> list:
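    """POST the list of revision texts to a wikidiff2 server and return its diffs.

    Illustrative sketch (the URL is hypothetical; any endpoint that speaks the
    expected wikidiff2 JSON protocol should work):

        diffs = compute_diffs("http://localhost:8000", ["first revision", "second revision"])

    Each element of the returned list is expected to be a JSON string
    consumable by to_operations(), one diff per input text.
    """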
    response = None
    try:
        response = requests.post(url, json=texts)
        response.raise_for_status()
        incremental_diffs = response.json()
    except requests.exceptions.ConnectionError as e:
        print(f"Connection Error: Could not connect to the server at {url}. "
              "Make sure your local server is running.")
        print(e)
        raise e
    except requests.exceptions.HTTPError as e:
        print(f"HTTP Error: {e}")
        if response is not None:
            print(f"Response Body: {response.text}")
        raise e
    except requests.exceptions.JSONDecodeError as e:
        # Must come before RequestException because JSONDecodeError is a subclass.
        print(f"JSON Decode Error: {e}", file=sys.stderr)
        if response is not None:
            print(f"Response Body: {response.text}", file=sys.stderr)
        raise e
    except requests.exceptions.RequestException as e:
        print(f"An unexpected error occurred: {e}")
        raise e
    return incremental_diffs


def to_operations(from_text: str, to_text: str, diff: str, tokenizer: RegexTokenizer) -> list:
    d = json.loads(diff)
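
    # Expected shape of the parsed wikidiff2 output, based only on the fields
    # this function reads (wikidiff2 may emit additional fields):
    #
    #     {"diff": [{"type": 0,                       # 0=context, 1=insert, 2=delete,
    #                                                 # 3=change, 4/5=paragraph move (from/to)
    #                "text": "line content",
    #                "offset": {"from": 0, "to": 0},  # byte offsets of the line
    #                "highlightRanges": [             # present for type 3 entries
    #                    {"start": 0, "length": 4, "type": 0}],
    #                "moveInfo": {"id": "...", "linkId": "..."}}]}  # types 4 and 5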

    # The code below is designed to work in bytes because that's how wikidiff2 indexes.
    from_text = from_text.encode('utf-8')
    to_text = to_text.encode('utf-8')

    # Convenient helper for tokenizing bytes.
    def tokenize(bytes):
        return tokenizer.tokenize(bytes.decode('utf-8'))

    # Keep track of the last difference we saw in order to notice unaccounted-for
    # tokens. Each token at the end of "to" which is skipped for the next diff
    # must be represented as an Equal() segment.
    from_last_end_bytes = 0
    to_last_end_bytes = 0
    result = []

    # DiffState expects differences to be represented in order from the
    # result's perspective ("to"), not the previous text's. Thus, if a line
    # is moved earlier, its insertion should appear before its deletion.
    # As a rule of thumb, the "to" segments should be non-overlapping and
    # strictly increasing, while the "from" segments should merely be
    # non-overlapping.
    # wikidiff2 appears to follow this same convention, but the behavior
    # is not documented.
    # Note that, confusingly, for Insert operations only the "to" indexes
    # matter, and for Delete and Equal operations only the "from" indexes
    # matter. This is clear from reading state.py in `mwpersistence` and
    # operations.py in `deltas`.
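    # Concrete illustration (hypothetical indexes): a 5-token paragraph moved
    # from the end of a 100-token page to the very top would be reported as
    # Insert(0, 0, 0, 5) near the start of the operation list and
    # Delete(95, 100, ...) later on, so the "to" ranges stay strictly
    # increasing while the "from" ranges are merely non-overlapping.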

    parmove_from_dict = {}  # look up move diffs based on moveInfo id
    parmove_to_dict = {}
    for entry in d['diff']:
        linebytes = entry['text'].encode('utf-8')
        from_start_line = entry['offset']['from']  # first byte of the line in the 'from' revision
        to_start_line = entry['offset']['to']  # first byte of the line in the 'to' revision
        from_start_tokens = len(tokenize(from_text[:from_start_line]))
        to_start_tokens = len(tokenize(to_text[:to_start_line]))
        # These constant calls to tokenizer.tokenize can definitely be optimized,
        # as tokenization is currently a bottleneck. Ideally tokenization would
        # happen incrementally where possible, or somehow be cached, but this
        # would be more complex. N: I think it's okay. CPU is cheap.
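        # One possible caching sketch (not implemented here): tokenize each full
        # revision once up front and record the byte offset at which each token
        # starts; from_start_tokens/to_start_tokens then become binary searches
        # over those offsets instead of repeated prefix tokenizations.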
        if entry['type'] == 0:  # wikidiff2 can emit this if it is called with numContextLines != 0
            line_tokens = len(tokenize(linebytes))
            from_end_tokens = from_start_tokens + line_tokens
            to_end_tokens = to_start_tokens + line_tokens
            result.append(Equal(from_start_tokens, from_end_tokens,
                                to_start_tokens, to_end_tokens))
            # We need to keep track of the 'to' and 'from' last end bytes.
            from_last_end_bytes += len(linebytes)
            to_last_end_bytes += len(linebytes)
            continue
        else:
            # These Equal segments do not appear to be generated by wikidiff2,
            # and so must be inferred.
            equal_tokens = to_start_tokens - to_last_end_bytes
            # If we notice that the next non-zero segment (which must be a
            # change, given that its type is non-zero) begins after the end
            # of the previous segment, we must add an Equal segment.
            # TODO: While the "to" token ranges are correct, the "from"
            # ranges are likely not, particularly in histories with paragraph
            # moves. They can be corrected.
            if equal_tokens > 0:
                # Only the 'from' indexes matter.
                result.append(Equal(from_last_end_bytes, from_start_line,
                                    to_last_end_bytes, to_start_line))

        if entry['type'] == 1:  # a line included in the 'to' revision, but not in the 'from' revision
            line_tokens = len(tokenize(linebytes))
            to_end_tokens = to_start_tokens + line_tokens
            result.append(Insert(from_start_tokens, from_start_tokens,
                                 to_start_tokens, to_end_tokens,
                                 ))
            # We have now used more of the "to" tokens.
            to_last_end_bytes = to_end_tokens
        elif entry['type'] == 2:  # a line included in the 'from' revision, but not in the 'to' revision
            line_tokens = len(tokenize(linebytes))
            from_end_tokens = from_start_tokens + line_tokens
            result.append(Delete(from_start_tokens, from_end_tokens,
                                 to_start_tokens, to_start_tokens,
                                 ))
            # We have now used more of the "from" tokens.
            from_last_end_bytes = from_end_tokens
        elif entry['type'] == 3:
            # The text field is an overlapping mix of both 'from' and 'to',
            # so we need to handle it highlight-by-highlight. There can be
            # gaps between highlight segments, for instance when a word is
            # deleted from the middle of a line, and we need to track that.
            highlight_last_end = 0
            # Note that diffs are token-level, but the indexes are byte-level.
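            # Illustrative example (hypothetical values): a single word deleted
            # from the middle of the line might be reported as
            #     {"start": 12, "length": 6, "type": 1}
            # where the bytes before offset 12 form an implicit Equal gap that
            # the check below accounts for.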
            for highlightRange in entry['highlightRanges']:
                if highlightRange['start'] > highlight_last_end:
                    # Bytes between the previous highlight and this one are
                    # unchanged, so emit an Equal segment for them.
                    equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
                    equal_tokens = len(tokenize(equal_bytes))
                    from_end_tokens = from_start_tokens + equal_tokens
                    to_end_tokens = to_start_tokens + equal_tokens
                    result.append(Equal(from_start_tokens, from_end_tokens,
                                        to_start_tokens, to_end_tokens))
                    from_start_tokens = from_end_tokens
                    to_start_tokens = to_end_tokens
                rangeStart = highlightRange['start']
                rangeEnd = rangeStart + highlightRange['length']
                range_bytes = linebytes[rangeStart:rangeEnd]
                range_tokens = len(tokenize(range_bytes))
                if highlightRange['type'] == 0:
                    # Insertion: only the 'to' range advances.
                    to_end_tokens = to_start_tokens + range_tokens
                    result.append(Insert(from_start_tokens, from_start_tokens,
                                         to_start_tokens, to_end_tokens))
                    to_start_tokens = to_end_tokens
                elif highlightRange['type'] == 1:
                    # Deletion: only the 'from' range advances.
                    from_end_tokens = from_start_tokens + range_tokens
                    result.append(Delete(from_start_tokens, from_end_tokens,
                                         to_start_tokens, to_start_tokens))
                    from_start_tokens = from_end_tokens
                else:
                    raise ValueError(entry)
                highlight_last_end = highlightRange['start'] + highlightRange['length']
        elif entry['type'] == 4:
            # A paragraph moved in the 'from' revision. We need to find the
            # matching type 5 diff; stash this entry by its moveInfo id.
            parmove_from_dict[entry['moveInfo']['id']] = entry
        elif entry['type'] == 5:
            # A paragraph moved in the 'to' revision.
            parmove_to_dict[entry['moveInfo']['id']] = entry
            # For deletions and equality, report the token indexes from the
            # 'from' revision.
        else:
            # The 'type' isn't one of the known wikidiff2 types.
            raise ValueError(d)

    # Now we go through the paragraph moves, pairing each type 4 entry with its
    # matching type 5 entry via moveInfo.linkId.
    for id, from_diff in parmove_from_dict.items():
        to_diff = parmove_to_dict[from_diff['moveInfo']['linkId']]
        # TODO: calculate the correct token indexes for paragraph moves.

    # TODO: Handle trailing tokens.
    # raise Exception(result)
    return result


class WikiDiffMatcher:
    def __init__(self,
                 url: str,
                 texts: list[str],
                 tokenizer: RegexTokenizer = None,
                 ):
        # Pre-compute diffs to reduce traffic overhead.
        self.diffs = compute_diffs(url, texts)
        self.tokenizer = tokenizer or TOKENIZER

    class Processor(DiffEngine.Processor):
        def __init__(self,
                     diffs,
                     tokenizer=None
                     ):
            self.diffs = iter(diffs)
            self.tokenizer = tokenizer or TOKENIZER
            self.last_tokens = []
            self.previous_text = ""

        def update(self, last_tokens):
            self.last_tokens = last_tokens

        def process(self, text, token_class=None):
            # IDEs will report the method signature as incorrect, but this is
            # expected. The DiffEngine.Processor class must be inherited from,
            # and its process definition incorrectly excludes a "self" argument.
            # The diff has already been computed, but we need to retrieve it
            # incrementally to recreate the behavior DiffState expects.
            diff = next(self.diffs)
            tokens = self.tokenizer.tokenize(text, token_class=token_class)
            operations = to_operations(self.previous_text, text, diff, self.tokenizer)
            a = self.last_tokens
            b = tokens
            self.last_tokens = tokens
            self.previous_text = text
            return operations, a, b

    def processor(self, *args, **kwargs):
        return self.Processor(self.diffs, self.tokenizer)

    def process(self):
        # DiffState checks for this method even though it is not called.
        raise Exception("Unnecessary implementation")
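

# Minimal usage sketch: WikiDiffMatcher is meant to be plugged into
# mwpersistence's DiffState in place of a deltas diff engine. The URL below is
# hypothetical; any server speaking the wikidiff2 JSON protocol expected by
# compute_diffs() should work.
#
#     import mwpersistence
#
#     matcher = WikiDiffMatcher("http://localhost:8000", revision_texts)
#     state = mwpersistence.DiffState(matcher, revert_radius=15)
#     for text in revision_texts:
#         state.update(text)  # yields token persistence data built from our operations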