From 186cb82fb8d82b4f8817676fe87b1498e88891e5 Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Fri, 27 Jun 2025 07:13:41 -0700
Subject: [PATCH] some work on wiki_diff_matcher.py

---
 wiki_diff_matcher.py | 159 ++++++++++++++++++++++++++-----------------
 1 file changed, 96 insertions(+), 63 deletions(-)

diff --git a/wiki_diff_matcher.py b/wiki_diff_matcher.py
index 77c00ae..f930b2c 100644
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@@ -4,7 +4,8 @@ import sys
 import requests
 from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete
 
-TOKENIZER = tokenizers.text_split
+TOKENIZER = tokenizers.wikitext_split
+
 
 def compute_diffs(url: str, texts: list[str]) -> list:
     response = None
@@ -36,14 +37,22 @@ def compute_diffs(url: str, texts: list[str]) -> list:
     return incremental_diffs
 
 
-def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list:
+def to_operations(from_text:str, to_text:str, diff:str, tokenizer: RegexTokenizer) -> list:
     d = json.loads(diff)
 
+    # the code below is designed to work in bytes because that's how wikidiff2 indexes
+    from_text = from_text.encode('utf-8')
+    to_text = to_text.encode('utf-8')
+
+    # convenient function for tokenizing bytes
+    def tokenize(bytes):
+        return tokenizer.tokenize(bytes.decode('utf-8'))
+
     # Keep track of the last difference we saw in order to notice unaccounted-for
     # tokens. Each token at the end of "to" which is skipped for the next diff
     # must be represented as an "Equal()" segment.
-    from_last_end = 0
-    to_last_end = 0
+    from_last_end_bytes = 0
+    to_last_end_bytes = 0
 
     result = []
     # DiffState expects differences to be represented in order from the
@@ -52,56 +61,61 @@ def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) ->
     # As a rule of thumb, the "to" segments should be non-overlapping and
     # strictly increasing, while the "from" segments should merely be
     # non-overlapping.
-    #
+
     # wikidiff2 appears to follow this same convention, but this behavior
     # is not documented.
-    for entry in d['diff']:
-        from_start_line = entry['offset']['from']
-        to_start_line = entry['offset']['to']
-        # Per above, to_start_line appears to be nondecreasing, but
-        # from_start_line may sometimes decrease for detected paragraph moves.
+    # Note that, confusingly, for Insert operations only the "to" indexes matter
+    # and for the Delete and Equal operations only the "from" indexes matter.
+    # This is clear from reading state.py in `mwpersistence` and operations.py in `deltas`
 
-        from_start_tokens = len(tokenizer.tokenize(previous_text[:from_start_line]))
-        to_start_tokens = len(tokenizer.tokenize(next_text[:to_start_line]))
+    parmove_from_dict = {} # lookup move diffs based on moveinfo id.
+    parmove_to_dict = {}
+
+    for entry in d['diff']:
+        linebytes = entry['text'].encode('utf-8')
+        from_start_line = entry['offset']['from'] # this is the first byte of the line in the 'from' revision.
+        to_start_line = entry['offset']['to'] # this is the first byte of the line in the 'to' revision.
+
+        from_start_tokens = len(tokenize(from_text[:from_start_line]))
+        to_start_tokens = len(tokenize(to_text[:to_start_line]))
         # These constant calls to tokenizer.tokenize can definitely be optimized
         # as tokenization is currently a bottleneck. Ideally tokenization would
         # happen incrementally where possible, or somehow be cached, but this
-        # would be more complex.
+        # would be more complex. N: I think it's okay. CPU is cheap.
 
-        if entry['type'] == 0:
-            # wikidiff2 doesn't appear to emit diffs of this type, but cover anyway.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+        if entry['type'] == 0: # wikidiff2 can emit this if it is called with numContextLines != 0.
+
+            line_tokens = len(tokenize(linebytes))
             from_end_tokens = from_start_tokens + line_tokens
             to_end_tokens = to_start_tokens + line_tokens
             result.append(Equal(from_start_tokens, from_end_tokens,
                                 to_start_tokens, to_end_tokens))
-            from_last_end = from_end_tokens
-            to_last_end = to_end_tokens
+            # we need to keep track of the to and from last end bytes
+            from_last_end_bytes += len(linebytes)
+            to_last_end_bytes += len(linebytes)
             continue
         else:
             # These do not appear to be generated by wikidiff2, and so must be
            # inferred.
-            equal_tokens = to_start_tokens - to_last_end
+            equal_tokens = to_start_tokens - to_last_end_bytes
 
             # If we notice that the next non-zero segment (which must be a
             # change, given that its type is non-zero), begins after the end
             # of the previous segment, we must add an Equal segment.
-            # TODO: While the "to" token ranges are correct, the "from"
+            # TODO: While the "to" token ranges are correct,
+            # the "from"
             # ranges are likely not, particularly in histories with paragraph
-            # moves.
+            # moves. they can be corrected.
             if equal_tokens > 0:
-                result.append(Equal(from_last_end, from_start_line,
-                                    to_last_end, to_start_line))
+                # only the 'from' indexes matter
+                result.append(Equal(from_last_end_bytes, from_start_line,
+                                    to_last_end_bytes, to_start_line))
 
-
-        if entry['type'] == 1 or entry['type'] == 4:
-            # TODO: Separate out type 4 to recognize this is the insertion
-            # part of a paragraph move. Note that for paragraph moves
-            # the text is not necessarily identical, just similar.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+        if entry['type'] == 1: # a line included in the 'to' revision, but not in the 'from' revision
+            line_tokens = len(tokenize(linebytes))
             to_end_tokens = to_start_tokens + line_tokens
 
             result.append(Insert(from_start_tokens, from_start_tokens,
@@ -109,64 +123,83 @@ def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) ->
                                  ))
 
             # We have now used more of the "to" tokens.
-            to_last_end = to_end_tokens
-        elif entry['type'] == 2 or entry['type'] == 5:
-            # TODO: Separate out type 5 to recognize this is the deletion
-            # part of a paragraph move. Note that for paragraph moves
-            # the text is not necessarily identical, just similar.
-            line_tokens = len(tokenizer.tokenize(entry['text']))
+            to_start_end = to_end_tokens
+
+        elif entry['type'] == 2: # a line included in the 'from' revision, but not in the 'to' revision
+            line_tokens = len(tokenize(linebytes))
             from_end_tokens = from_start_tokens + line_tokens
 
             result.append(Delete(from_start_tokens, from_end_tokens,
                                  to_start_tokens, to_start_tokens,
                                  ))
-            # We have not used more of the "from" tokens.
-            from_last_end = from_end_tokens
+            # We have now used more of the "from" tokens.
+            from_last_end_bytes = from_end_tokens
+
         elif entry['type'] == 3:
-            # The text field is an overlapping mix of both the previous and next
-            # lines, and so we can't directly tokenize it.
-
-            text = entry['text']
-
-            last_end = 0
-            previous_line = ""
-            next_line = ""
-
-            # A line will have one or more highlightRanges.
-            # It is not guaranteed that insertions/deletions are matched,
+            # The text field is an overlapping mix of both the from and to,
+            # so we need to handle it highlight-by-highlight.
+            # there can be gaps between highlight segments.
             # for instance, if a word is deleted from the middle of a line.
-            for highlightRange in entry['highlightRanges']:
-                if highlightRange['start'] > last_end:
-                    previous_line += text[last_end:highlightRange['start']]
-                    next_line += text[last_end:highlightRange['start']]
-                    # Add an Equal segment.
+            # we need to track that.
+            highlight_last_end = 0
+            # note that diffs are token-level, but the indexes are byte-level
+            for highlightRange in entry['highlightRanges']:
+                if highlightRange['start'] > highlight_last_end:
+
+                    equal_bytes = linebytes[highlight_last_end:highlightRange['start']]
+                    equal_tokens = len(tokenize(equal_bytes))
+                    from_end_tokens = from_start_tokens + equal_tokens
+                    to_end_tokens = to_end_tokens + equal_tokens
+                    result.append(Equal(from_start_tokens, from_end_tokens,
+                                        to_start_tokens, to_end_tokens
+                                        ))
+
+                    from_start_tokens = from_end_tokens
+                    to_start_tokens = to_end_tokens
+
                 rangeStart = highlightRange['start']
                 rangeEnd = rangeStart + highlightRange['length']
-
+                range_bytes = linebytes[rangeStart:rangeEnd]
+                range_tokens = len(tokenize(range_bytes))
                 if highlightRange['type'] == 0:
                     # Insertion
-                    next_line += text[rangeStart:rangeEnd]
+                    to_end_tokens = to_start_tokens + range_tokens
+                    result.append(Insert(from_start_tokens, from_end_tokens,
+                                         to_start_tokens, to_end_tokens))
 
-                    # Add an Insert segment.
+                    to_start_tokens = to_end_tokens
                 elif highlightRange['type'] == 1:
                     # Deletion
-                    previous_line += text[rangeStart:rangeEnd]
-
-                    # Add a Delete segment.
+                    from_end_tokens = from_start_tokens + range_tokens
+                    result.append(Delete(from_start_tokens, from_end_tokens,
+                                         to_start_tokens, to_end_tokens))
+                    from_start_tokens = from_end_tokens
+
                 else:
                     raise Exception(entry)
+
+                highlight_last_end = highlightRange['start'] + highlightRange['length']
 
-            from_tokens = len(tokenizer.tokenize(previous_line))
-            to_tokens = len(tokenizer.tokenize(next_line))
+        elif entry['type'] == 4:
 
-            from_start_tokens += from_tokens
-            to_start_tokens += to_tokens
+            parmove_from_dict['moveInfo']['id'] = diff
+
+        elif entry['type'] == 5:
+
+            parmove_to_dict['moveInfo']['id'] = diff
+            # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff.
+            # for deletions and equality report the token indexes from the 'from' revision.
         else:
             # The 'type' isn't one of the known
             raise ValueError(d)
 
+    # now we go through the parmoves
+    for id, from_diff in parmove_from_dict.items():
+        to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']]
+        ### TODO calculate the correct token indexes.
+
     # TODO: Handle trailing tokens
     # raise Exception(result)
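
The closing loop over parmove_from_dict is still a stub: the type 4/5 branches never actually key the dicts by moveInfo id, and the final lookup reads the 'from' dict where the 'to' dict is wanted. A minimal sketch of the pairing, not part of the patch, assuming only what the lookups above already assume about wikidiff2's output (each type 4 and type 5 entry carries a moveInfo whose 'id' names the entry and whose 'linkId' names its counterpart); the helper name match_paragraph_moves is made up for the sketch:

    def match_paragraph_moves(diff_entries):
        # Collect the two halves of each detected paragraph move, keyed by moveInfo id.
        parmove_from_dict = {}  # type 4: the line at its old position in the 'from' revision
        parmove_to_dict = {}    # type 5: the line at its new position in the 'to' revision
        for entry in diff_entries:
            if entry['type'] == 4:
                parmove_from_dict[entry['moveInfo']['id']] = entry
            elif entry['type'] == 5:
                parmove_to_dict[entry['moveInfo']['id']] = entry

        # Pair each 'from' half with the 'to' half its linkId points at.
        pairs = []
        for from_entry in parmove_from_dict.values():
            to_entry = parmove_to_dict.get(from_entry['moveInfo']['linkId'])
            if to_entry is not None:
                pairs.append((from_entry, to_entry))
            # An unmatched type 4 entry could instead be reported as a plain Delete.
        return pairs

Converting each pair's byte offsets into token indexes, the "### TODO calculate the correct token indexes" above, would still be needed before emitting the corresponding Delete/Insert operations.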
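For orientation, this is roughly how the two functions are meant to compose once to_operations is finished. The service URL and the toy revision texts below are placeholders, not values from the patch, and compute_diffs is assumed to return one wikidiff2 JSON diff per consecutive pair of revisions:

    from deltas import tokenizers

    from wiki_diff_matcher import compute_diffs, to_operations

    texts = ["Hello world.\n", "Hello brave new world.\n"]  # two consecutive revisions
    diffs = compute_diffs("http://localhost:8080", texts)   # placeholder wikidiff2 service URL

    ops = to_operations(texts[0], texts[1], diffs[0], tokenizers.wikitext_split)
    for op in ops:
        print(op)  # Equal/Insert/Delete operations over token index ranges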