diff --git a/runtest.sh b/runtest.sh
index 3f38038..6c2d25a 100755
--- a/runtest.sh
+++ b/runtest.sh
@@ -1,2 +1,2 @@
 #!/usr/bin/env bash
-uv run pytest test/test_wiki_diff_matcher.py --capture=tee-sys
+uv run pytest test/test_wiki_diff_matcher.py::test_addition --capture=tee-sys
diff --git a/test/test_wiki_diff_matcher.py b/test/test_wiki_diff_matcher.py
index ea98184..8e0eabd 100644
--- a/test/test_wiki_diff_matcher.py
+++ b/test/test_wiki_diff_matcher.py
@@ -8,30 +8,36 @@ import pytest_asyncio
 from typing import List
 from deltas import Delete, Equal, Insert, wikitext_split
 from mwpersistence import Token
+
 from wiki_diff_matcher import WikiDiffMatcher
-
-@pytest_asyncio.fixture(scope="module")
+@pytest_asyncio.fixture(scope="module", autouse=True)
 async def start_stop_server():
+    print("starting server")
     proc = await asyncio.create_subprocess_exec("php",
                                                 "-S",
                                                 "127.0.0.1:8000",
                                                 "wikidiff2_api.php",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)
+    # php needs a moment to actually start
+    await asyncio.sleep(0.1)
     yield proc
-    stdout, stderr = await proc.communicate()
-    print(stdout.encode())
-    print(stderr.encode())
+    print("stopping server")
     proc.terminate()
-
-
+    stdout, stderr = await proc.communicate()
+    print(stdout.decode())
+    print(stderr.decode())
+
 def assert_equal_enough(tokens:List[Token], rev):
     # the tokens exclude newlines
     # we allow extra whitespace at the beginning or end
     token_doc = ''.join(str(t) for t in tokens).strip()
-    rev_doc = rev.replace('\n','').strip()
+    while '\n\n' in token_doc:
+        token_doc = token_doc.replace('\n\n','\n')
+    while '\n\n' in rev:
+        rev = rev.replace('\n\n','\n').strip()
     print(token_doc, file = open('token','w'))
-    print(rev_doc, file = open('rev','w'))
-    assert token_doc == rev_doc
+    print(rev, file = open('rev','w'))
+    assert token_doc == rev
 
 
 def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_tokens):
@@ -70,15 +76,59 @@ def test_equality():
     diff_processor = matcher.processor()
     ops, a, b = diff_processor.process(rev1)
     ops, a, b = diff_processor.process(rev1 + " ")
-    assert len(ops) == 129
-    for op in ops[:-1]:
-        assert isinstance(op, Equal)
+    assert len(ops) == 258
+    for op in ops[:-2]:
+        print(op)
+        assert isinstance(op, Equal)
     # note that the whitespace token does not result in a token according to wikitext_split
     # compare the tokens based on the diffs to the baseline
     # whitespace differences are allowed
     assert_equal_enough(b, rev1)
 
+def test_highlight_range_3():
+    rev1 = open("test/test_diff_revisions/test_highlight_3_from").read()
+    rev2 = open("test/test_diff_revisions/test_highlight_3_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+def test_highlight_range_4():
+    rev1 = open("test/test_diff_revisions/test_highlight_4_from").read()
+    rev2 = open("test/test_diff_revisions/test_highlight_4_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+def test_complex_diff():
+    rev1 = open("test/test_diff_revisions/test_complex_from").read()
+    rev2 = open("test/test_diff_revisions/test_complex_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+
+
+def test_highlight_range_unicode():
+    rev1 = open("test/test_diff_revisions/test_unicode_highlight_from").read()
+    rev2 = open("test/test_diff_revisions/test_unicode_highlight_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+
 def test_highlight_range():
     rev1 = open("test/test_diff_revisions/1295229484_rangeedit0").read()
     rev2 = open("test/test_diff_revisions/1295229484_rangeedit1").read()
@@ -108,28 +158,38 @@ def test_delete():
     n_deleted_tokens = 0
     last_b2 = initial_equal_tokens
-    initial_equal_lines = 2
-    initial_equal_tokens = 12
+    initial_equal_lines = 4
+    initial_equal_tokens = 14
     last_b2 = assert_correct_equal_section(ops,
                                            expected_equal_lines=initial_equal_lines,
                                            expected_equal_tokens=initial_equal_tokens)
 
     first_noninsert_token = initial_equal_tokens
+    last_non_delete = False
+    idx = 0
     for op in ops[initial_equal_lines:]:
+        idx += 1
+        # deletes are interleaved with Equal newlines.
         if not isinstance(op, Delete):
-            first_nondelete_token = op.a1
-            break
-        n_deletes += 1
-        n_deleted_tokens += op.a2 - last_b2
-        last_b2 = op.a2
-
+            if last_non_delete:
+                first_nondelete_token = op.a1
+                break
+            last_non_delete = True
+        else:
+            last_non_delete = False
+        if last_non_delete:
+            n_deletes += 1
+            n_deleted_tokens += op.a2 - last_b2
+            last_b2 = op.a2
+
     assert n_deletes == 2
-    assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 316
+    assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
 
-    last_b2 = assert_correct_equal_section(ops[initial_equal_lines + n_deletes:],
-                                           expected_equal_lines=126,
-                                           expected_equal_tokens=9323)
+
+    last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
+                                           expected_equal_lines=252,
+                                           expected_equal_tokens=9765)
 
     # first lets test that we properly build the operations.
@@ -144,8 +204,14 @@ def test_addition():
     # so they reflect the state of the text according to the diff processor
     ops, a, b = diff_processor.process(rev1)
+    even = True
     for op in ops:
-        assert isinstance(op, Insert)
+        if even:
+            assert isinstance(op, Insert)
+            even = False
+        else:
+            assert isinstance(op, Equal)
+            even = True
 
     assert_equal_enough(b, rev1)
 
@@ -155,21 +221,26 @@ def test_addition():
     assert_equal_enough(a, rev1)
     assert_equal_enough(b, rev2)
     ops = list(ops)
-    initial_equal_lines = 128
-    initial_equal_tokens = 9359
+    initial_equal_lines = 256
+    initial_equal_tokens = 9487
     last_b2 = assert_correct_equal_section(ops,
                                            expected_equal_lines=initial_equal_lines,
                                            expected_equal_tokens=initial_equal_tokens)
 
+    last_non_insert = False
     first_noninsert_token = None
     n_inserts = 0
     n_inserted_tokens = 0
-    last_b2 = initial_equal_tokens
+    last_b2 = last_insert_b2 = initial_equal_tokens
+    idx = 0
+    print(ops[initial_equal_lines:])
     for op in ops[initial_equal_lines:]:
-        n_inserts += 1
-        n_inserted_tokens += op.b2 - last_b2
+        if isinstance(op, Insert):
+            n_inserts += 1
+            n_inserted_tokens += op.b2 - op.b1
+            last_insert_b2 = op.b2
         last_b2 = op.b2
-    assert n_inserted_tokens == last_b2 - initial_equal_tokens == 292
+    assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
     assert n_inserts == 2
 
 def test_paragraph_move():
@@ -195,6 +266,26 @@ def test_paragraph_move_and_change():
     # so they reflect the state of the text according to the diff processor
     ops, a, b = diff_processor.process(rev1)
     ops, a, b = diff_processor.process(rev2)
-    assert_equal_enough(b, rev2)
     assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
 
+# slow test
+def test_diff_consistency():
+    from mwxml import Dump
+    stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/sailormoon.xml.7z", "*.xml"], stdout=subprocess.PIPE).stdout
+
+    dump = Dump.from_file(stream)
+    for page in dump:
+        revisions = [rev.text for rev in page if rev.text]
+
+        matcher = WikiDiffMatcher(revisions)
+        diff_processor = matcher.processor()
+        last_rev = ""
+        for rev in revisions:
+            print(rev, file=open("test_unicode_highlight_to",'w'))
+            print(last_rev, file=open("test_unicode_highlight_from",'w'))
+            ops, a, b = diff_processor.process(rev)
+            #assert_equal_enough(a, last_rev)
+
+            assert_equal_enough(b, rev)
+            last_rev = rev
diff --git a/wiki_diff_matcher.py b/wiki_diff_matcher.py
index 1399ffb..b7812dc 100644
--- a/wiki_diff_matcher.py
+++ b/wiki_diff_matcher.py
@@ -4,13 +4,15 @@
 from collections import namedtuple
 from itertools import chain
 from typing import Dict, Generator, List, Optional, Tuple
-
 import requests
-from deltas import Equal, Insert, Delete, DiffEngine, Operation, RegexTokenizer, tokenizers
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation, Token,
+                    RegexTokenizer, tokenizers)
 
 TOKENIZER = tokenizers.wikitext_split
 
+
 def find_greatest_le_key(target_key, data_dict):
+    found_key = None
     for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
         if key <= target_key:
             found_key = (
@@ -62,6 +64,8 @@ class DiffToOperationMap:
         self.from_last_end_bytes = 0
         self.from_last_to_bytes = 0
         self.n_from_start_tokens = 0
+        self.n_from_end_tokens = 0
+        self.n_from_start_tokens = 0
         self.n_to_start_tokens = 0
         self.last_to_start_line = 0
         self.last_from_start_line = 0
@@ -73,13 +77,35 @@ class DiffToOperationMap:
         self.to_byte_token_index_map: Dict[int, int] = {}
         self.par_move_dict = {}
 
+        # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
+        self.to_linenumber_bytes_map = {}
+
     def tokenize(self, bytes):
         return self.tokenizer.tokenize(bytes.decode("utf-8"))
 
+    def newline_result(self):
+        self.n_from_end_tokens += 1
+        self.n_from_start_tokens += 1
+        self.n_to_end_tokens += 1
+        self.n_to_start_tokens +=1
+
+        return (Equal(self.n_from_start_tokens - 1,
+                      self.n_from_end_tokens,
+                      self.n_to_start_tokens - 1,
+                      self.n_from_start_tokens),
+                [Token('\n')],
+                [Token('\n')])
+
+
     def to_operations(self):
         parmoves = []
+        [print(diff) for diff in self.diff["diff"][0:5]]
         for entry in self.diff["diff"]:
             offset = entry["offset"]
+
+            if offset["to"]:
+                self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"]
+
             text = entry["text"]
             # ignore empty diffs. They don't have any tokens
             if len(text) == 0:
@@ -91,30 +117,58 @@ class DiffToOperationMap:
             if entry["type"] == 0:
                 yield from self.doEqual(text, offset)
+                yield self.newline_result()
             # a line included in the 'to' revision, but not in the 'from' revision
             elif entry["type"] == 1:
                 yield from self.doInsert(text, offset)
+                yield self.newline_result()
             # a line included in the 'from' revision, but not in the 'to' revision
             elif entry["type"] == 2:
                 yield from self.doDelete(text, offset)
+                yield self.newline_result()
             elif entry["type"] == 3:
                 yield from self.doHighlightRange(
-                    text, entry["highlightRanges"], offset
+                    text, entry["highlightRanges"], offset, entry["lineNumber"]
                 )
+                yield self.newline_result()
             elif entry["type"] == 4:
                 self.par_move_dict[entry["moveInfo"]["id"]] = entry
-                # we need to count the tokens in the from revision so token index is correct
-                self.n_from_end_tokens += len(self.tokenize(entry['text'].encode()))
-                self.n_from_start_tokens += len(self.tokenize(entry['text'].encode()))
+                linkId = entry["moveInfo"]["linkId"]
+                if linkId in self.par_move_dict:
+                    yield from self.doParMove(entry, self.par_move_dict[linkId])
+                    yield self.newline_result()
+                else:
+                    # we need to count the tokens in the from revision so token index is correct
+                    self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
+                    self.n_from_start_tokens += len(
+                        self.tokenize(entry["text"].encode())
+                    )
             elif entry["type"] == 5:
-                yield from self.doParMove(entry)
-
+                linkId = entry["moveInfo"]["linkId"]
+                if linkId in self.par_move_dict:
+                    yield from self.doParMove(self.par_move_dict[linkId], entry)
+                    yield self.newline_result()
+                else:
+                    self.par_move_dict[entry["moveInfo"]["id"]] = entry
+                    # call doHighlightRange just to update the token indices
+                    offset = {
+                        "from": self.n_from_end_tokens,
+                        "to": entry["offset"]["to"],
+                    }
+                    res = self.doHighlightRange(
+                        entry["text"],
+                        entry["highlightRanges"],
+                        offset,
+                        entry["lineNumber"],
+                        update_idx="to",
+                    )
+                    list(res)
             else:
                 # The 'type' isn't one of the known
                 raise ValueError(d)
@@ -126,65 +180,84 @@ class DiffToOperationMap:
     # strictly increasing, while the "from" segments should merely be
     # non-overlapping.
+    def doEqual(self, equal_segment, offset, update_idx="all", type=str):
+        if type is str:
+            equal_bytes = equal_segment.encode()
+        elif type is bytes:
+            equal_bytes = equal_segment
+        else:
+            raise ValueError(equal_segment)
-    def doEqual(self, equal_text, offset):
-        equal_bytes = equal_text.encode()
         tokens = self.tokenize(equal_bytes)
         n_tokens = len(tokens)
-        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
-        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        n_from_end_tokens = self.n_from_start_tokens + n_tokens
+        n_to_end_tokens = self.n_to_start_tokens + n_tokens
         # we need to keep track of the to and from last end bytes
         self.from_last_end_bytes = offset["from"] + len(equal_bytes)
         self.to_last_end_bytes = offset["to"] + len(equal_bytes)
         yield (
             Equal(
                 self.n_from_start_tokens,
-                self.n_from_end_tokens,
+                n_from_end_tokens,
                 self.n_to_start_tokens,
-                self.n_to_end_tokens,
+                n_to_end_tokens,
             ),
             tokens,
             tokens,
         )
-        self.n_from_start_tokens += n_tokens
-        self.n_to_start_tokens += n_tokens
-        self.from_byte_token_index_map[
-            offset['from']
-        ] = self.n_from_end_tokens
-        self.to_byte_token_index_map[offset['to']] = self.n_to_end_tokens
+        if update_idx in ["from", "all"]:
+            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
 
-    def doInsert(self, insert_text, offset):
-        insert_bytes = insert_text.encode()
+        if update_idx in ["to", "all"]:
+            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
+
+        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
+        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
+
+    def doInsert(self, insert_segment, offset, update_idx="all", type=str):
+        if type is str:
+            insert_bytes = insert_segment.encode()
+        elif type is bytes:
+            insert_bytes = insert_segment
+        else:
+            raise ValueError(insert_segment)
         tokens = self.tokenize(insert_bytes)
         n_tokens = len(tokens)
-        self.n_to_end_tokens = self.n_to_start_tokens + n_tokens
+        n_to_end_tokens = self.n_to_start_tokens + n_tokens
         self.to_last_end_bytes = offset["to"] + len(insert_bytes)
         yield (
             Insert(
                 self.n_from_start_tokens,
                 self.n_from_start_tokens,
                 self.n_to_start_tokens,
-                self.n_to_end_tokens,
+                n_to_end_tokens,
             ),
             [],
             tokens,
         )
         # We have now used more of the "to" tokens.
-        self.n_to_start_tokens += n_tokens
+        if update_idx in ["to", "all"]:
+            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
 
-        self.to_byte_token_index_map[offset['to']] = self.n_to_end_tokens
+        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
 
-    def doDelete(self, delete_text, offset):
-        delete_bytes = delete_text.encode()
+    def doDelete(self, delete_segment, offset, update_idx="all", type=str):
+        if type is str:
+            delete_bytes = delete_segment.encode()
+        elif type is bytes:
+            delete_bytes = delete_segment
+        else:
+            raise ValueError(delete_segment)
         tokens = self.tokenize(delete_bytes)
         n_tokens = len(tokens)
-        self.n_from_end_tokens = self.n_from_start_tokens + n_tokens
+
+        n_from_end_tokens = self.n_from_start_tokens + n_tokens
         self.from_last_end_bytes = offset["from"] + len(delete_bytes)
         yield (
             Delete(
                 self.n_from_start_tokens,
-                self.n_from_end_tokens,
+                n_from_end_tokens,
                 self.n_to_start_tokens,
                 self.n_to_start_tokens,
             ),
@@ -192,19 +265,30 @@ class DiffToOperationMap:
             [],
         )
         # We have now used more of the "from" tokens.
-        self.n_from_start_tokens += n_tokens
+        if update_idx in ["from", "all"]:
+            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
 
-        self.from_byte_token_index_map[
-            offset['from']
-        ] = self.n_from_end_tokens
+        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
 
-    def doHighlightRange(self, highlight_bytes, highlightRanges, offset):
+    def doHighlightRange(
+        self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"
+    ):
         # The text field is an overlapping mix of both the from and to,
         # so we need to handle it highlight-by-highlight.
         # there can be gaps between highlight segments.
         # for instance, if a word is deleted from the middle of a line.
         # we need to track that.
+        highlight_bytes = highlight_text.encode()
         highlight_end = 0
+
+        # it's possible for offset['to'] to be null.
+        # we can get it from the line number?
+
+        if offset["to"] is None:
+            offset["to"] = self.from_byte_token_index_map[
+                find_greatest_le_key(lineNumber, self.from_byte_token_index_map)
+            ]
+
         highlight_offset = offset
 
         # note that diffs are token-level, but the indexes are byte-level
@@ -214,7 +298,9 @@ class DiffToOperationMap:
             if highlight_start > highlight_end:
                 equal_bytes = highlight_bytes[highlight_end:highlight_start]
                 n_equal_bytes = len(equal_bytes)
-                yield from self.doEqual(equal_bytes, highlight_offset)
+                yield from self.doEqual(
+                    equal_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["from"] += n_equal_bytes
                 highlight_offset["to"] += n_equal_bytes
 
@@ -222,11 +308,16 @@ class DiffToOperationMap:
             highlight_end = highlight_start + highlightRange["length"]
             range_bytes = highlight_bytes[highlight_start:highlight_end]
             n_range_bytes = len(range_bytes)
+
             if highlightRange["type"] == 0:
-                yield from self.doInsert(range_bytes, highlight_offset)
+                yield from self.doInsert(
+                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["to"] += n_range_bytes
             elif highlightRange["type"] == 1:
-                yield from self.doDelete(range_bytes, highlight_offset)
+                yield from self.doDelete(
+                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                )
                 highlight_offset["from"] += n_range_bytes
             else:
                 raise Exception(entry)
@@ -234,39 +325,48 @@ class DiffToOperationMap:
         # handle the rest of the line which is equal
         if highlight_end < len(highlight_bytes):
             range_bytes = highlight_bytes[highlight_end:]
-            yield from self.doEqual(range_bytes, highlight_offset)
-
-    def doParMove(self, to_diff):
+            yield from self.doEqual(range_bytes, highlight_offset, type=bytes)
+    def doParMove(self, from_diff, to_diff):
         # the tricky part here is to put the tokens in the right spots.
-        from_diff = self.par_move_dict[to_diff["moveInfo"]["linkId"]]
-
         from_byte_start = from_diff["offset"]["from"]
         # as of python 3.7 dictionaries are in insertion order. So
         # we can just find the first key that's greater
         # since the paragraph is removed in the "from" version, the index it is removed from
-        # will be *after* the
-        if from_byte_start >= self.from_last_end_bytes: # if the from paragraph is at the end
-            from_token_start = next(reversed(self.from_byte_token_index_map.values()))
+        # will be *after* the
+        if len(self.from_byte_token_index_map) > 0:
+            if (
+                from_byte_start >= self.from_last_end_bytes
+            ):  # if the from paragraph is at the end
+                from_token_start = next(
+                    reversed(self.from_byte_token_index_map.values())
+                )
+            else:
+                key = find_greatest_le_key(
+                    from_byte_start, self.from_byte_token_index_map
+                )
+                from_token_start = self.from_byte_token_index_map[key]
         else:
-            key = find_greatest_le_key(from_byte_start, self.from_byte_token_index_map)
-            from_token_start = self.from_byte_token_index_map[key]
+            from_token_start = 0
 
-        # get the to token index
-        to_byte_start = to_diff["offset"]["to"]
-        if to_byte_start >= self.to_last_end_bytes:
-            to_token_start = next(reversed(self.to_byte_token_index_map.values()))
+        if len(self.to_byte_token_index_map) > 0:
+            # get the to token index
+            to_byte_start = to_diff["offset"]["to"]
+            if to_byte_start >= self.to_last_end_bytes:
+                to_token_start = next(reversed(self.to_byte_token_index_map.values()))
+            else:
+                key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
+                to_token_start = self.to_byte_token_index_map[key]
         else:
-            key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
-            to_token_start = self.to_byte_token_index_map[key]
+            to_token_start = 0
 
         # now we set the state and apply the highlights
         self.n_from_start_tokens = self.n_from_end_tokens = from_token_start
         self.n_to_start_tokens = self.n_to_end_tokens = to_token_start
         offset = {"from": from_byte_start, "to": to_byte_start}
         yield from self.doHighlightRange(
-            to_diff["text"], to_diff["highlightRanges"], offset
+            to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
         )
@@ -295,23 +395,32 @@ class WikiDiffMatcher:
             # The diff has already been computed, but we need to incrementally
            # retrieve it to recreate the behavior DiffState expects.
             diff = next(self.diffs)
-            diffToOperationsMapper = DiffToOperationMap(
-                diff, self.tokenizer
-            )
-            (
+            diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)
+
+            diffops = list(zip(*diffToOperationsMapper.to_operations()))
+
+            if not diffops:
+                self.last_tokens = []
+                return [], [], []
+
+            diffops = (
                 operations,
                 aseq,
                 bseq,
-            ) = list(zip(*diffToOperationsMapper.to_operations()))
+            ) = diffops
 
             aseq = list(aseq)
-            # aseq can be out of order, we need to sort it by a1 index.
+            # aseq/bseq can be out of order, we need to sort it by a1/b1 index.
             indices = list(range(len(aseq)))
-            indices.sort(key = lambda i: operations[i].a1)
+            indices.sort(key=lambda i: operations[i].a1)
             aseq = [aseq[i] for i in indices]
-            # bseq cannot be out of order since diffs are resolved in the order of aseq.
+            bseq = list(bseq)
+            indices = list(range(len(bseq)))
+            indices.sort(key=lambda i: operations[i].b1)
+            bseq = [bseq[i] for i in indices]
+
             self.last_tokens = list(chain.from_iterable(aseq))
             tokens = list(chain.from_iterable(bseq))
            self.previous_text = text
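
For reference, a minimal usage sketch of the API these tests exercise follows. It is illustrative only and not part of the patch; it assumes the wikidiff2_api.php endpoint is being served locally (as the autouse fixture arranges with `php -S 127.0.0.1:8000 wikidiff2_api.php`) and that the test fixture files referenced above exist.

# Usage sketch (illustrative, not part of the patch).
# Assumption: wikidiff2_api.php is running locally, e.g.
#   php -S 127.0.0.1:8000 wikidiff2_api.php
from deltas import Delete, Equal, Insert
from wiki_diff_matcher import WikiDiffMatcher

rev1 = open("test/test_diff_revisions/test_complex_from").read()
rev2 = open("test/test_diff_revisions/test_complex_to").read()

# The matcher is constructed from the full revision list, as in the tests.
matcher = WikiDiffMatcher([rev1, rev2])
diff_processor = matcher.processor()       # deltas DiffEngine-style processor
diff_processor.process(rev1)               # prime the processor with the first revision
ops, a, b = diff_processor.process(rev2)   # ops: Equal/Insert/Delete; a, b: token sequences

n_equals = sum(isinstance(op, Equal) for op in ops)
n_inserts = sum(isinstance(op, Insert) for op in ops)
n_deletes = sum(isinstance(op, Delete) for op in ops)
print(f"{len(ops)} operations: {n_equals} equals, {n_inserts} inserts, {n_deletes} deletes")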