From 14e819e5650e7c09b97ab4d0cba777383b4218b0 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Mon, 7 Jul 2025 10:51:11 -0700 Subject: [PATCH] compare pywikidiff2 to making requests to wikidiff2. --- php.ini | 3 +- pyproject.toml | 3 + runtest.sh | 2 +- test/test_wiki_diff_matcher.py | 125 +++++++-- wiki_diff_matcher.py | 461 ++++++++++++++++----------------- wikidiff2_api.php | 4 + 6 files changed, 339 insertions(+), 259 deletions(-) diff --git a/php.ini b/php.ini index 1daa19c..f12e1d2 100644 --- a/php.ini +++ b/php.ini @@ -8,4 +8,5 @@ wikidiff2.initial_split_threshold = 0.1 wikidiff2.final_split_threshold = 0.6 ; It is possible this limit will need to be larger for some pages. -post_max_size = 1000M +post_max_size = 10000M +opcache.enable=0 diff --git a/pyproject.toml b/pyproject.toml index 4226341..19c04c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "mwtypes>=0.4.0", "mwxml>=0.3.6", "pyarrow>=20.0.0", + "pywikidiff2", "sortedcontainers>=2.4.0", "yamlconf>=0.2.6", ] @@ -20,6 +21,7 @@ dependencies = [ yamlconf = { git = "https://github.com/groceryheist/yamlconf" } mwxml = { git = "https://github.com/groceryheist/python-mwxml" } deltas = { git = "https://github.com/groceryheist/deltas" } +pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidiff2" } [dependency-groups] dev = [ @@ -27,4 +29,5 @@ dev = [ "pandas>=2.1.0", "pytest>=8.4.1", "pytest-asyncio>=1.0.0", + "pytest-benchmark>=5.1.0", ] diff --git a/runtest.sh b/runtest.sh index 6c2d25a..3f38038 100755 --- a/runtest.sh +++ b/runtest.sh @@ -1,2 +1,2 @@ #!/usr/bin/env bash -uv run pytest test/test_wiki_diff_matcher.py::test_addition --capture=tee-sys +uv run pytest test/test_wiki_diff_matcher.py --capture=tee-sys diff --git a/test/test_wiki_diff_matcher.py b/test/test_wiki_diff_matcher.py index 97d8140..0592e12 100644 --- a/test/test_wiki_diff_matcher.py +++ b/test/test_wiki_diff_matcher.py @@ -1,6 +1,7 @@ # start the server import asyncio import subprocess +from itertools import chain from functools import partial import re import pytest @@ -8,14 +9,13 @@ import pytest_asyncio from typing import List from deltas import Delete, Equal, Insert, wikitext_split from mwpersistence import Token - from wiki_diff_matcher import WikiDiffMatcher -@pytest_asyncio.fixture(scope="module", autouse=True) +@pytest_asyncio.fixture(scope="module", autouse=False) async def start_stop_server(): print("starting server") proc = await asyncio.create_subprocess_exec("php", "-S", "127.0.0.1:8000", - "wikidiff2_api.php", + "wikidiff2_api.php", "-c", "php.ini", stdout=subprocess.PIPE, stderr=subprocess.PIPE) # php needs a moment to actually start @@ -26,16 +26,24 @@ async def start_stop_server(): stdout, stderr = await proc.communicate() print(stdout.decode()) print(stderr.decode()) + +def _replace_whitespace(match): + if match.group(1): # If spaces matched (e.g., ' ') + return ' ' + elif match.group(2): # If newlines matched (e.g., '\n\n') + return '\n' + elif match.group(3): # If tabs matched (e.g., '\t\t') + return '\t' + return '' # Should not be reached if pattern is comprehensive def assert_equal_enough(tokens:List[Token], rev): # the tokens exclude newlines # we allow extra whitespace at the beginning or end token_doc = ''.join(str(t) for t in tokens) - token_doc = re.sub(r'\s+', ' ', token_doc).strip() - rev = re.sub(r'\s+', ' ', rev).strip() - print(token_doc, file = open('token','w')) print(rev, file = open('rev','w')) + token_doc = re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, 
token_doc).strip() + rev = re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, rev).strip() assert token_doc == rev @@ -136,6 +144,26 @@ def test_highlight_range(): assert_equal_enough(a, rev1) assert_equal_enough(b, rev2) +def test_unmatched_parmoves(): + rev1 = open("test/test_diff_revisions/test_unmatched_parmoves_from").read() + rev2 = open("test/test_diff_revisions/test_unmatched_parmoves_to").read() + matcher = WikiDiffMatcher([rev1,rev2]) + diff_processor = matcher.processor() + diff_processor.process(rev1) + ops, a, b = diff_processor.process(rev2) + assert_equal_enough(a, rev1) + assert_equal_enough(b, rev2) + +def test_bug_4(): + rev1 = open("test/test_diff_revisions/test_bug_4_from").read() + rev2 = open("test/test_diff_revisions/test_bug_4_to").read() + matcher = WikiDiffMatcher([rev1,rev2]) + diff_processor = matcher.processor() + diff_processor.process(rev1) + ops, a, b = diff_processor.process(rev2) + assert_equal_enough(a, rev1) + assert_equal_enough(b, rev2) + def test_delete(): rev1 = open("test/test_diff_revisions/1295229484").read() @@ -295,18 +323,31 @@ def test_leading_whitespace(): assert_equal_enough(b, rev2) assert_equal_enough(a, rev1) -# def test_whitespace_2(): -# rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read() -# rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read() -# matcher = WikiDiffMatcher([rev1,rev2]) -# diff_processor = matcher.processor() +def test_whitespace_bug(): + rev1 = open("test/test_diff_revisions/test_whitespace_bug_from").read() + rev2 = open("test/test_diff_revisions/test_whitespace_bug_to").read() + matcher = WikiDiffMatcher([rev1,rev2]) + diff_processor = matcher.processor() -# # note that a and b are constructed from the diffs. -# # so they reflect the state of the text according to the diff processor -# ops, a, b = diff_processor.process(rev1) -# ops, a, b = diff_processor.process(rev2) -# assert_equal_enough(b, rev2) -# assert_equal_enough(a, rev1) + # note that a and b are constructed from the diffs. + # so they reflect the state of the text according to the diff processor + ops, a, b = diff_processor.process(rev1) + ops, a, b = diff_processor.process(rev2) + assert_equal_enough(b, rev2) + assert_equal_enough(a, rev1) + +def test_bug_3(): + rev1 = open("test/test_diff_revisions/test_bug_3_from").read() + rev2 = open("test/test_diff_revisions/test_bug_3_to").read() + matcher = WikiDiffMatcher([rev1,rev2]) + diff_processor = matcher.processor() + + # note that a and b are constructed from the diffs. + # so they reflect the state of the text according to the diff processor + ops, a, b = diff_processor.process(rev1) + ops, a, b = diff_processor.process(rev2) + assert_equal_enough(b, rev2) + #assert_equal_enough(a, rev1) @@ -326,15 +367,14 @@ def test_actually_equal(): assert_equal_enough(b, rev1) assert_equal_enough(a, rev1) -# slow test +# slow test. comment out the following line to enable it. 
+@pytest.mark.skip def test_diff_consistency(): from mwxml import Dump - stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/sailormoon.xml.7z", "*.xml"], stdout=subprocess.PIPE).stdout - - dump = Dump.from_file(stream) + #stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/ikwiki-20180301-pages-meta-history.xml.bz2", "*.xml"], stdout=subprocess.PIPE).stdout + dump = Dump.from_file("test/dumps/ikwiki.xml") for page in dump: revisions = [rev.text for rev in page if rev.text] - matcher = WikiDiffMatcher(revisions) diff_processor = matcher.processor() last_rev = "" @@ -342,7 +382,44 @@ def test_diff_consistency(): print(rev, file=open("test_unicode_highlight_to",'w')) print(last_rev, file=open("test_unicode_highlight_from",'w')) ops, a, b = diff_processor.process(rev) - #assert_equal_enough(a, last_rev) - + assert_equal_enough(a, last_rev) assert_equal_enough(b, rev) last_rev = rev + +#@pytest.mark.skip +def test_benchmark_diff(benchmark): + from mwxml import Dump + dump = Dump.from_file("test/dumps/ikwiki.xml") + revs = chain.from_iterable([rev.text for rev in page] for page in dump) + def next_revs(): + return [next(revs), next(revs)], {} + + benchmark.pedantic(WikiDiffMatcher,setup=next_revs,iterations=1,rounds=1000, warmup_rounds=1) + +def test_benchmark_diff_server(start_stop_server,benchmark): + from mwxml import Dump + dump = Dump.from_file("test/dumps/ikwiki.xml") + revs = chain.from_iterable([rev.text for rev in page] for page in dump) + def next_revs(): + return [next(revs), next(revs)], {'server':True} + + benchmark.pedantic(WikiDiffMatcher,setup=next_revs,iterations=1,rounds=1000, warmup_rounds=1) + +@pytest.mark.skip +def test_diff_consistency_server(): + from mwxml import Dump + #stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/ikwiki-20180301-pages-meta-history.xml.bz2", "*.xml"], stdout=subprocess.PIPE).stdout + dump = Dump.from_file("test/dumps/ikwiki.xml") + for page in dump: + revisions = [rev.text for rev in page if rev.text] + matcher = WikiDiffMatcher(revisions,server=True) + diff_processor = matcher.processor() + last_rev = "" + for rev in revisions: + print(rev, file=open("test_unicode_highlight_to",'w')) + print(last_rev, file=open("test_unicode_highlight_from",'w')) + ops, a, b = diff_processor.process(rev) + assert_equal_enough(a, last_rev) + assert_equal_enough(b, rev) + last_rev = rev + diff --git a/wiki_diff_matcher.py b/wiki_diff_matcher.py index 8c5aa68..1ab5935 100644 --- a/wiki_diff_matcher.py +++ b/wiki_diff_matcher.py @@ -3,40 +3,18 @@ import sys from collections import namedtuple from itertools import chain from typing import Dict, Generator, List, Optional, Tuple -from sortedcontainers import SortedDict + import requests from deltas import (Delete, DiffEngine, Equal, Insert, Operation, RegexTokenizer, Token, tokenizers) +from sortedcontainers import SortedDict TOKENIZER = tokenizers.wikitext_split +import pywikidiff2 +differ = pywikidiff2.pywikidiff2(numContextLines=1000000, + moved_paragraph_detection_cutoff=200000) -# def find_greatest_le_key(target_key, data_dict): -# found_key = None -# for key in data_dict: # Iterates over keys in insertion order (which is sorted) -# if key <= target_key: -# found_key = ( -# key # This is the largest key found so far that satisfies the condition -# ) -# else: -# # Since the dictionary is sorted, if key > target_key, -# # all subsequent keys will also be > target_key. 
-# return found_key or key - -# def find_smallest_gt_key(target_key, data_dict): -# found_key = None -# for key in reversed(data_dict): # Iterates over keys in insertion order (which is sorted) -# if key >= target_key: -# found_key = ( -# key # This is the largest key found so far that satisfies the condition -# ) -# else: -# # Since the dictionary is sorted, if key > target_key, -# # all subsequent keys will also be > target_key. -# return found_key or key - - - -def compute_diffs(url: str, texts: list[str]) -> list: +def compute_diffs_server(texts, url="http://127.0.0.1:8000"): response = None try: response = requests.post(url, json=texts) @@ -63,166 +41,173 @@ def compute_diffs(url: str, texts: list[str]) -> list: except requests.exceptions.RequestException as e: print(f"An unexpected error occurred: {e}") raise e - return incremental_diffs + +def compute_diffs(texts: list[str]) -> list: + return differ.inline_json_diff_sequence(texts) class DiffToOperationMap: def __init__(self, diff, tokenizer): self.tokenizer = tokenizer self.diff = json.loads(diff) - - # the code below is designed to work in bytes because that's how wikidiff2 indexes - # self.from_last_end_bytes = 0 - # self.from_last_to_bytes = 0 - # self.n_from_start_tokens = 0 - # self.n_from_end_tokens = 0 - # self.n_from_start_tokens = 0 - # self.n_to_start_tokens = 0 - # self.from_last_end_bytes = 0 - # self.to_last_end_bytes = 0 - # keeps track of the number of tokens seen so far - # to avoid repeated tokenization - # self.from_byte_token_index_map: SortedDict[int, int] = SortedDict() - # self.to_byte_token_index_map: SortedDict[int, int] = SortedDict() - self.par_move_dict = {} - + self.from_par_move_dict = {} + self.to_par_move_dict = {} + self.highlights_without_offset = [] # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets. self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict() self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict() - # def get_token_offset(self, byte_offset): - # from_token_start = None - # to_token_start = None - # from_last_end_bytes = self.from_byte_token_index_map.keys()[-1] - # to_last_end_bytes = self.to_byte_token_index_map.keys()[-1] - # if byte_offset['from'] is not None: - # if byte_offset['from'] < self.from_byte_token_index_map.values()[0]: - # from_token_start = 0 - # else: - # key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from']) - # # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below? 
- # if key > from_last_end_bytes: - # from_token_start = self.from_byte_token_index_map[from_last_end_bytes] - # else: - # from_token_ - # if byte_offset['to'] is not None: - # if byte_offset['to'] < self.to_byte_token_index_map.values()[0]: - # to_token_start = 0 - # else: - # key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to']) - # if key >= from - # if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0: - # if ( - # byte_offset['from'] >= self.from_last_end_bytes - # ): # if the from paragraph is at the end - # from_token_start = next( - # reversed(self.from_byte_token_index_map.values()) - # ) - # else: - # key = find_greatest_le_key( - # byte_offset['from'], self.from_byte_token_index_map - # ) - # from_token_start = self.from_byte_token_index_map[key] - # else: - # from_token_start = 0 - - # to_offset = None - # if byte_offset['to'] is not None: - # if len(self.to_byte_token_index_map) > 0: - # if to_byte_start >= self.to_last_end_bytes: - # to_token_start = next(reversed(self.to_byte_token_index_map.values())) - # else: - # key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map) - # to_token_start = self.to_byte_token_index_map[key] - # else: - # to_token_start = 0 - - # return {'from': from_token_start, - # 'to': to_token_start} def tokenize(self, bytes): return self.tokenizer.tokenize(bytes.decode("utf-8")) def to_operations(self): - for entry in self.diff["diff"]: - + # add back the newline entry["text"] += "\n" text = entry["text"] offset = entry["offset"] - if offset["from"] and entry.get("lineNumber") is not None : - if entry['type'] in [0, 2, 3, 4]: - self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode()) - - if offset["to"]: - if entry['type'] in [0, 1, 3, 5]: - self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode()) - - - # add back the newline - # this is the first byte of the line in the 'from' revision. from_start_line = entry["offset"]["from"] # this is the first byte of the line in the 'to' revision. to_start_line = entry["offset"]["to"] - + if entry["type"] == 0: - yield from self.doEqual(text, offset) + yield from self.doEqual(entry) # a line included in the 'to' revision, but not in the 'from' revision elif entry["type"] == 1: - yield from self.doInsert(text, offset) + yield from self.doInsert(entry) # a line included in the 'from' revision, but not in the 'to' revision elif entry["type"] == 2: - yield from self.doDelete(text, offset) + yield from self.doDelete(entry) elif entry["type"] == 3: - yield from self.doHighlightRange( - text, entry["highlightRanges"], offset, entry["lineNumber"] - ) + # sometimes, for some reason we don't have a 'to' index here. 
we'll save these for later + if entry["offset"]["to"] is None: + self.highlights_without_offset.append(entry) + else: + yield from self.doHighlightRange(entry) elif entry["type"] == 4: - self.par_move_dict[entry["moveInfo"]["id"]] = entry - linkId = entry["moveInfo"]["linkId"] - if linkId in self.par_move_dict: - yield from self.doParMove(entry, self.par_move_dict[linkId]) - # we need to count the tokens in the from revision so token index is correct - # self.n_from_end_tokens += len(self.tokenize(entry["text"].encode())) - # self.n_from_start_tokens += len( - # self.tokenize(entry["text"].encode()) - # ) - + if linkId in self.to_par_move_dict: + yield from self.doParMove(entry, self.to_par_move_dict.pop(linkId)) + else: + self.from_par_move_dict[entry["moveInfo"]["id"]] = entry + elif entry["type"] == 5: linkId = entry["moveInfo"]["linkId"] - if linkId in self.par_move_dict: - yield from self.doParMove(self.par_move_dict[linkId], entry) + if linkId in self.from_par_move_dict: + yield from self.doParMove( + self.from_par_move_dict.pop(linkId), entry + ) else: - self.par_move_dict[entry["moveInfo"]["id"]] = entry - # call doHighlightRange just to update the token indices - # offset = { - # "from": self.n_from_end_tokens, - # "to": entry["offset"]["to"], - # } - # res = self.doHighlightRange( - # entry["text"], - # entry["highlightRanges"], - # offset, - # entry["lineNumber"], - # update_idx="to", - # ) - # list(res) - # self.n_to_end_tokens += len(self.tokenize(entry["text"].encode())) - # self.n_to_start_tokens += len( - # self.tokenize(entry["text"].encode()) - # ) - + self.to_par_move_dict[entry["moveInfo"]["id"]] = entry else: # The 'type' isn't one of the known raise ValueError(d) + # now we should be able to apply highlights + + for entry in self.highlights_without_offset: + yield from self.doHighlightRange(entry) + + if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0: + print("PROBLEM! 
Unmatched parmoves!") + print(self.from_par_move_dict) + print(self.to_par_move_dict) + # We can try to match them: + for lkey in self.from_par_move_dict.keys(): + for rkey in self.to_par_move_dict.keys(): + from_diff = self.from_par_move_dict[lkey] + to_diff = self.to_par_move_dict[rkey] + if self.match_parmoves_exact(from_diff, to_diff): + yield from self.doParMove(from_diff, to_diff) + del self.to_par_move_dict[lkey] + del self.from_par_move_dict[rkey] + break + + # if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0: + # print("Couldn't find exact matches for all parmoves!") + # # we couldn't find all the matches via exact match + # # let's try matching based on line number instead + # lkeys_to_remove = [] + # for lkey, from_diff in self.from_par_move_dict.items(): + # from_linenum = from_diff["moveInfo"]["linkId"].split("_")[2] + # rkey_to_remove = None + # for rkey, to_diff in self.to_par_move_dict.items(): + # to_linenum = rkey.split("_")[2] + # if from_linenum == to_linenum: + # print("Matching on line number") + # yield from self.doParMove(from_diff, to_diff) + # rkey_to_remove = rkey + # lkeys_to_remove.append(lkey) + # break + # if rkey_to_remove is not None: + # del self.to_par_move_dict[rkey_to_remove] + # for lkey in lkeys_to_remove: + # del self.from_par_move_dict[lkey] + + # if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0: + # print("Couldn't find exact matches for all parmoves!") + # # we couldn't find all the matches via exact match or line number + # # let's try matching based on opIndex instead + # lkeys_to_remove = [] + # for lkey, from_diff in self.from_par_move_dict.items(): + # rkey_to_remove = None + # from_idx = from_diff["moveInfo"]["linkId"].split("_")[1] + # for rkey, to_diff in self.to_par_move_dict.items(): + # to_idx = rkey.split("_")[1] + # print(from_idx) + # print(to_idx) + # if from_idx == to_idx: + # yield from self.doParMove(from_diff, to_diff) + # rkey_to_remove = rkey + # lkeys_to_remove.append(lkey) + # if rkey_to_remove is not None: + # del self.to_par_move_dict[rkey_to_remove] + # for lkey in lkeys_to_remove: + # del self.from_par_move_dict[lkey] + + # we couldn't find matches. treat type 4 as removal and type 5 as highlight. + for from_diff in self.from_par_move_dict.values(): + yield from self.doDelete(from_diff) + + # only we don't know the from index; we assume its already handled. + for to_diff in self.to_par_move_dict.values(): + offset["from"] = 0 + offset["to"] = None + diffops = self.doHighlightRange( + { + "text": to_diff["text"], + "highlightRanges": to_diff["highlightRanges"], + 'offset': offset, + 'lineNumber': to_diff["lineNumber"], + } + ) + diffops = [ + (type(op)(None, None, op.b1, op.b2), [], bseq) + for op, _, bseq in diffops + if isinstance(op, Insert) or isinstance(op, Equal) + ] + yield from diffops + + def match_parmoves_exact(self, from_diff, to_diff): + ops, from_tokens, to_tokens = list(zip(*self.doParMove(from_diff, to_diff))) + from_text = "".join(chain.from_iterable(from_tokens)) + # we know they match if we apply the highlight ranges and the "from" tokens equal the lhs tokens. + if from_text == from_diff["text"]: + print("MATCH FOUND") + return True + else: + print("NO MATCH") + print(len(from_text)) + print(len(from_diff["text"])) + return False + # mwpersistence expects differences to be represented in order from the # result's perspective ("to"), not the previous text. Thus, if a line # is moved earlier then its insertion should appear before its deletion. 
@@ -230,12 +215,12 @@ class DiffToOperationMap: # strictly increasing, while the "from" segments should merely be # non-overlapping. - def doEqual(self, equal_segment, offset, update_idx="all"): - # if from_token_start is None: - # from_token_start = self.n_from_start_tokens - # if to_token_start is None: - # to_token_start = self.n_to_start_tokens - + def doEqual(self, entry): + equal_segment, offset, lineNumber = ( + entry["text"], + entry["offset"], + entry["lineNumber"], + ) if isinstance(equal_segment, str): equal_bytes = equal_segment.encode() elif isinstance(equal_segment, bytes): @@ -243,35 +228,28 @@ class DiffToOperationMap: else: raise ValueError(equal_segment) + self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(equal_bytes) + self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(equal_bytes) + tokens = self.tokenize(equal_bytes) n_tokens = len(tokens) - - # token_offset = self.get_token_offset(offset) - - # n_from_end_tokens = token_offset['from'] + n_tokens - # n_to_end_tokens = token_offset['to'] + n_tokens - yield ( Equal( - offset['from'], + offset["from"], None, - offset['to'], + offset["to"], None, ), tokens, tokens, ) - # if update_idx in ["from", "all"]: - # self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens - - # if update_idx in ["to", "all"]: - # self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens - - # self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens - # self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens - - def doInsert(self, insert_segment, offset, update_idx="all"): + def doInsert(self, entry): + insert_segment, offset, lineNumber = ( + entry["text"], + entry["offset"], + entry["lineNumber"], + ) if isinstance(insert_segment, str): insert_bytes = insert_segment.encode() elif isinstance(insert_segment, bytes): @@ -279,23 +257,24 @@ class DiffToOperationMap: else: raise ValueError(insert_segment) tokens = self.tokenize(insert_bytes) - # n_tokens = len(tokens) - # token_offset = self.get_token_offset(offset) - # n_to_end_tokens = token_offset['to'] + n_tokens + self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(insert_bytes) yield ( Insert( None, None, - offset['to'], + offset["to"], None, ), [], tokens, ) - # We have now used more of the "to" tokens. 
- #self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens - def doDelete(self, delete_segment, offset, update_idx="all", type=str): + def doDelete(self, entry): + delete_segment, offset, lineNumber = ( + entry["text"], + entry["offset"], + entry.get("lineNumber", None), + ) if isinstance(delete_segment, str): delete_bytes = delete_segment.encode() elif isinstance(delete_segment, bytes): @@ -303,26 +282,22 @@ class DiffToOperationMap: else: raise ValueError(delete_segment) tokens = self.tokenize(delete_bytes) - # n_tokens = len(tokens) - - # token_offset = self.get_token_offset(offset) - # n_from_end_tokens = token_offset['from'] + n_tokens + if lineNumber is not None: + self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes) yield ( - Delete( - offset['from'], - None, - None, - None - ), + Delete(offset["from"], None, None, None), tokens, [], ) - #self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens - - def doHighlightRange( - self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"): + def doHighlightRange(self, entry): + highlight_text, highlightRanges, offset, lineNumber = ( + entry["text"], + entry["highlightRanges"], + entry["offset"], + entry["lineNumber"], + ) # The text field is an overlapping mix of both the from and to, # so we need to handle it highlight-by-highlight. @@ -334,15 +309,22 @@ class DiffToOperationMap: # it's possible for offset['to'] to be null. # we can get it from the line number? - update_linenumber_map = True + # this bit is a little hacky as it deals with ideosyncratic wikidiff2 behavior if offset["to"] is None: - keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1 - if keyidx > 0: - print(self.to_linenumber_bytes_map) - key = self.to_linenumber_bytes_map.keys()[keyidx] - offset["to"] = self.to_linenumber_bytes_map[key] + # if the line already exists, we insert before it. 
+ if lineNumber in self.to_linenumber_bytes_map: + keyidx = self.to_linenumber_bytes_map.bisect_left(lineNumber) - 1 else: + keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1 + key = None + if keyidx == -1: offset["to"] = 0 + elif len(self.to_linenumber_bytes_map.keys()) > 0: + key = self.to_linenumber_bytes_map.keys()[keyidx] + else: + key = 0 + if key is not None: + offset["to"] = self.to_linenumber_bytes_map.get(key, 0) highlight_offset = offset # note that diffs are token-level, but the indexes are byte-level @@ -353,13 +335,16 @@ class DiffToOperationMap: if highlight_start > highlight_end: equal_bytes = highlight_bytes[highlight_end:highlight_start] n_equal_bytes = len(equal_bytes) + yield from self.doEqual( - equal_bytes, highlight_offset, update_idx=update_idx + { + "text": equal_bytes, + "offset": highlight_offset, + "lineNumber": lineNumber, + } ) highlight_offset["from"] += n_equal_bytes highlight_offset["to"] += n_equal_bytes - if update_linenumber_map: - self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to'] # handle highlighted insert / delete highlight_end = highlight_start + highlightRange["length"] @@ -368,14 +353,20 @@ class DiffToOperationMap: if highlightRange["type"] == 0: yield from self.doInsert( - range_bytes, highlight_offset, update_idx=update_idx + { + "text": range_bytes, + "offset": highlight_offset, + "lineNumber": lineNumber, + } ) highlight_offset["to"] += n_range_bytes - if update_linenumber_map: - self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to'] elif highlightRange["type"] == 1: yield from self.doDelete( - range_bytes, highlight_offset, update_idx=update_idx + { + "text": range_bytes, + "offset": highlight_offset, + "lineNumber": lineNumber, + } ) highlight_offset["from"] += n_range_bytes else: @@ -384,16 +375,25 @@ class DiffToOperationMap: # handle the rest of the line which is equal if highlight_end < len(highlight_bytes): range_bytes = highlight_bytes[highlight_end:] - yield from self.doEqual(range_bytes, highlight_offset) + yield from self.doEqual( + { + "text": range_bytes, + "offset": highlight_offset, + "lineNumber": lineNumber, + } + ) def doParMove(self, from_diff, to_diff): - # the tricky part here is to put the tokens in the right spots. from_byte_start = from_diff["offset"]["from"] to_byte_start = to_diff["offset"]["to"] offset = {"from": from_byte_start, "to": to_byte_start} - # we need to cache the indexes; replace them; then restore yield from self.doHighlightRange( - to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"] + { + "text": to_diff["text"], + "highlightRanges": to_diff["highlightRanges"], + 'offset': offset, + 'lineNumber': to_diff["lineNumber"], + } ) @@ -403,9 +403,13 @@ class WikiDiffMatcher: texts: list[str] = None, tokenizer: Optional[RegexTokenizer] = None, url: Optional[str] = "http://127.0.0.1:8000", + server=False ): # Pre-compute diffs to reduce traffic overhead. - self.diffs = compute_diffs(url, texts) + if server is True: + self.diffs = list(compute_diffs_server(list(texts),url)) + else: + self.diffs = list(compute_diffs(list(texts))) self.tokenizer = tokenizer or TOKENIZER class Processor(DiffEngine.Processor): @@ -429,36 +433,33 @@ class WikiDiffMatcher: # this happens when revisions are actually equal. 
if len(diffops) == 0: self.last_tokens = self.tokenizer.tokenize(text) - ops = [Equal(0, len(self.last_tokens), - 0, len(self.last_tokens))] + ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))] return ops, self.last_tokens, self.last_tokens # we get back the byte indices; now we transform to token indices - diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1)) - aorder_ops = [] + diffops.sort( + key=lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1) + ) + aorder_ops = [] token_offset = 0 - _, aseq, _ = list(zip( * diffops)) + _, aseq, _ = list(zip(*diffops)) for op, tokens, _ in diffops: a1 = token_offset if isinstance(op, Equal) or isinstance(op, Delete): token_offset += len(tokens) a2 = token_offset - aorder_ops.append(type(op)(a1, - a2, - op.b1, - op.b1)) + aorder_ops.append(type(op)(a1, a2, op.b1, op.b1)) else: - aorder_ops.append(Insert(a1, - a1, - op.b1, - op.b1)) + aorder_ops.append(Insert(a1, a1, op.b1, op.b1)) - _, aseq, bseq = zip(* diffops) + _, aseq, bseq = zip(*diffops) diffops = list(zip(aorder_ops, aseq, bseq)) - diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1)) - _, _, bseq = list(zip(* diffops)) + diffops.sort( + key=lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1) + ) + _, _, bseq = list(zip(*diffops)) border_ops = [] token_offset = 0 for op, _, tokens in diffops: @@ -466,16 +467,10 @@ class WikiDiffMatcher: if isinstance(op, Equal) or isinstance(op, Insert): token_offset += len(tokens) b2 = token_offset - border_ops.append(type(op)(op.a1, - op.a2, - b1, - b2)) + border_ops.append(type(op)(op.a1, op.a2, b1, b2)) else: - border_ops.append(type(op)(op.a1, - op.a2, - b1, - b1)) - + border_ops.append(type(op)(op.a1, op.a2, b1, b1)) + self.previous_text = text self.last_tokens = list(chain.from_iterable(aseq)) diff --git a/wikidiff2_api.php b/wikidiff2_api.php index a588b1c..0ee5add 100644 --- a/wikidiff2_api.php +++ b/wikidiff2_api.php @@ -1,5 +1,9 @@
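
---
Reviewer note (not part of the patch): the sketch below shows how the two code paths introduced by this change are selected. WikiDiffMatcher, the server flag, and the processor()/process() calls are taken from the diff above; the two revision strings are hypothetical. The default path runs diffs in-process through the new pywikidiff2 bindings, while server=True keeps the previous behavior of POSTing revisions to the PHP wikidiff2_api.php endpoint, which is what the added pytest-benchmark tests compare.

    # Minimal usage sketch, assuming the module layout shown in this patch.
    from wiki_diff_matcher import WikiDiffMatcher

    # Hypothetical revisions; real callers pass page revision texts from a dump.
    rev1 = "Lorem ipsum dolor sit amet.\n"
    rev2 = "Lorem ipsum dolor sit amet, consectetur.\n"

    # Default path: in-process diffs via the pywikidiff2 bindings.
    matcher = WikiDiffMatcher([rev1, rev2])

    # Alternative path: HTTP requests to the PHP wikidiff2 endpoint. This
    # assumes the server is running, e.g. as started by the start_stop_server
    # fixture: php -S 127.0.0.1:8000 wikidiff2_api.php -c php.ini
    # matcher = WikiDiffMatcher([rev1, rev2], server=True)

    processor = matcher.processor()
    processor.process(rev1)  # prime the processor with the first revision
    ops, a_tokens, b_tokens = processor.process(rev2)
    print(ops)

As in the tests, a_tokens and b_tokens are reconstructed from the diff operations, so asserting they match the input revisions (assert_equal_enough) checks that the byte-offset-to-token-index translation is consistent for both back-ends.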