Compare pywikidiff2 to making HTTP requests to the wikidiff2 PHP server.

Nathan TeBlunthuis 2025-07-07 10:51:11 -07:00
parent 4654911533
commit 14e819e565
6 changed files with 339 additions and 259 deletions

View File

@ -8,4 +8,5 @@ wikidiff2.initial_split_threshold = 0.1
wikidiff2.final_split_threshold = 0.6
; It is possible this limit will need to be larger for some pages.
post_max_size = 1000M
post_max_size = 10000M
opcache.enable=0

View File

@ -12,6 +12,7 @@ dependencies = [
"mwtypes>=0.4.0",
"mwxml>=0.3.6",
"pyarrow>=20.0.0",
"pywikidiff2",
"sortedcontainers>=2.4.0",
"yamlconf>=0.2.6",
]
@ -20,6 +21,7 @@ dependencies = [
yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
deltas = { git = "https://github.com/groceryheist/deltas" }
pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidiff2" }
[dependency-groups]
dev = [
@ -27,4 +29,5 @@ dev = [
"pandas>=2.1.0",
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
"pytest-benchmark>=5.1.0",
]

View File

@ -1,2 +1,2 @@
#!/usr/bin/env bash
uv run pytest test/test_wiki_diff_matcher.py::test_addition --capture=tee-sys
uv run pytest test/test_wiki_diff_matcher.py --capture=tee-sys

View File

@ -1,6 +1,7 @@
# start the server
import asyncio
import subprocess
from itertools import chain
from functools import partial
import re
import pytest
@ -8,14 +9,13 @@ import pytest_asyncio
from typing import List
from deltas import Delete, Equal, Insert, wikitext_split
from mwpersistence import Token
from wiki_diff_matcher import WikiDiffMatcher
@pytest_asyncio.fixture(scope="module", autouse=True)
@pytest_asyncio.fixture(scope="module", autouse=False)
async def start_stop_server():
print("starting server")
proc = await asyncio.create_subprocess_exec("php", "-S", "127.0.0.1:8000",
"wikidiff2_api.php",
"wikidiff2_api.php", "-c", "php.ini",
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# php needs a moment to actually start
@ -26,16 +26,24 @@ async def start_stop_server():
stdout, stderr = await proc.communicate()
print(stdout.decode())
print(stderr.decode())
def _replace_whitespace(match):
if match.group(1): # If spaces matched (e.g., ' ')
return ' '
elif match.group(2): # If newlines matched (e.g., '\n\n')
return '\n'
elif match.group(3): # If tabs matched (e.g., '\t\t')
return '\t'
return '' # Should not be reached if pattern is comprehensive
def assert_equal_enough(tokens:List[Token], rev):
# the tokens exclude newlines
# we allow extra whitespace at the beginning or end
token_doc = ''.join(str(t) for t in tokens)
token_doc = re.sub(r'\s+', ' ', token_doc).strip()
rev = re.sub(r'\s+', ' ', rev).strip()
print(token_doc, file = open('token','w'))
print(rev, file = open('rev','w'))
token_doc = re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, token_doc).strip()
rev = re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, rev).strip()
assert token_doc == rev
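For illustration, a made-up input for the normalization above: each run of spaces, newlines, or tabs collapses to a single character of its own kind, instead of everything becoming a space as with the previous r'\s+' substitution:
assert re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, "a  b\n\n\nc\t\td") == "a b\nc\td"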
@ -136,6 +144,26 @@ def test_highlight_range():
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
def test_unmatched_parmoves():
rev1 = open("test/test_diff_revisions/test_unmatched_parmoves_from").read()
rev2 = open("test/test_diff_revisions/test_unmatched_parmoves_to").read()
matcher = WikiDiffMatcher([rev1,rev2])
diff_processor = matcher.processor()
diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
def test_bug_4():
rev1 = open("test/test_diff_revisions/test_bug_4_from").read()
rev2 = open("test/test_diff_revisions/test_bug_4_to").read()
matcher = WikiDiffMatcher([rev1,rev2])
diff_processor = matcher.processor()
diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(a, rev1)
assert_equal_enough(b, rev2)
def test_delete():
rev1 = open("test/test_diff_revisions/1295229484").read()
@ -295,18 +323,31 @@ def test_leading_whitespace():
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)
# def test_whitespace_2():
# rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
# rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
# matcher = WikiDiffMatcher([rev1,rev2])
# diff_processor = matcher.processor()
def test_whitespace_bug():
rev1 = open("test/test_diff_revisions/test_whitespace_bug_from").read()
rev2 = open("test/test_diff_revisions/test_whitespace_bug_to").read()
matcher = WikiDiffMatcher([rev1,rev2])
diff_processor = matcher.processor()
# # note that a and b are constructed from the diffs.
# # so they reflect the state of the text according to the diff processor
# ops, a, b = diff_processor.process(rev1)
# ops, a, b = diff_processor.process(rev2)
# assert_equal_enough(b, rev2)
# assert_equal_enough(a, rev1)
# note that a and b are constructed from the diffs.
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(b, rev2)
assert_equal_enough(a, rev1)
def test_bug_3():
rev1 = open("test/test_diff_revisions/test_bug_3_from").read()
rev2 = open("test/test_diff_revisions/test_bug_3_to").read()
matcher = WikiDiffMatcher([rev1,rev2])
diff_processor = matcher.processor()
# note that a and b are constructed from the diffs.
# so they reflect the state of the text according to the diff processor
ops, a, b = diff_processor.process(rev1)
ops, a, b = diff_processor.process(rev2)
assert_equal_enough(b, rev2)
#assert_equal_enough(a, rev1)
@ -326,15 +367,14 @@ def test_actually_equal():
assert_equal_enough(b, rev1)
assert_equal_enough(a, rev1)
# slow test
# slow test. comment out the following line to enable it.
@pytest.mark.skip
def test_diff_consistency():
from mwxml import Dump
stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/sailormoon.xml.7z", "*.xml"], stdout=subprocess.PIPE).stdout
dump = Dump.from_file(stream)
#stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/ikwiki-20180301-pages-meta-history.xml.bz2", "*.xml"], stdout=subprocess.PIPE).stdout
dump = Dump.from_file("test/dumps/ikwiki.xml")
for page in dump:
revisions = [rev.text for rev in page if rev.text]
matcher = WikiDiffMatcher(revisions)
diff_processor = matcher.processor()
last_rev = ""
@ -342,7 +382,44 @@ def test_diff_consistency():
print(rev, file=open("test_unicode_highlight_to",'w'))
print(last_rev, file=open("test_unicode_highlight_from",'w'))
ops, a, b = diff_processor.process(rev)
#assert_equal_enough(a, last_rev)
assert_equal_enough(a, last_rev)
assert_equal_enough(b, rev)
last_rev = rev
#@pytest.mark.skip
def test_benchmark_diff(benchmark):
from mwxml import Dump
dump = Dump.from_file("test/dumps/ikwiki.xml")
revs = chain.from_iterable([rev.text for rev in page] for page in dump)
def next_revs():
return [next(revs), next(revs)], {}
benchmark.pedantic(WikiDiffMatcher,setup=next_revs,iterations=1,rounds=1000, warmup_rounds=1)
def test_benchmark_diff_server(start_stop_server,benchmark):
from mwxml import Dump
dump = Dump.from_file("test/dumps/ikwiki.xml")
revs = chain.from_iterable([rev.text for rev in page] for page in dump)
def next_revs():
return [next(revs), next(revs)], {'server':True}
benchmark.pedantic(WikiDiffMatcher,setup=next_revs,iterations=1,rounds=1000, warmup_rounds=1)
@pytest.mark.skip
def test_diff_consistency_server():
from mwxml import Dump
#stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/ikwiki-20180301-pages-meta-history.xml.bz2", "*.xml"], stdout=subprocess.PIPE).stdout
dump = Dump.from_file("test/dumps/ikwiki.xml")
for page in dump:
revisions = [rev.text for rev in page if rev.text]
matcher = WikiDiffMatcher(revisions,server=True)
diff_processor = matcher.processor()
last_rev = ""
for rev in revisions:
print(rev, file=open("test_unicode_highlight_to",'w'))
print(last_rev, file=open("test_unicode_highlight_from",'w'))
ops, a, b = diff_processor.process(rev)
assert_equal_enough(a, last_rev)
assert_equal_enough(b, rev)
last_rev = rev

View File

@ -3,40 +3,18 @@ import sys
from collections import namedtuple
from itertools import chain
from typing import Dict, Generator, List, Optional, Tuple
from sortedcontainers import SortedDict
import requests
from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
RegexTokenizer, Token, tokenizers)
from sortedcontainers import SortedDict
TOKENIZER = tokenizers.wikitext_split
import pywikidiff2
differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
moved_paragraph_detection_cutoff=200000)
# def find_greatest_le_key(target_key, data_dict):
# found_key = None
# for key in data_dict: # Iterates over keys in insertion order (which is sorted)
# if key <= target_key:
# found_key = (
# key # This is the largest key found so far that satisfies the condition
# )
# else:
# # Since the dictionary is sorted, if key > target_key,
# # all subsequent keys will also be > target_key.
# return found_key or key
# def find_smallest_gt_key(target_key, data_dict):
# found_key = None
# for key in reversed(data_dict): # Iterates over keys in insertion order (which is sorted)
# if key >= target_key:
# found_key = (
# key # This is the largest key found so far that satisfies the condition
# )
# else:
# # Since the dictionary is sorted, if key > target_key,
# # all subsequent keys will also be > target_key.
# return found_key or key
def compute_diffs(url: str, texts: list[str]) -> list:
def compute_diffs_server(texts, url="http://127.0.0.1:8000"):
response = None
try:
response = requests.post(url, json=texts)
@ -63,166 +41,173 @@ def compute_diffs(url: str, texts: list[str]) -> list:
except requests.exceptions.RequestException as e:
print(f"An unexpected error occurred: {e}")
raise e
return incremental_diffs
def compute_diffs(texts: list[str]) -> list:
return differ.inline_json_diff_sequence(texts)
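A minimal sketch of the in-process path added here; the revision strings are made up, and compute_diffs() is assumed to return one inline-JSON diff string per consecutive pair of texts, which DiffToOperationMap below parses with json.loads:
revisions = ["first revision text\n", "second revision text\n", "third revision text\n"]
diffs = list(compute_diffs(revisions))   # one wikidiff2 inline-JSON string per consecutive pair (assumed)
entries = json.loads(diffs[0])["diff"]   # the list of diff entries walked by DiffToOperationMap.to_operations()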
class DiffToOperationMap:
def __init__(self, diff, tokenizer):
self.tokenizer = tokenizer
self.diff = json.loads(diff)
# the code below is designed to work in bytes because that's how wikidiff2 indexes
# self.from_last_end_bytes = 0
# self.from_last_to_bytes = 0
# self.n_from_start_tokens = 0
# self.n_from_end_tokens = 0
# self.n_from_start_tokens = 0
# self.n_to_start_tokens = 0
# self.from_last_end_bytes = 0
# self.to_last_end_bytes = 0
# keeps track of the number of tokens seen so far
# to avoid repeated tokenization
# self.from_byte_token_index_map: SortedDict[int, int] = SortedDict()
# self.to_byte_token_index_map: SortedDict[int, int] = SortedDict()
self.par_move_dict = {}
self.from_par_move_dict = {}
self.to_par_move_dict = {}
self.highlights_without_offset = []
# map line numbers to byte offsets so we can recover an offset when wikidiff2 omits it.
self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
# def get_token_offset(self, byte_offset):
# from_token_start = None
# to_token_start = None
# from_last_end_bytes = self.from_byte_token_index_map.keys()[-1]
# to_last_end_bytes = self.to_byte_token_index_map.keys()[-1]
# if byte_offset['from'] is not None:
# if byte_offset['from'] < self.from_byte_token_index_map.values()[0]:
# from_token_start = 0
# else:
# key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from'])
# # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below?
# if key > from_last_end_bytes:
# from_token_start = self.from_byte_token_index_map[from_last_end_bytes]
# else:
# from_token_
# if byte_offset['to'] is not None:
# if byte_offset['to'] < self.to_byte_token_index_map.values()[0]:
# to_token_start = 0
# else:
# key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to'])
# if key >= from
# if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0:
# if (
# byte_offset['from'] >= self.from_last_end_bytes
# ): # if the from paragraph is at the end
# from_token_start = next(
# reversed(self.from_byte_token_index_map.values())
# )
# else:
# key = find_greatest_le_key(
# byte_offset['from'], self.from_byte_token_index_map
# )
# from_token_start = self.from_byte_token_index_map[key]
# else:
# from_token_start = 0
# to_offset = None
# if byte_offset['to'] is not None:
# if len(self.to_byte_token_index_map) > 0:
# if to_byte_start >= self.to_last_end_bytes:
# to_token_start = next(reversed(self.to_byte_token_index_map.values()))
# else:
# key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map)
# to_token_start = self.to_byte_token_index_map[key]
# else:
# to_token_start = 0
# return {'from': from_token_start,
# 'to': to_token_start}
def tokenize(self, bytes):
return self.tokenizer.tokenize(bytes.decode("utf-8"))
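For orientation, each entry that to_operations() walks below has roughly the following shape; the field names are exactly the ones this class reads, while the concrete values (and the moveInfo ids) are purely illustrative:
example_entry = {
    "type": 3,                                  # 0 equal, 1 insert, 2 delete, 3 changed line, 4/5 paragraph moved (from/to side)
    "lineNumber": 12,
    "text": "the changed line",
    "offset": {"from": 480, "to": 512},         # byte offsets into the from/to revisions; either may be None
    "highlightRanges": [                        # byte ranges within text
        {"start": 4, "length": 7, "type": 0},   # 0 = insertion, 1 = deletion
    ],
    "moveInfo": {"id": "moved-1", "linkId": "moved-2"},  # only for types 4 and 5; links the two sides of a move
}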
def to_operations(self):
for entry in self.diff["diff"]:
# add back the newline
entry["text"] += "\n"
text = entry["text"]
offset = entry["offset"]
if offset["from"] and entry.get("lineNumber") is not None :
if entry['type'] in [0, 2, 3, 4]:
self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode())
if offset["to"]:
if entry['type'] in [0, 1, 3, 5]:
self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode())
# add back the newline
# this is the first byte of the line in the 'from' revision.
from_start_line = entry["offset"]["from"]
# this is the first byte of the line in the 'to' revision.
to_start_line = entry["offset"]["to"]
if entry["type"] == 0:
yield from self.doEqual(text, offset)
yield from self.doEqual(entry)
# a line included in the 'to' revision, but not in the 'from' revision
elif entry["type"] == 1:
yield from self.doInsert(text, offset)
yield from self.doInsert(entry)
# a line included in the 'from' revision, but not in the 'to' revision
elif entry["type"] == 2:
yield from self.doDelete(text, offset)
yield from self.doDelete(entry)
elif entry["type"] == 3:
yield from self.doHighlightRange(
text, entry["highlightRanges"], offset, entry["lineNumber"]
)
# sometimes wikidiff2 does not report a 'to' offset here; save these entries and apply them once the line-number map has been built.
if entry["offset"]["to"] is None:
self.highlights_without_offset.append(entry)
else:
yield from self.doHighlightRange(entry)
elif entry["type"] == 4:
self.par_move_dict[entry["moveInfo"]["id"]] = entry
linkId = entry["moveInfo"]["linkId"]
if linkId in self.par_move_dict:
yield from self.doParMove(entry, self.par_move_dict[linkId])
# we need to count the tokens in the from revision so token index is correct
# self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
# self.n_from_start_tokens += len(
# self.tokenize(entry["text"].encode())
# )
if linkId in self.to_par_move_dict:
yield from self.doParMove(entry, self.to_par_move_dict.pop(linkId))
else:
self.from_par_move_dict[entry["moveInfo"]["id"]] = entry
elif entry["type"] == 5:
linkId = entry["moveInfo"]["linkId"]
if linkId in self.par_move_dict:
yield from self.doParMove(self.par_move_dict[linkId], entry)
if linkId in self.from_par_move_dict:
yield from self.doParMove(
self.from_par_move_dict.pop(linkId), entry
)
else:
self.par_move_dict[entry["moveInfo"]["id"]] = entry
# call doHighlightRange just to update the token indices
# offset = {
# "from": self.n_from_end_tokens,
# "to": entry["offset"]["to"],
# }
# res = self.doHighlightRange(
# entry["text"],
# entry["highlightRanges"],
# offset,
# entry["lineNumber"],
# update_idx="to",
# )
# list(res)
# self.n_to_end_tokens += len(self.tokenize(entry["text"].encode()))
# self.n_to_start_tokens += len(
# self.tokenize(entry["text"].encode())
# )
self.to_par_move_dict[entry["moveInfo"]["id"]] = entry
else:
# The 'type' isn't one of the known entry types.
raise ValueError(entry)
# now we should be able to apply highlights
for entry in self.highlights_without_offset:
yield from self.doHighlightRange(entry)
if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
print("PROBLEM! Unmatched parmoves!")
print(self.from_par_move_dict)
print(self.to_par_move_dict)
# We can try to match them:
for lkey in list(self.from_par_move_dict.keys()):
for rkey in list(self.to_par_move_dict.keys()):
from_diff = self.from_par_move_dict[lkey]
to_diff = self.to_par_move_dict[rkey]
if self.match_parmoves_exact(from_diff, to_diff):
yield from self.doParMove(from_diff, to_diff)
del self.from_par_move_dict[lkey]
del self.to_par_move_dict[rkey]
break
# if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
# print("Couldn't find exact matches for all parmoves!")
# # we couldn't find all the matches via exact match
# # let's try matching based on line number instead
# lkeys_to_remove = []
# for lkey, from_diff in self.from_par_move_dict.items():
# from_linenum = from_diff["moveInfo"]["linkId"].split("_")[2]
# rkey_to_remove = None
# for rkey, to_diff in self.to_par_move_dict.items():
# to_linenum = rkey.split("_")[2]
# if from_linenum == to_linenum:
# print("Matching on line number")
# yield from self.doParMove(from_diff, to_diff)
# rkey_to_remove = rkey
# lkeys_to_remove.append(lkey)
# break
# if rkey_to_remove is not None:
# del self.to_par_move_dict[rkey_to_remove]
# for lkey in lkeys_to_remove:
# del self.from_par_move_dict[lkey]
# if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
# print("Couldn't find exact matches for all parmoves!")
# # we couldn't find all the matches via exact match or line number
# # let's try matching based on opIndex instead
# lkeys_to_remove = []
# for lkey, from_diff in self.from_par_move_dict.items():
# rkey_to_remove = None
# from_idx = from_diff["moveInfo"]["linkId"].split("_")[1]
# for rkey, to_diff in self.to_par_move_dict.items():
# to_idx = rkey.split("_")[1]
# print(from_idx)
# print(to_idx)
# if from_idx == to_idx:
# yield from self.doParMove(from_diff, to_diff)
# rkey_to_remove = rkey
# lkeys_to_remove.append(lkey)
# if rkey_to_remove is not None:
# del self.to_par_move_dict[rkey_to_remove]
# for lkey in lkeys_to_remove:
# del self.from_par_move_dict[lkey]
# for anything still unmatched, treat type 4 as a removal and type 5 as a highlight.
for from_diff in self.from_par_move_dict.values():
yield from self.doDelete(from_diff)
# we don't know the 'from' index here; we assume it has already been handled.
for to_diff in self.to_par_move_dict.values():
offset = {"from": 0, "to": None}
diffops = self.doHighlightRange(
{
"text": to_diff["text"],
"highlightRanges": to_diff["highlightRanges"],
'offset': offset,
'lineNumber': to_diff["lineNumber"],
}
)
diffops = [
(type(op)(None, None, op.b1, op.b2), [], bseq)
for op, _, bseq in diffops
if isinstance(op, Insert) or isinstance(op, Equal)
]
yield from diffops
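Each do* helper above yields an (operation, from_tokens, to_tokens) triple in which the operation still carries wikidiff2's byte offsets; Processor.process() below converts those into token indices. A minimal sketch of consuming the generator, where diff_json stands in for one inline-JSON document:
op_map = DiffToOperationMap(diff_json, TOKENIZER)
for op, from_tokens, to_tokens in op_map.to_operations():
    # op is a deltas Equal/Insert/Delete; a1 and b1 are still byte offsets at this stage
    print(type(op).__name__, op.a1, op.b1, len(from_tokens), len(to_tokens))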
def match_parmoves_exact(self, from_diff, to_diff):
ops, from_tokens, to_tokens = list(zip(*self.doParMove(from_diff, to_diff)))
from_text = "".join(chain.from_iterable(from_tokens))
# they match if applying the highlight ranges reproduces the 'from' paragraph's text exactly.
if from_text == from_diff["text"]:
print("MATCH FOUND")
return True
else:
print("NO MATCH")
print(len(from_text))
print(len(from_diff["text"]))
return False
# mwpersistence expects differences to be represented in order from the
# result's perspective ("to"), not the previous text. Thus, if a line
# is moved earlier then its insertion should appear before its deletion.
@ -230,12 +215,12 @@ class DiffToOperationMap:
# strictly increasing, while the "from" segments should merely be
# non-overlapping.
def doEqual(self, equal_segment, offset, update_idx="all"):
# if from_token_start is None:
# from_token_start = self.n_from_start_tokens
# if to_token_start is None:
# to_token_start = self.n_to_start_tokens
def doEqual(self, entry):
equal_segment, offset, lineNumber = (
entry["text"],
entry["offset"],
entry["lineNumber"],
)
if isinstance(equal_segment, str):
equal_bytes = equal_segment.encode()
elif isinstance(equal_segment, bytes):
@ -243,35 +228,28 @@ class DiffToOperationMap:
else:
raise ValueError(equal_segment)
self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(equal_bytes)
self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(equal_bytes)
tokens = self.tokenize(equal_bytes)
n_tokens = len(tokens)
# token_offset = self.get_token_offset(offset)
# n_from_end_tokens = token_offset['from'] + n_tokens
# n_to_end_tokens = token_offset['to'] + n_tokens
yield (
Equal(
offset['from'],
offset["from"],
None,
offset['to'],
offset["to"],
None,
),
tokens,
tokens,
)
# if update_idx in ["from", "all"]:
# self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
# if update_idx in ["to", "all"]:
# self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
# self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
# self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
def doInsert(self, insert_segment, offset, update_idx="all"):
def doInsert(self, entry):
insert_segment, offset, lineNumber = (
entry["text"],
entry["offset"],
entry["lineNumber"],
)
if isinstance(insert_segment, str):
insert_bytes = insert_segment.encode()
elif isinstance(insert_segment, bytes):
@ -279,23 +257,24 @@ class DiffToOperationMap:
else:
raise ValueError(insert_segment)
tokens = self.tokenize(insert_bytes)
# n_tokens = len(tokens)
# token_offset = self.get_token_offset(offset)
# n_to_end_tokens = token_offset['to'] + n_tokens
self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(insert_bytes)
yield (
Insert(
None,
None,
offset['to'],
offset["to"],
None,
),
[],
tokens,
)
# We have now used more of the "to" tokens.
#self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
def doDelete(self, delete_segment, offset, update_idx="all", type=str):
def doDelete(self, entry):
delete_segment, offset, lineNumber = (
entry["text"],
entry["offset"],
entry.get("lineNumber", None),
)
if isinstance(delete_segment, str):
delete_bytes = delete_segment.encode()
elif isinstance(delete_segment, bytes):
@ -303,26 +282,22 @@ class DiffToOperationMap:
else:
raise ValueError(delete_segment)
tokens = self.tokenize(delete_bytes)
# n_tokens = len(tokens)
# token_offset = self.get_token_offset(offset)
# n_from_end_tokens = token_offset['from'] + n_tokens
if lineNumber is not None:
self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes)
yield (
Delete(
offset['from'],
None,
None,
None
),
Delete(offset["from"], None, None, None),
tokens,
[],
)
#self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
def doHighlightRange(
self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"):
def doHighlightRange(self, entry):
highlight_text, highlightRanges, offset, lineNumber = (
entry["text"],
entry["highlightRanges"],
entry["offset"],
entry["lineNumber"],
)
# The text field is an overlapping mix of both the from and to,
# so we need to handle it highlight-by-highlight.
@ -334,15 +309,22 @@ class DiffToOperationMap:
# it's possible for offset['to'] to be null.
# we can get it from the line number?
update_linenumber_map = True
# this bit is a little hacky as it deals with idiosyncratic wikidiff2 behavior
if offset["to"] is None:
keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
if keyidx > 0:
print(self.to_linenumber_bytes_map)
key = self.to_linenumber_bytes_map.keys()[keyidx]
offset["to"] = self.to_linenumber_bytes_map[key]
# if the line already exists, we insert before it.
if lineNumber in self.to_linenumber_bytes_map:
keyidx = self.to_linenumber_bytes_map.bisect_left(lineNumber) - 1
else:
keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
key = None
if keyidx == -1:
offset["to"] = 0
elif len(self.to_linenumber_bytes_map.keys()) > 0:
key = self.to_linenumber_bytes_map.keys()[keyidx]
else:
key = 0
if key is not None:
offset["to"] = self.to_linenumber_bytes_map.get(key, 0)
highlight_offset = offset
# note that diffs are token-level, but the indexes are byte-level
@ -353,13 +335,16 @@ class DiffToOperationMap:
if highlight_start > highlight_end:
equal_bytes = highlight_bytes[highlight_end:highlight_start]
n_equal_bytes = len(equal_bytes)
yield from self.doEqual(
equal_bytes, highlight_offset, update_idx=update_idx
{
"text": equal_bytes,
"offset": highlight_offset,
"lineNumber": lineNumber,
}
)
highlight_offset["from"] += n_equal_bytes
highlight_offset["to"] += n_equal_bytes
if update_linenumber_map:
self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
# handle highlighted insert / delete
highlight_end = highlight_start + highlightRange["length"]
@ -368,14 +353,20 @@ class DiffToOperationMap:
if highlightRange["type"] == 0:
yield from self.doInsert(
range_bytes, highlight_offset, update_idx=update_idx
{
"text": range_bytes,
"offset": highlight_offset,
"lineNumber": lineNumber,
}
)
highlight_offset["to"] += n_range_bytes
if update_linenumber_map:
self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
elif highlightRange["type"] == 1:
yield from self.doDelete(
range_bytes, highlight_offset, update_idx=update_idx
{
"text": range_bytes,
"offset": highlight_offset,
"lineNumber": lineNumber,
}
)
highlight_offset["from"] += n_range_bytes
else:
@ -384,16 +375,25 @@ class DiffToOperationMap:
# handle the rest of the line which is equal
if highlight_end < len(highlight_bytes):
range_bytes = highlight_bytes[highlight_end:]
yield from self.doEqual(range_bytes, highlight_offset)
yield from self.doEqual(
{
"text": range_bytes,
"offset": highlight_offset,
"lineNumber": lineNumber,
}
)
def doParMove(self, from_diff, to_diff):
# the tricky part here is to put the tokens in the right spots.
from_byte_start = from_diff["offset"]["from"]
to_byte_start = to_diff["offset"]["to"]
offset = {"from": from_byte_start, "to": to_byte_start}
# we need to cache the indexes; replace them; then restore
yield from self.doHighlightRange(
to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
{
"text": to_diff["text"],
"highlightRanges": to_diff["highlightRanges"],
'offset': offset,
'lineNumber': to_diff["lineNumber"],
}
)
@ -403,9 +403,13 @@ class WikiDiffMatcher:
texts: list[str] = None,
tokenizer: Optional[RegexTokenizer] = None,
url: Optional[str] = "http://127.0.0.1:8000",
server=False
):
# Pre-compute diffs to reduce traffic overhead.
self.diffs = compute_diffs(url, texts)
if server is True:
self.diffs = list(compute_diffs_server(list(texts),url))
else:
self.diffs = list(compute_diffs(list(texts)))
self.tokenizer = tokenizer or TOKENIZER
class Processor(DiffEngine.Processor):
@ -429,36 +433,33 @@ class WikiDiffMatcher:
# this happens when revisions are actually equal.
if len(diffops) == 0:
self.last_tokens = self.tokenizer.tokenize(text)
ops = [Equal(0, len(self.last_tokens),
0, len(self.last_tokens))]
ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
return ops, self.last_tokens, self.last_tokens
# we get back the byte indices; now we transform to token indices
diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1))
aorder_ops = []
diffops.sort(
key=lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1)
)
aorder_ops = []
token_offset = 0
_, aseq, _ = list(zip( * diffops))
_, aseq, _ = list(zip(*diffops))
for op, tokens, _ in diffops:
a1 = token_offset
if isinstance(op, Equal) or isinstance(op, Delete):
token_offset += len(tokens)
a2 = token_offset
aorder_ops.append(type(op)(a1,
a2,
op.b1,
op.b1))
aorder_ops.append(type(op)(a1, a2, op.b1, op.b1))
else:
aorder_ops.append(Insert(a1,
a1,
op.b1,
op.b1))
aorder_ops.append(Insert(a1, a1, op.b1, op.b1))
_, aseq, bseq = zip(* diffops)
_, aseq, bseq = zip(*diffops)
diffops = list(zip(aorder_ops, aseq, bseq))
diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1))
_, _, bseq = list(zip(* diffops))
diffops.sort(
key=lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1)
)
_, _, bseq = list(zip(*diffops))
border_ops = []
token_offset = 0
for op, _, tokens in diffops:
@ -466,16 +467,10 @@ class WikiDiffMatcher:
if isinstance(op, Equal) or isinstance(op, Insert):
token_offset += len(tokens)
b2 = token_offset
border_ops.append(type(op)(op.a1,
op.a2,
b1,
b2))
border_ops.append(type(op)(op.a1, op.a2, b1, b2))
else:
border_ops.append(type(op)(op.a1,
op.a2,
b1,
b1))
border_ops.append(type(op)(op.a1, op.a2, b1, b1))
self.previous_text = text
self.last_tokens = list(chain.from_iterable(aseq))
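Putting the pieces together, the two code paths this commit benchmarks against each other are driven the same way the tests do; rev1 and rev2 stand for any two revision strings, and server=True assumes the PHP server from wikidiff2_api.php is running:
matcher = WikiDiffMatcher([rev1, rev2])                  # in-process pywikidiff2 (default)
# matcher = WikiDiffMatcher([rev1, rev2], server=True)   # HTTP requests to wikidiff2_api.php
diff_processor = matcher.processor()
diff_processor.process(rev1)
ops, a_tokens, b_tokens = diff_processor.process(rev2)   # token-level operations and token sequences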

View File

@ -1,5 +1,9 @@
<?php
header("Cache-Control: no-store, no-cache, must-revalidate, max-age=0");
header("Cache-Control: post-check=0, pre-check=0", false);
header("Pragma: no-cache");
// Launch this server with:
// php -S localhost:8000 -q -c php.ini
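On the server path, compute_diffs_server() POSTs the full list of revision texts as JSON to this endpoint, which is why php.ini raises post_max_size. A rough client sketch follows; the exact response format is not visible in this diff, so treating it as a JSON list of diffs is an assumption:
import requests

# assumes the server was started as in the test fixture:
#   php -S 127.0.0.1:8000 wikidiff2_api.php -c php.ini
revisions = ["first revision text\n", "second revision text\n"]   # made-up revisions
response = requests.post("http://127.0.0.1:8000", json=revisions)
response.raise_for_status()
diffs = response.json()   # assumed to be one wikidiff2 diff per consecutive pair of revisions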