compare pywikidiff2 to making requests to wikidiff2.

parent 4654911533
commit 14e819e565

php.ini: 3 changed lines
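This commit wires the pywikidiff2 in-process binding into WikiDiffMatcher alongside the existing path that POSTs revisions to a small PHP wikidiff2 endpoint, and adds pytest-benchmark tests to compare the two. A minimal sketch of the two call paths as they are set up further down; the revision strings are placeholders, not part of the commit:

    from wiki_diff_matcher import WikiDiffMatcher

    rev_a = "== Heading ==\nSome wikitext.\n"
    rev_b = "== Heading ==\nSome revised wikitext.\n"

    # in-process: diffs come from the pywikidiff2 binding loaded by wiki_diff_matcher
    matcher = WikiDiffMatcher([rev_a, rev_b])

    # over HTTP: diffs come from the PHP server started by the test fixture
    matcher_http = WikiDiffMatcher([rev_a, rev_b], server=True, url="http://127.0.0.1:8000")

    processor = matcher.processor()
    processor.process(rev_a)
    ops, a_tokens, b_tokens = processor.process(rev_b)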
@@ -8,4 +8,5 @@ wikidiff2.initial_split_threshold = 0.1
 wikidiff2.final_split_threshold = 0.6

 ; It is possible this limit will need to be larger for some pages.
-post_max_size = 1000M
+post_max_size = 10000M
+opcache.enable=0
@@ -12,6 +12,7 @@ dependencies = [
     "mwtypes>=0.4.0",
     "mwxml>=0.3.6",
     "pyarrow>=20.0.0",
+    "pywikidiff2",
     "sortedcontainers>=2.4.0",
     "yamlconf>=0.2.6",
 ]
@@ -20,6 +21,7 @@ dependencies = [
 yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
 mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
 deltas = { git = "https://github.com/groceryheist/deltas" }
+pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidiff2" }

 [dependency-groups]
 dev = [
@@ -27,4 +29,5 @@ dev = [
     "pandas>=2.1.0",
     "pytest>=8.4.1",
     "pytest-asyncio>=1.0.0",
+    "pytest-benchmark>=5.1.0",
 ]
@@ -1,2 +1,2 @@
 #!/usr/bin/env bash
-uv run pytest test/test_wiki_diff_matcher.py::test_addition --capture=tee-sys
+uv run pytest test/test_wiki_diff_matcher.py --capture=tee-sys
@@ -1,6 +1,7 @@
 # start the server
 import asyncio
 import subprocess
+from itertools import chain
 from functools import partial
 import re
 import pytest
@@ -8,14 +9,13 @@ import pytest_asyncio
 from typing import List
 from deltas import Delete, Equal, Insert, wikitext_split
 from mwpersistence import Token
-
 from wiki_diff_matcher import WikiDiffMatcher


-@pytest_asyncio.fixture(scope="module", autouse=True)
+@pytest_asyncio.fixture(scope="module", autouse=False)
 async def start_stop_server():
     print("starting server")
     proc = await asyncio.create_subprocess_exec("php", "-S", "127.0.0.1:8000",
-                                                "wikidiff2_api.php",
+                                                "wikidiff2_api.php", "-c", "php.ini",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)
     # php needs a moment to actually start
@@ -26,16 +26,24 @@ async def start_stop_server():
     stdout, stderr = await proc.communicate()
     print(stdout.decode())
     print(stderr.decode())


+def _replace_whitespace(match):
+    if match.group(1):  # If spaces matched (e.g., ' ')
+        return ' '
+    elif match.group(2):  # If newlines matched (e.g., '\n\n')
+        return '\n'
+    elif match.group(3):  # If tabs matched (e.g., '\t\t')
+        return '\t'
+    return ''  # Should not be reached if pattern is comprehensive
+
 def assert_equal_enough(tokens:List[Token], rev):
     # the tokens exclude newlines
     # we allow extra whitespace at the beginning or end
     token_doc = ''.join(str(t) for t in tokens)
-    token_doc = re.sub(r'\s+', ' ', token_doc).strip()
-    rev = re.sub(r'\s+', ' ', rev).strip()

     print(token_doc, file = open('token','w'))
     print(rev, file = open('rev','w'))
+    token_doc = re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, token_doc).strip()
+    rev = re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, rev).strip()
     assert token_doc == rev

@@ -136,6 +144,26 @@ def test_highlight_range():
     assert_equal_enough(a, rev1)
     assert_equal_enough(b, rev2)

+def test_unmatched_parmoves():
+    rev1 = open("test/test_diff_revisions/test_unmatched_parmoves_from").read()
+    rev2 = open("test/test_diff_revisions/test_unmatched_parmoves_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+def test_bug_4():
+    rev1 = open("test/test_diff_revisions/test_bug_4_from").read()
+    rev2 = open("test/test_diff_revisions/test_bug_4_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+

 def test_delete():
     rev1 = open("test/test_diff_revisions/1295229484").read()
@@ -295,18 +323,31 @@ def test_leading_whitespace():
     assert_equal_enough(b, rev2)
     assert_equal_enough(a, rev1)

-# def test_whitespace_2():
-#     rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
-#     rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
-#     matcher = WikiDiffMatcher([rev1,rev2])
-#     diff_processor = matcher.processor()
+def test_whitespace_bug():
+    rev1 = open("test/test_diff_revisions/test_whitespace_bug_from").read()
+    rev2 = open("test/test_diff_revisions/test_whitespace_bug_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()

-#     # note that a and b are constructed from the diffs.
-#     # so they reflect the state of the text according to the diff processor
-#     ops, a, b = diff_processor.process(rev1)
-#     ops, a, b = diff_processor.process(rev2)
-#     assert_equal_enough(b, rev2)
-#     assert_equal_enough(a, rev1)
+    # note that a and b are constructed from the diffs.
+    # so they reflect the state of the text according to the diff processor
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
+    assert_equal_enough(a, rev1)
+
+def test_bug_3():
+    rev1 = open("test/test_diff_revisions/test_bug_3_from").read()
+    rev2 = open("test/test_diff_revisions/test_bug_3_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+
+    # note that a and b are constructed from the diffs.
+    # so they reflect the state of the text according to the diff processor
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
+    #assert_equal_enough(a, rev1)


@@ -326,15 +367,14 @@ def test_actually_equal():
     assert_equal_enough(b, rev1)
     assert_equal_enough(a, rev1)

-# slow test
+# slow test. comment out the following line to enable it.
+@pytest.mark.skip
 def test_diff_consistency():
     from mwxml import Dump
-    stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/sailormoon.xml.7z", "*.xml"], stdout=subprocess.PIPE).stdout
-    dump = Dump.from_file(stream)
+    #stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/ikwiki-20180301-pages-meta-history.xml.bz2", "*.xml"], stdout=subprocess.PIPE).stdout
+    dump = Dump.from_file("test/dumps/ikwiki.xml")
     for page in dump:
         revisions = [rev.text for rev in page if rev.text]

         matcher = WikiDiffMatcher(revisions)
         diff_processor = matcher.processor()
         last_rev = ""
@@ -342,7 +382,44 @@ def test_diff_consistency():
             print(rev, file=open("test_unicode_highlight_to",'w'))
             print(last_rev, file=open("test_unicode_highlight_from",'w'))
             ops, a, b = diff_processor.process(rev)
-            #assert_equal_enough(a, last_rev)
+            assert_equal_enough(a, last_rev)
             assert_equal_enough(b, rev)
             last_rev = rev

+#@pytest.mark.skip
+def test_benchmark_diff(benchmark):
+    from mwxml import Dump
+    dump = Dump.from_file("test/dumps/ikwiki.xml")
+    revs = chain.from_iterable([rev.text for rev in page] for page in dump)
+    def next_revs():
+        return [next(revs), next(revs)], {}
+
+    benchmark.pedantic(WikiDiffMatcher,setup=next_revs,iterations=1,rounds=1000, warmup_rounds=1)
+
+def test_benchmark_diff_server(start_stop_server,benchmark):
+    from mwxml import Dump
+    dump = Dump.from_file("test/dumps/ikwiki.xml")
+    revs = chain.from_iterable([rev.text for rev in page] for page in dump)
+    def next_revs():
+        return [next(revs), next(revs)], {'server':True}
+
+    benchmark.pedantic(WikiDiffMatcher,setup=next_revs,iterations=1,rounds=1000, warmup_rounds=1)
+
+@pytest.mark.skip
+def test_diff_consistency_server():
+    from mwxml import Dump
+    #stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/ikwiki-20180301-pages-meta-history.xml.bz2", "*.xml"], stdout=subprocess.PIPE).stdout
+    dump = Dump.from_file("test/dumps/ikwiki.xml")
+    for page in dump:
+        revisions = [rev.text for rev in page if rev.text]
+        matcher = WikiDiffMatcher(revisions,server=True)
+        diff_processor = matcher.processor()
+        last_rev = ""
+        for rev in revisions:
+            print(rev, file=open("test_unicode_highlight_to",'w'))
+            print(last_rev, file=open("test_unicode_highlight_from",'w'))
+            ops, a, b = diff_processor.process(rev)
+            assert_equal_enough(a, last_rev)
+            assert_equal_enough(b, rev)
+            last_rev = rev
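The two benchmark tests added above, test_benchmark_diff and test_benchmark_diff_server, carry out the comparison named in the commit message: both feed consecutive revision pairs from the ikwiki dump into WikiDiffMatcher via benchmark.pedantic, one using the in-process differ and one with server=True. Assuming the dev dependencies above (including pytest-benchmark) and the test dump are in place, they can be run on their own with, for example, uv run pytest test/test_wiki_diff_matcher.py -k benchmark --benchmark-only.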
@@ -3,40 +3,18 @@ import sys
 from collections import namedtuple
 from itertools import chain
 from typing import Dict, Generator, List, Optional, Tuple
-from sortedcontainers import SortedDict
 import requests
 from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
                     RegexTokenizer, Token, tokenizers)
+from sortedcontainers import SortedDict

 TOKENIZER = tokenizers.wikitext_split
+import pywikidiff2
+differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
+                                 moved_paragraph_detection_cutoff=200000)

-# def find_greatest_le_key(target_key, data_dict):
-#     found_key = None
-#     for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
-#         if key <= target_key:
-#             found_key = (
-#                 key  # This is the largest key found so far that satisfies the condition
-#             )
-#         else:
-#             # Since the dictionary is sorted, if key > target_key,
-#             # all subsequent keys will also be > target_key.
-#             return found_key or key
-
-# def find_smallest_gt_key(target_key, data_dict):
-#     found_key = None
-#     for key in reversed(data_dict):  # Iterates over keys in insertion order (which is sorted)
-#         if key >= target_key:
-#             found_key = (
-#                 key  # This is the largest key found so far that satisfies the condition
-#             )
-#         else:
-#             # Since the dictionary is sorted, if key > target_key,
-#             # all subsequent keys will also be > target_key.
-#             return found_key or key
-
-
-def compute_diffs(url: str, texts: list[str]) -> list:
+def compute_diffs_server(texts, url="http://127.0.0.1:8000"):
     response = None
     try:
         response = requests.post(url, json=texts)
@@ -63,166 +41,173 @@ def compute_diffs(url: str, texts: list[str]) -> list:
     except requests.exceptions.RequestException as e:
         print(f"An unexpected error occurred: {e}")
         raise e

     return incremental_diffs


+def compute_diffs(texts: list[str]) -> list:
+    return differ.inline_json_diff_sequence(texts)
+
 class DiffToOperationMap:
     def __init__(self, diff, tokenizer):
         self.tokenizer = tokenizer
         self.diff = json.loads(diff)
-        # the code below is designed to work in bytes because that's how wikidiff2 indexes
-        # self.from_last_end_bytes = 0
-        # self.from_last_to_bytes = 0
-        # self.n_from_start_tokens = 0
-        # self.n_from_end_tokens = 0
-        # self.n_from_start_tokens = 0
-        # self.n_to_start_tokens = 0
-        # self.from_last_end_bytes = 0
-        # self.to_last_end_bytes = 0
-        # keeps track of the number of tokens seen so far
-        # to avoid repeated tokenization
-        # self.from_byte_token_index_map: SortedDict[int, int] = SortedDict()
-        # self.to_byte_token_index_map: SortedDict[int, int] = SortedDict()
-        self.par_move_dict = {}
+        self.from_par_move_dict = {}
+        self.to_par_move_dict = {}
+        self.highlights_without_offset = []

         # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
         self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
         self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()

-    # def get_token_offset(self, byte_offset):
-    #     from_token_start = None
-    #     to_token_start = None
-    #     from_last_end_bytes = self.from_byte_token_index_map.keys()[-1]
-    #     to_last_end_bytes = self.to_byte_token_index_map.keys()[-1]
-    #     if byte_offset['from'] is not None:
-    #         if byte_offset['from'] < self.from_byte_token_index_map.values()[0]:
-    #             from_token_start = 0
-    #         else:
-    #             key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from'])
-    #             # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below?
-    #             if key > from_last_end_bytes:
-    #                 from_token_start = self.from_byte_token_index_map[from_last_end_bytes]
-    #             else:
-    #                 from_token_
-    #     if byte_offset['to'] is not None:
-    #         if byte_offset['to'] < self.to_byte_token_index_map.values()[0]:
-    #             to_token_start = 0
-    #         else:
-    #             key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to'])
-    #             if key >= from
-    #     if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0:
-    #         if (
-    #             byte_offset['from'] >= self.from_last_end_bytes
-    #         ):  # if the from paragraph is at the end
-    #             from_token_start = next(
-    #                 reversed(self.from_byte_token_index_map.values())
-    #             )
-    #         else:
-    #             key = find_greatest_le_key(
-    #                 byte_offset['from'], self.from_byte_token_index_map
-    #             )
-    #             from_token_start = self.from_byte_token_index_map[key]
-    #     else:
-    #         from_token_start = 0
-
-    #     to_offset = None
-    #     if byte_offset['to'] is not None:
-    #         if len(self.to_byte_token_index_map) > 0:
-    #             if to_byte_start >= self.to_last_end_bytes:
-    #                 to_token_start = next(reversed(self.to_byte_token_index_map.values()))
-    #             else:
-    #                 key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map)
-    #                 to_token_start = self.to_byte_token_index_map[key]
-    #         else:
-    #             to_token_start = 0
-
-    #     return {'from': from_token_start,
-    #             'to': to_token_start}
-
     def tokenize(self, bytes):
         return self.tokenizer.tokenize(bytes.decode("utf-8"))

     def to_operations(self):
         for entry in self.diff["diff"]:
+            # add back the newline
             entry["text"] += "\n"
             text = entry["text"]
             offset = entry["offset"]
-            if offset["from"] and entry.get("lineNumber") is not None :
-                if entry['type'] in [0, 2, 3, 4]:
-                    self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode())
-
-            if offset["to"]:
-                if entry['type'] in [0, 1, 3, 5]:
-                    self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode())
-
-            # add back the newline

             # this is the first byte of the line in the 'from' revision.
             from_start_line = entry["offset"]["from"]
             # this is the first byte of the line in the 'to' revision.
             to_start_line = entry["offset"]["to"]

             if entry["type"] == 0:
-                yield from self.doEqual(text, offset)
+                yield from self.doEqual(entry)

             # a line included in the 'to' revision, but not in the 'from' revision
             elif entry["type"] == 1:
-                yield from self.doInsert(text, offset)
+                yield from self.doInsert(entry)

             # a line included in the 'from' revision, but not in the 'to' revision
             elif entry["type"] == 2:
-                yield from self.doDelete(text, offset)
+                yield from self.doDelete(entry)

             elif entry["type"] == 3:
-                yield from self.doHighlightRange(
-                    text, entry["highlightRanges"], offset, entry["lineNumber"]
-                )
+                # sometimes, for some reason we don't have a 'to' index here. we'll save these for later
+                if entry["offset"]["to"] is None:
+                    self.highlights_without_offset.append(entry)
+                else:
+                    yield from self.doHighlightRange(entry)

             elif entry["type"] == 4:
-                self.par_move_dict[entry["moveInfo"]["id"]] = entry

                 linkId = entry["moveInfo"]["linkId"]
-                if linkId in self.par_move_dict:
-                    yield from self.doParMove(entry, self.par_move_dict[linkId])
-
-                # we need to count the tokens in the from revision so token index is correct
-                # self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
-                # self.n_from_start_tokens += len(
-                #     self.tokenize(entry["text"].encode())
-                # )
+                if linkId in self.to_par_move_dict:
+                    yield from self.doParMove(entry, self.to_par_move_dict.pop(linkId))
+                else:
+                    self.from_par_move_dict[entry["moveInfo"]["id"]] = entry

             elif entry["type"] == 5:
                 linkId = entry["moveInfo"]["linkId"]
-                if linkId in self.par_move_dict:
-                    yield from self.doParMove(self.par_move_dict[linkId], entry)
+                if linkId in self.from_par_move_dict:
+                    yield from self.doParMove(
+                        self.from_par_move_dict.pop(linkId), entry
+                    )
                 else:
-                    self.par_move_dict[entry["moveInfo"]["id"]] = entry
-                    # call doHighlightRange just to update the token indices
-                    # offset = {
-                    #     "from": self.n_from_end_tokens,
-                    #     "to": entry["offset"]["to"],
-                    # }
-                    # res = self.doHighlightRange(
-                    #     entry["text"],
-                    #     entry["highlightRanges"],
-                    #     offset,
-                    #     entry["lineNumber"],
-                    #     update_idx="to",
-                    # )
-                    # list(res)
-                    # self.n_to_end_tokens += len(self.tokenize(entry["text"].encode()))
-                    # self.n_to_start_tokens += len(
-                    #     self.tokenize(entry["text"].encode())
-                    # )
+                    self.to_par_move_dict[entry["moveInfo"]["id"]] = entry

             else:
                 # The 'type' isn't one of the known
                 raise ValueError(d)

+        # now we should be able to apply highlights
+        for entry in self.highlights_without_offset:
+            yield from self.doHighlightRange(entry)
+
+        if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
+            print("PROBLEM! Unmatched parmoves!")
+            print(self.from_par_move_dict)
+            print(self.to_par_move_dict)
+            # We can try to match them:
+            for lkey in self.from_par_move_dict.keys():
+                for rkey in self.to_par_move_dict.keys():
+                    from_diff = self.from_par_move_dict[lkey]
+                    to_diff = self.to_par_move_dict[rkey]
+                    if self.match_parmoves_exact(from_diff, to_diff):
+                        yield from self.doParMove(from_diff, to_diff)
+                        del self.to_par_move_dict[lkey]
+                        del self.from_par_move_dict[rkey]
+                        break
+
+        # if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
+        #     print("Couldn't find exact matches for all parmoves!")
+        #     # we couldn't find all the matches via exact match
+        #     # let's try matching based on line number instead
+        #     lkeys_to_remove = []
+        #     for lkey, from_diff in self.from_par_move_dict.items():
+        #         from_linenum = from_diff["moveInfo"]["linkId"].split("_")[2]
+        #         rkey_to_remove = None
+        #         for rkey, to_diff in self.to_par_move_dict.items():
+        #             to_linenum = rkey.split("_")[2]
+        #             if from_linenum == to_linenum:
+        #                 print("Matching on line number")
+        #                 yield from self.doParMove(from_diff, to_diff)
+        #                 rkey_to_remove = rkey
+        #                 lkeys_to_remove.append(lkey)
+        #                 break
+        #         if rkey_to_remove is not None:
+        #             del self.to_par_move_dict[rkey_to_remove]
+        #     for lkey in lkeys_to_remove:
+        #         del self.from_par_move_dict[lkey]
+
+        # if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
+        #     print("Couldn't find exact matches for all parmoves!")
+        #     # we couldn't find all the matches via exact match or line number
+        #     # let's try matching based on opIndex instead
+        #     lkeys_to_remove = []
+        #     for lkey, from_diff in self.from_par_move_dict.items():
+        #         rkey_to_remove = None
+        #         from_idx = from_diff["moveInfo"]["linkId"].split("_")[1]
+        #         for rkey, to_diff in self.to_par_move_dict.items():
+        #             to_idx = rkey.split("_")[1]
+        #             print(from_idx)
+        #             print(to_idx)
+        #             if from_idx == to_idx:
+        #                 yield from self.doParMove(from_diff, to_diff)
+        #                 rkey_to_remove = rkey
+        #                 lkeys_to_remove.append(lkey)
+        #         if rkey_to_remove is not None:
+        #             del self.to_par_move_dict[rkey_to_remove]
+        #     for lkey in lkeys_to_remove:
+        #         del self.from_par_move_dict[lkey]
+
+        # we couldn't find matches. treat type 4 as removal and type 5 as highlight.
+        for from_diff in self.from_par_move_dict.values():
+            yield from self.doDelete(from_diff)
+
+        # only we don't know the from index; we assume its already handled.
+        for to_diff in self.to_par_move_dict.values():
+            offset["from"] = 0
+            offset["to"] = None
+            diffops = self.doHighlightRange(
+                {
+                    "text": to_diff["text"],
+                    "highlightRanges": to_diff["highlightRanges"],
+                    'offset': offset,
+                    'lineNumber': to_diff["lineNumber"],
+                }
+            )
+            diffops = [
+                (type(op)(None, None, op.b1, op.b2), [], bseq)
+                for op, _, bseq in diffops
+                if isinstance(op, Insert) or isinstance(op, Equal)
+            ]
+            yield from diffops
+
+    def match_parmoves_exact(self, from_diff, to_diff):
+        ops, from_tokens, to_tokens = list(zip(*self.doParMove(from_diff, to_diff)))
+        from_text = "".join(chain.from_iterable(from_tokens))
+        # we know they match if we apply the highlight ranges and the "from" tokens equal the lhs tokens.
+        if from_text == from_diff["text"]:
+            print("MATCH FOUND")
+            return True
+        else:
+            print("NO MATCH")
+            print(len(from_text))
+            print(len(from_diff["text"]))
+            return False

     # mwpersistence expects differences to be represented in order from the
     # result's perspective ("to"), not the previous text. Thus, if a line
     # is moved earlier then its insertion should appear before its deletion.
@@ -230,12 +215,12 @@ class DiffToOperationMap:
     # strictly increasing, while the "from" segments should merely be
    # non-overlapping.

-    def doEqual(self, equal_segment, offset, update_idx="all"):
-        # if from_token_start is None:
-        #     from_token_start = self.n_from_start_tokens
-        # if to_token_start is None:
-        #     to_token_start = self.n_to_start_tokens
+    def doEqual(self, entry):
+        equal_segment, offset, lineNumber = (
+            entry["text"],
+            entry["offset"],
+            entry["lineNumber"],
+        )
         if isinstance(equal_segment, str):
             equal_bytes = equal_segment.encode()
         elif isinstance(equal_segment, bytes):
@@ -243,35 +228,28 @@ class DiffToOperationMap:
         else:
             raise ValueError(equal_segment)

+        self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(equal_bytes)
+        self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(equal_bytes)
+
         tokens = self.tokenize(equal_bytes)
         n_tokens = len(tokens)

-        # token_offset = self.get_token_offset(offset)
-        # n_from_end_tokens = token_offset['from'] + n_tokens
-        # n_to_end_tokens = token_offset['to'] + n_tokens
         yield (
             Equal(
-                offset['from'],
+                offset["from"],
                 None,
-                offset['to'],
+                offset["to"],
                 None,
             ),
             tokens,
             tokens,
         )

-        # if update_idx in ["from", "all"]:
-        #     self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
-        # if update_idx in ["to", "all"]:
-        #     self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
-        # self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
-        # self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
-
-    def doInsert(self, insert_segment, offset, update_idx="all"):
+    def doInsert(self, entry):
+        insert_segment, offset, lineNumber = (
+            entry["text"],
+            entry["offset"],
+            entry["lineNumber"],
+        )
         if isinstance(insert_segment, str):
             insert_bytes = insert_segment.encode()
         elif isinstance(insert_segment, bytes):
@@ -279,23 +257,24 @@ class DiffToOperationMap:
         else:
             raise ValueError(insert_segment)
         tokens = self.tokenize(insert_bytes)
-        # n_tokens = len(tokens)
-        # token_offset = self.get_token_offset(offset)
-        # n_to_end_tokens = token_offset['to'] + n_tokens
+        self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(insert_bytes)

         yield (
             Insert(
                 None,
                 None,
-                offset['to'],
+                offset["to"],
                 None,
             ),
             [],
             tokens,
         )
-        # We have now used more of the "to" tokens.
-        #self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens

-    def doDelete(self, delete_segment, offset, update_idx="all", type=str):
+    def doDelete(self, entry):
+        delete_segment, offset, lineNumber = (
+            entry["text"],
+            entry["offset"],
+            entry.get("lineNumber", None),
+        )
         if isinstance(delete_segment, str):
             delete_bytes = delete_segment.encode()
         elif isinstance(delete_segment, bytes):
@@ -303,26 +282,22 @@ class DiffToOperationMap:
         else:
             raise ValueError(delete_segment)
         tokens = self.tokenize(delete_bytes)
-        # n_tokens = len(tokens)
-        # token_offset = self.get_token_offset(offset)
-        # n_from_end_tokens = token_offset['from'] + n_tokens
+        if lineNumber is not None:
+            self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes)

         yield (
-            Delete(
-                offset['from'],
-                None,
-                None,
-                None
-            ),
+            Delete(offset["from"], None, None, None),
             tokens,
             [],
         )

-        #self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
-
-    def doHighlightRange(
-        self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"):
+    def doHighlightRange(self, entry):
+        highlight_text, highlightRanges, offset, lineNumber = (
+            entry["text"],
+            entry["highlightRanges"],
+            entry["offset"],
+            entry["lineNumber"],
+        )

         # The text field is an overlapping mix of both the from and to,
         # so we need to handle it highlight-by-highlight.
@@ -334,15 +309,22 @@ class DiffToOperationMap:

         # it's possible for offset['to'] to be null.
         # we can get it from the line number?
-        update_linenumber_map = True
+        # this bit is a little hacky as it deals with ideosyncratic wikidiff2 behavior
         if offset["to"] is None:
-            keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
-            if keyidx > 0:
-                print(self.to_linenumber_bytes_map)
-                key = self.to_linenumber_bytes_map.keys()[keyidx]
-                offset["to"] = self.to_linenumber_bytes_map[key]
-            else:
-                offset["to"] = 0
+            # if the line already exists, we insert before it.
+            if lineNumber in self.to_linenumber_bytes_map:
+                keyidx = self.to_linenumber_bytes_map.bisect_left(lineNumber) - 1
+            else:
+                keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
+            key = None
+            if keyidx == -1:
+                offset["to"] = 0
+            elif len(self.to_linenumber_bytes_map.keys()) > 0:
+                key = self.to_linenumber_bytes_map.keys()[keyidx]
+            else:
+                key = 0
+            if key is not None:
+                offset["to"] = self.to_linenumber_bytes_map.get(key, 0)

         highlight_offset = offset
         # note that diffs are token-level, but the indexes are byte-level
@@ -353,13 +335,16 @@ class DiffToOperationMap:
             if highlight_start > highlight_end:
                 equal_bytes = highlight_bytes[highlight_end:highlight_start]
                 n_equal_bytes = len(equal_bytes)

                 yield from self.doEqual(
-                    equal_bytes, highlight_offset, update_idx=update_idx
+                    {
+                        "text": equal_bytes,
+                        "offset": highlight_offset,
+                        "lineNumber": lineNumber,
+                    }
                 )
                 highlight_offset["from"] += n_equal_bytes
                 highlight_offset["to"] += n_equal_bytes
-                if update_linenumber_map:
-                    self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']

             # handle highlighted insert / delete
             highlight_end = highlight_start + highlightRange["length"]
@@ -368,14 +353,20 @@ class DiffToOperationMap:

             if highlightRange["type"] == 0:
                 yield from self.doInsert(
-                    range_bytes, highlight_offset, update_idx=update_idx
+                    {
+                        "text": range_bytes,
+                        "offset": highlight_offset,
+                        "lineNumber": lineNumber,
+                    }
                 )
                 highlight_offset["to"] += n_range_bytes
-                if update_linenumber_map:
-                    self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
             elif highlightRange["type"] == 1:
                 yield from self.doDelete(
-                    range_bytes, highlight_offset, update_idx=update_idx
+                    {
+                        "text": range_bytes,
+                        "offset": highlight_offset,
+                        "lineNumber": lineNumber,
+                    }
                 )
                 highlight_offset["from"] += n_range_bytes
             else:
@@ -384,16 +375,25 @@ class DiffToOperationMap:
         # handle the rest of the line which is equal
         if highlight_end < len(highlight_bytes):
             range_bytes = highlight_bytes[highlight_end:]
-            yield from self.doEqual(range_bytes, highlight_offset)
+            yield from self.doEqual(
+                {
+                    "text": range_bytes,
+                    "offset": highlight_offset,
+                    "lineNumber": lineNumber,
+                }
+            )

     def doParMove(self, from_diff, to_diff):
-        # the tricky part here is to put the tokens in the right spots.
         from_byte_start = from_diff["offset"]["from"]
         to_byte_start = to_diff["offset"]["to"]
         offset = {"from": from_byte_start, "to": to_byte_start}
-        # we need to cache the indexes; replace them; then restore
         yield from self.doHighlightRange(
-            to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
+            {
+                "text": to_diff["text"],
+                "highlightRanges": to_diff["highlightRanges"],
+                'offset': offset,
+                'lineNumber': to_diff["lineNumber"],
+            }
         )

@@ -403,9 +403,13 @@ class WikiDiffMatcher:
         texts: list[str] = None,
         tokenizer: Optional[RegexTokenizer] = None,
         url: Optional[str] = "http://127.0.0.1:8000",
+        server=False
     ):
         # Pre-compute diffs to reduce traffic overhead.
-        self.diffs = compute_diffs(url, texts)
+        if server is True:
+            self.diffs = list(compute_diffs_server(list(texts),url))
+        else:
+            self.diffs = list(compute_diffs(list(texts)))
         self.tokenizer = tokenizer or TOKENIZER

     class Processor(DiffEngine.Processor):
@@ -429,36 +433,33 @@ class WikiDiffMatcher:
            # this happens when revisions are actually equal.
            if len(diffops) == 0:
                self.last_tokens = self.tokenizer.tokenize(text)
-                ops = [Equal(0, len(self.last_tokens),
-                             0, len(self.last_tokens))]
+                ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
                return ops, self.last_tokens, self.last_tokens

            # we get back the byte indices; now we transform to token indices
-            diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1))
+            diffops.sort(
+                key=lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1)
+            )
            aorder_ops = []
            token_offset = 0
-            _, aseq, _ = list(zip( * diffops))
+            _, aseq, _ = list(zip(*diffops))

            for op, tokens, _ in diffops:
                a1 = token_offset
                if isinstance(op, Equal) or isinstance(op, Delete):
                    token_offset += len(tokens)
                    a2 = token_offset
-                    aorder_ops.append(type(op)(a1,
-                                               a2,
-                                               op.b1,
-                                               op.b1))
+                    aorder_ops.append(type(op)(a1, a2, op.b1, op.b1))
                else:
-                    aorder_ops.append(Insert(a1,
-                                             a1,
-                                             op.b1,
-                                             op.b1))
+                    aorder_ops.append(Insert(a1, a1, op.b1, op.b1))

-            _, aseq, bseq = zip(* diffops)
+            _, aseq, bseq = zip(*diffops)
            diffops = list(zip(aorder_ops, aseq, bseq))
-            diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1))
-            _, _, bseq = list(zip(* diffops))
+            diffops.sort(
+                key=lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1)
+            )
+            _, _, bseq = list(zip(*diffops))
            border_ops = []
            token_offset = 0
            for op, _, tokens in diffops:
@@ -466,16 +467,10 @@ class WikiDiffMatcher:
                if isinstance(op, Equal) or isinstance(op, Insert):
                    token_offset += len(tokens)
                    b2 = token_offset
-                    border_ops.append(type(op)(op.a1,
-                                               op.a2,
-                                               b1,
-                                               b2))
+                    border_ops.append(type(op)(op.a1, op.a2, b1, b2))
                else:
-                    border_ops.append(type(op)(op.a1,
-                                               op.a2,
-                                               b1,
-                                               b1))
+                    border_ops.append(type(op)(op.a1, op.a2, b1, b1))

            self.previous_text = text

            self.last_tokens = list(chain.from_iterable(aseq))
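The in-process path added to this module reduces to a single call into the binding. A small sketch of using it directly, assuming pywikidiff2 exposes exactly the constructor and method imported above; the revisions are placeholders, and the return value is the sequence of JSON diff strings that DiffToOperationMap later parses with json.loads:

    import pywikidiff2

    differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
                                     moved_paragraph_detection_cutoff=200000)

    rev_a = "== Heading ==\nSome wikitext.\n"
    rev_b = "== Heading ==\nSome revised wikitext.\n"

    # diffs is a sequence of wikidiff2 JSON documents covering the input revisions
    diffs = differ.inline_json_diff_sequence([rev_a, rev_b])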
@@ -1,5 +1,9 @@
 <?php

+header("Cache-Control: no-store, no-cache, must-revalidate, max-age=0");
+header("Cache-Control: post-check=0, pre-check=0", false);
+header("Pragma: no-cache");
+
 // Launch this server with:
 // php -S localhost:8000 -q -c php.ini
