compare pywikidiff2 to making requests to wikidiff2.
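
This commit adds the pywikidiff2 Python binding as an in-process alternative to the existing approach of POSTing revision texts to a local PHP wrapper (wikidiff2_api.php), and adds pytest-benchmark tests that compare the two paths. The changes touch php.ini, the project dependencies, the test runner script, test/test_wiki_diff_matcher.py, the WikiDiffMatcher module, and wikidiff2_api.php. Roughly, the two back-ends look like this (a sketch based on the calls in this diff; the names and parameters come from the changed code, and the server's response handling is elided here):

    import pywikidiff2
    import requests

    texts = ["first revision\n", "second revision\n"]  # placeholder revisions

    # In-process: call wikidiff2 directly through the pywikidiff2 binding.
    differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
                                     moved_paragraph_detection_cutoff=200000)
    diffs = differ.inline_json_diff_sequence(texts)

    # Over HTTP: POST the revisions to the PHP wrapper, started with
    #   php -S 127.0.0.1:8000 wikidiff2_api.php -c php.ini
    response = requests.post("http://127.0.0.1:8000", json=texts)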
parent 4654911533
commit 14e819e565
@@ -8,4 +8,5 @@ wikidiff2.initial_split_threshold = 0.1
 wikidiff2.final_split_threshold = 0.6
 
 ; It is possible this limit will need to be larger for some pages.
-post_max_size = 1000M
+post_max_size = 10000M
+opcache.enable=0
@@ -12,6 +12,7 @@ dependencies = [
     "mwtypes>=0.4.0",
     "mwxml>=0.3.6",
     "pyarrow>=20.0.0",
+    "pywikidiff2",
     "sortedcontainers>=2.4.0",
     "yamlconf>=0.2.6",
 ]
@@ -20,6 +21,7 @@ dependencies = [
 yamlconf = { git = "https://github.com/groceryheist/yamlconf" }
 mwxml = { git = "https://github.com/groceryheist/python-mwxml" }
 deltas = { git = "https://github.com/groceryheist/deltas" }
+pywikidiff2 = { git = "https://gitea.communitydata.science/groceryheist/pywikidiff2" }
 
 [dependency-groups]
 dev = [
@@ -27,4 +29,5 @@ dev = [
     "pandas>=2.1.0",
     "pytest>=8.4.1",
     "pytest-asyncio>=1.0.0",
+    "pytest-benchmark>=5.1.0",
 ]
@@ -1,2 +1,2 @@
 #!/usr/bin/env bash
-uv run pytest test/test_wiki_diff_matcher.py::test_addition --capture=tee-sys
+uv run pytest test/test_wiki_diff_matcher.py --capture=tee-sys
@@ -1,6 +1,7 @@
 # start the server
 import asyncio
 import subprocess
+from itertools import chain
 from functools import partial
 import re
 import pytest
@@ -8,14 +9,13 @@ import pytest_asyncio
 from typing import List
 from deltas import Delete, Equal, Insert, wikitext_split
 from mwpersistence import Token
-
 from wiki_diff_matcher import WikiDiffMatcher
 
 
-@pytest_asyncio.fixture(scope="module", autouse=True)
+@pytest_asyncio.fixture(scope="module", autouse=False)
 async def start_stop_server():
     print("starting server")
     proc = await asyncio.create_subprocess_exec("php", "-S", "127.0.0.1:8000",
-                                                "wikidiff2_api.php",
+                                                "wikidiff2_api.php", "-c", "php.ini",
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)
     # php needs a moment to actually start
@@ -26,16 +26,24 @@ async def start_stop_server():
     stdout, stderr = await proc.communicate()
     print(stdout.decode())
     print(stderr.decode())
+
+def _replace_whitespace(match):
+    if match.group(1):  # If spaces matched (e.g., '  ')
+        return ' '
+    elif match.group(2): # If newlines matched (e.g., '\n\n')
+        return '\n'
+    elif match.group(3): # If tabs matched (e.g., '\t\t')
+        return '\t'
+    return '' # Should not be reached if pattern is comprehensive
     
 def assert_equal_enough(tokens:List[Token], rev):
     # the tokens exclude newlines
     # we allow extra whitespace at the beginning or end
     token_doc = ''.join(str(t) for t in tokens)
-    token_doc = re.sub(r'\s+', ' ', token_doc).strip()
-    rev = re.sub(r'\s+', ' ', rev).strip()
-
     print(token_doc, file = open('token','w'))
     print(rev, file = open('rev','w'))
+    token_doc = re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, token_doc).strip()
+    rev = re.sub(r'( +)|(\n+)|(\t+)', _replace_whitespace, rev).strip()
     assert token_doc == rev
 
 
@@ -136,6 +144,26 @@ def test_highlight_range():
     assert_equal_enough(a, rev1)
     assert_equal_enough(b, rev2)
 
+def test_unmatched_parmoves():
+    rev1 = open("test/test_diff_revisions/test_unmatched_parmoves_from").read()
+    rev2 = open("test/test_diff_revisions/test_unmatched_parmoves_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
+def test_bug_4():
+    rev1 = open("test/test_diff_revisions/test_bug_4_from").read()
+    rev2 = open("test/test_diff_revisions/test_bug_4_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+    diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(a, rev1)
+    assert_equal_enough(b, rev2)
+
 
 def test_delete():
     rev1 = open("test/test_diff_revisions/1295229484").read()
@@ -295,18 +323,31 @@ def test_leading_whitespace():
     assert_equal_enough(b, rev2)
     assert_equal_enough(a, rev1)
 
-# def test_whitespace_2():
-#     rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
-#     rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
-#     matcher = WikiDiffMatcher([rev1,rev2])
-#     diff_processor = matcher.processor()
-
-#     # note that a and b are constructed from the diffs.
-#     # so they reflect the state of the text according to the diff processor
-#     ops, a, b = diff_processor.process(rev1)
-#     ops, a, b = diff_processor.process(rev2)
-#     assert_equal_enough(b, rev2)
-#     assert_equal_enough(a, rev1)
+def test_whitespace_bug():
+    rev1 = open("test/test_diff_revisions/test_whitespace_bug_from").read()
+    rev2 = open("test/test_diff_revisions/test_whitespace_bug_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+
+    # note that a and b are constructed from the diffs.
+    # so they reflect the state of the text according to the diff processor
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
+    assert_equal_enough(a, rev1)
+
+def test_bug_3():
+    rev1 = open("test/test_diff_revisions/test_bug_3_from").read()
+    rev2 = open("test/test_diff_revisions/test_bug_3_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+
+    # note that a and b are constructed from the diffs.
+    # so they reflect the state of the text according to the diff processor
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
+    #assert_equal_enough(a, rev1)
 
 
 
@@ -326,15 +367,14 @@ def test_actually_equal():
     assert_equal_enough(b, rev1)
     assert_equal_enough(a, rev1)
     
-# slow test
+# slow test. comment out the following line to enable it.
+@pytest.mark.skip
 def test_diff_consistency():
     from mwxml import Dump
-    stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/sailormoon.xml.7z", "*.xml"], stdout=subprocess.PIPE).stdout
-
-    dump = Dump.from_file(stream)
+    #stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/ikwiki-20180301-pages-meta-history.xml.bz2", "*.xml"], stdout=subprocess.PIPE).stdout
+    dump = Dump.from_file("test/dumps/ikwiki.xml")
     for page in dump:
         revisions = [rev.text for rev in page if rev.text]
-
         matcher = WikiDiffMatcher(revisions)
         diff_processor = matcher.processor()
         last_rev = ""
@@ -342,7 +382,44 @@ def test_diff_consistency():
             print(rev, file=open("test_unicode_highlight_to",'w'))
             print(last_rev, file=open("test_unicode_highlight_from",'w'))
             ops, a, b = diff_processor.process(rev)
-            #assert_equal_enough(a, last_rev)
-
+            assert_equal_enough(a, last_rev)
             assert_equal_enough(b, rev)
             last_rev = rev
+
+#@pytest.mark.skip
+def test_benchmark_diff(benchmark):
+    from mwxml import Dump
+    dump = Dump.from_file("test/dumps/ikwiki.xml")
+    revs = chain.from_iterable([rev.text for rev in page] for page in dump)
+    def next_revs():
+        return [next(revs), next(revs)], {}
+
+    benchmark.pedantic(WikiDiffMatcher,setup=next_revs,iterations=1,rounds=1000, warmup_rounds=1)
+
+def test_benchmark_diff_server(start_stop_server,benchmark):
+    from mwxml import Dump
+    dump = Dump.from_file("test/dumps/ikwiki.xml")
+    revs = chain.from_iterable([rev.text for rev in page] for page in dump)
+    def next_revs():
+        return [next(revs), next(revs)], {'server':True}
+
+    benchmark.pedantic(WikiDiffMatcher,setup=next_revs,iterations=1,rounds=1000, warmup_rounds=1)
+
+@pytest.mark.skip
+def test_diff_consistency_server():
+    from mwxml import Dump
+    #stream = subprocess.Popen(["7za", "x", "-so", "test/dumps/ikwiki-20180301-pages-meta-history.xml.bz2", "*.xml"], stdout=subprocess.PIPE).stdout
+    dump = Dump.from_file("test/dumps/ikwiki.xml")
+    for page in dump:
+        revisions = [rev.text for rev in page if rev.text]
+        matcher = WikiDiffMatcher(revisions,server=True)
+        diff_processor = matcher.processor()
+        last_rev = ""
+        for rev in revisions:
+            print(rev, file=open("test_unicode_highlight_to",'w'))
+            print(last_rev, file=open("test_unicode_highlight_from",'w'))
+            ops, a, b = diff_processor.process(rev)
+            assert_equal_enough(a, last_rev)
+            assert_equal_enough(b, rev)
+            last_rev = rev
+
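
The two test_benchmark_* tests above use pytest-benchmark's pedantic mode: the setup callable runs before every round, outside the timed region, and must return an (args, kwargs) pair that is unpacked into the benchmarked callable, so each round times one WikiDiffMatcher construction over freshly drawn revisions. A minimal, self-contained illustration of that pattern (dummy data rather than the wiki dump used above):

    import itertools

    def test_benchmark_pairs(benchmark):
        revs = itertools.cycle(["a b c\n", "a b d\n", "a e d\n"])  # stand-in revisions

        def next_revs():
            # runs untimed before each round; the return value is unpacked
            # as (*args, **kwargs) when the target is called
            return [next(revs), next(revs)], {}

        # the target receives the two revisions as positional arguments
        benchmark.pedantic(lambda a, b: (a, b), setup=next_revs,
                           iterations=1, rounds=10, warmup_rounds=1)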
@@ -3,40 +3,18 @@ import sys
 from collections import namedtuple
 from itertools import chain
 from typing import Dict, Generator, List, Optional, Tuple
-from sortedcontainers import SortedDict
+
 import requests
 from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
                     RegexTokenizer, Token, tokenizers)
+from sortedcontainers import SortedDict
 
 
 TOKENIZER = tokenizers.wikitext_split
+import pywikidiff2
+differ = pywikidiff2.pywikidiff2(numContextLines=1000000,
+                                 moved_paragraph_detection_cutoff=200000)
 
 
-# def find_greatest_le_key(target_key, data_dict):
-#     found_key = None
-#     for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
-#         if key <= target_key:
-#             found_key = (
-#                 key  # This is the largest key found so far that satisfies the condition
-#             )
-#         else:
-#             # Since the dictionary is sorted, if key > target_key,
-#             # all subsequent keys will also be > target_key.
-#             return found_key or key
-
-# def find_smallest_gt_key(target_key, data_dict):
-#     found_key = None
-#     for key in reversed(data_dict):  # Iterates over keys in insertion order (which is sorted)
-#         if key >= target_key:
-#             found_key = (
-#                 key  # This is the largest key found so far that satisfies the condition
-#             )
-#         else:
-#             # Since the dictionary is sorted, if key > target_key,
-#             # all subsequent keys will also be > target_key.
-#             return found_key or key
-
-
-
-def compute_diffs(url: str, texts: list[str]) -> list:
+def compute_diffs_server(texts, url="http://127.0.0.1:8000"):
     response = None
     try:
         response = requests.post(url, json=texts)
@@ -63,166 +41,173 @@ def compute_diffs(url: str, texts: list[str]) -> list:
     except requests.exceptions.RequestException as e:
         print(f"An unexpected error occurred: {e}")
         raise e
-
     return incremental_diffs
 
 
+def compute_diffs(texts: list[str]) -> list:
+    return differ.inline_json_diff_sequence(texts)
 
 class DiffToOperationMap:
     def __init__(self, diff, tokenizer):
         self.tokenizer = tokenizer
         self.diff = json.loads(diff)
-
-        # the code below is designed to work in bytes because that's how wikidiff2 indexes
-        # self.from_last_end_bytes = 0
-        # self.from_last_to_bytes = 0
-        # self.n_from_start_tokens = 0
-        # self.n_from_end_tokens = 0
-        # self.n_from_start_tokens = 0
-        # self.n_to_start_tokens = 0
-        # self.from_last_end_bytes = 0
-        # self.to_last_end_bytes = 0
-        # keeps track of the number of tokens seen so far
-        # to avoid repeated tokenization
-        # self.from_byte_token_index_map: SortedDict[int, int] = SortedDict()
-        # self.to_byte_token_index_map: SortedDict[int, int] = SortedDict()
-        self.par_move_dict = {}
-
+        self.from_par_move_dict = {}
+        self.to_par_move_dict = {}
+        self.highlights_without_offset = []
         # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
         self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
         self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
-    # def get_token_offset(self, byte_offset):
-    #     from_token_start = None
-    #     to_token_start = None
-    #     from_last_end_bytes = self.from_byte_token_index_map.keys()[-1]
-    #     to_last_end_bytes = self.to_byte_token_index_map.keys()[-1]
-    #     if byte_offset['from'] is not None:
-    #         if byte_offset['from'] < self.from_byte_token_index_map.values()[0]:
-    #             from_token_start = 0
-    #         else:
-    #             key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from'])
-    #         # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below?
-    #         if key > from_last_end_bytes:
-    #             from_token_start = self.from_byte_token_index_map[from_last_end_bytes]
-    #         else:
-    #             from_token_
-    #     if byte_offset['to'] is not None:
-    #         if byte_offset['to'] < self.to_byte_token_index_map.values()[0]:
-    #             to_token_start = 0
-    #         else:
-    #             key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to'])
-    #         if key >= from
-    #         if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0:
-    #             if (
-    #                     byte_offset['from'] >= self.from_last_end_bytes
-    #             ):  # if the from paragraph is at the end
-    #                 from_token_start = next(
-    #                     reversed(self.from_byte_token_index_map.values())
-    #                 )
-    #             else:
-    #                 key = find_greatest_le_key(
-    #                 byte_offset['from'], self.from_byte_token_index_map
-    #                 )
-    #                 from_token_start = self.from_byte_token_index_map[key]
-    #         else:
-    #             from_token_start = 0
-
-    #     to_offset = None
-    #     if byte_offset['to'] is not None:
-    #         if len(self.to_byte_token_index_map) > 0:
-    #             if to_byte_start >= self.to_last_end_bytes:
-    #                 to_token_start = next(reversed(self.to_byte_token_index_map.values()))
-    #             else:
-    #                 key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map)
-    #                 to_token_start = self.to_byte_token_index_map[key]
-    #         else:
-    #             to_token_start = 0
-
-    #     return {'from': from_token_start,
-    #             'to': to_token_start}
 
     def tokenize(self, bytes):
         return self.tokenizer.tokenize(bytes.decode("utf-8"))
 
     def to_operations(self):
-
         for entry in self.diff["diff"]:
-
+            # add back the newline
             entry["text"] += "\n"
             text = entry["text"]
             offset = entry["offset"]
-            if offset["from"] and entry.get("lineNumber") is not None :
-                if entry['type'] in [0, 2, 3, 4]:
-                    self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode())
-
-            if offset["to"]:
-                if entry['type'] in [0, 1, 3, 5]:
-                    self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode())
-
-
-            # add back the newline
-
             # this is the first byte of the line in the 'from' revision.
             from_start_line = entry["offset"]["from"]
             # this is the first byte of the line in the 'to' revision.
             to_start_line = entry["offset"]["to"]
 
             if entry["type"] == 0:
-                yield from self.doEqual(text, offset)
+                yield from self.doEqual(entry)
 
                 # a line included in the 'to' revision, but not in the 'from' revision
             elif entry["type"] == 1:
-                yield from self.doInsert(text, offset)
+                yield from self.doInsert(entry)
 
                 # a line included in the 'from' revision, but not in the 'to' revision
             elif entry["type"] == 2:
-                yield from self.doDelete(text, offset)
+                yield from self.doDelete(entry)
 
             elif entry["type"] == 3:
-                yield from self.doHighlightRange(
-                    text, entry["highlightRanges"], offset, entry["lineNumber"]
-                )
+                # sometimes, for some reason we don't have a 'to' index here. we'll save these for later
+                if entry["offset"]["to"] is None:
+                    self.highlights_without_offset.append(entry)
+                else:
+                    yield from self.doHighlightRange(entry)
 
             elif entry["type"] == 4:
-                self.par_move_dict[entry["moveInfo"]["id"]] = entry
-
                 linkId = entry["moveInfo"]["linkId"]
-                if linkId in self.par_move_dict:
-                    yield from self.doParMove(entry, self.par_move_dict[linkId])
 
-                # we need to count the tokens in the from revision so token index is correct
-                # self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
-                # self.n_from_start_tokens += len(
-                #     self.tokenize(entry["text"].encode())
-                # )
-
+                if linkId in self.to_par_move_dict:
+                    yield from self.doParMove(entry, self.to_par_move_dict.pop(linkId))
+                else:
+                    self.from_par_move_dict[entry["moveInfo"]["id"]] = entry
+
             elif entry["type"] == 5:
                 linkId = entry["moveInfo"]["linkId"]
-                if linkId in self.par_move_dict:
-                    yield from self.doParMove(self.par_move_dict[linkId], entry)
+                if linkId in self.from_par_move_dict:
+                    yield from self.doParMove(
+                        self.from_par_move_dict.pop(linkId), entry
+                    )
                 else:
-                    self.par_move_dict[entry["moveInfo"]["id"]] = entry
-                    # call doHighlightRange just to update the token indices
-                    # offset = {
-                    #     "from": self.n_from_end_tokens,
-                    #     "to": entry["offset"]["to"],
-                    # }
-                    # res = self.doHighlightRange(
-                    #     entry["text"],
-                    #     entry["highlightRanges"],
-                    #     offset,
-                    #     entry["lineNumber"],
-                    #     update_idx="to",
-                    # )
-                    # list(res)
-                # self.n_to_end_tokens += len(self.tokenize(entry["text"].encode()))
-                # self.n_to_start_tokens += len(
-                #     self.tokenize(entry["text"].encode())
-                # )
-
+                    self.to_par_move_dict[entry["moveInfo"]["id"]] = entry
             else:
                 # The 'type' isn't one of the known
                 raise ValueError(d)
 
+        # now we should be able to apply highlights
+
+        for entry in self.highlights_without_offset:
+            yield from self.doHighlightRange(entry)
+
+        if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
+            print("PROBLEM! Unmatched parmoves!")
+            print(self.from_par_move_dict)
+            print(self.to_par_move_dict)
+            # We can try to match them:
+            for lkey in self.from_par_move_dict.keys():
+                for rkey in self.to_par_move_dict.keys():
+                    from_diff = self.from_par_move_dict[lkey]
+                    to_diff = self.to_par_move_dict[rkey]
+                    if self.match_parmoves_exact(from_diff, to_diff):
+                        yield from self.doParMove(from_diff, to_diff)
+                        del self.to_par_move_dict[lkey]
+                        del self.from_par_move_dict[rkey]
+                        break
+
+        # if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
+        #     print("Couldn't find exact matches for all parmoves!")
+        #     # we couldn't find all the matches via exact match
+        #     # let's try matching based on line number instead
+        #     lkeys_to_remove = []
+        #     for lkey, from_diff in self.from_par_move_dict.items():
+        #         from_linenum = from_diff["moveInfo"]["linkId"].split("_")[2]
+        #         rkey_to_remove = None
+        #         for rkey, to_diff in self.to_par_move_dict.items():
+        #             to_linenum = rkey.split("_")[2]
+        #             if from_linenum == to_linenum:
+        #                 print("Matching on line number")
+        #                 yield from self.doParMove(from_diff, to_diff)
+        #                 rkey_to_remove = rkey
+        #                 lkeys_to_remove.append(lkey)
+        #                 break
+        #         if rkey_to_remove is not None:
+        #             del self.to_par_move_dict[rkey_to_remove]
+        #     for lkey in lkeys_to_remove:
+        #         del self.from_par_move_dict[lkey]
+
+        # if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
+        #     print("Couldn't find exact matches for all parmoves!")
+        #     # we couldn't find all the matches via exact match or line number
+        #     # let's try matching based on opIndex instead
+        #     lkeys_to_remove = []
+        #     for lkey, from_diff in self.from_par_move_dict.items():
+        #         rkey_to_remove = None
+        #         from_idx = from_diff["moveInfo"]["linkId"].split("_")[1]
+        #         for rkey, to_diff in self.to_par_move_dict.items():
+        #             to_idx = rkey.split("_")[1]
+        #             print(from_idx)
+        #             print(to_idx)
+        #             if from_idx == to_idx:
+        #                 yield from self.doParMove(from_diff, to_diff)
+        #                 rkey_to_remove = rkey
+        #                 lkeys_to_remove.append(lkey)
+        #         if rkey_to_remove is not None:
+        #             del self.to_par_move_dict[rkey_to_remove]
+        #     for lkey in lkeys_to_remove:
+        #         del self.from_par_move_dict[lkey]
+
+        # we couldn't find matches. treat type 4 as removal and type 5 as highlight.
+        for from_diff in self.from_par_move_dict.values():
+            yield from self.doDelete(from_diff)
+
+        # only we don't know the from index; we assume its already handled.
+        for to_diff in self.to_par_move_dict.values():
+            offset["from"] = 0
+            offset["to"] = None
+            diffops = self.doHighlightRange(
+                {
+                    "text": to_diff["text"],
+                    "highlightRanges": to_diff["highlightRanges"],
+                    'offset': offset,
+                    'lineNumber': to_diff["lineNumber"],
+                }
+            )
+            diffops = [
+                (type(op)(None, None, op.b1, op.b2), [], bseq)
+                for op, _, bseq in diffops
+                if isinstance(op, Insert) or isinstance(op, Equal)
+            ]
+            yield from diffops
+
+    def match_parmoves_exact(self, from_diff, to_diff):
+        ops, from_tokens, to_tokens = list(zip(*self.doParMove(from_diff, to_diff)))
+        from_text = "".join(chain.from_iterable(from_tokens))
+        # we know they match if we apply the highlight ranges and the "from" tokens equal the lhs tokens.
+        if from_text == from_diff["text"]:
+            print("MATCH FOUND")
+            return True
+        else:
+            print("NO MATCH")
+            print(len(from_text))
+            print(len(from_diff["text"]))
+            return False
+
         # mwpersistence expects differences to be represented in order from the
         # result's perspective ("to"), not the previous text. Thus, if a line
         # is moved earlier then its insertion should appear before its deletion.
@@ -230,12 +215,12 @@ class DiffToOperationMap:
         # strictly increasing, while the "from" segments should merely be
         # non-overlapping.
 
-    def doEqual(self, equal_segment, offset, update_idx="all"):
-        # if from_token_start is None:
-        #     from_token_start = self.n_from_start_tokens
-        # if to_token_start is None:
-        #     to_token_start = self.n_to_start_tokens
-
+    def doEqual(self, entry):
+        equal_segment, offset, lineNumber = (
+            entry["text"],
+            entry["offset"],
+            entry["lineNumber"],
+        )
         if isinstance(equal_segment, str):
             equal_bytes = equal_segment.encode()
         elif isinstance(equal_segment, bytes):
@@ -243,35 +228,28 @@
         else:
             raise ValueError(equal_segment)
 
+        self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(equal_bytes)
+        self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(equal_bytes)
+
         tokens = self.tokenize(equal_bytes)
         n_tokens = len(tokens)
-
-        # token_offset = self.get_token_offset(offset)
-
-        # n_from_end_tokens = token_offset['from'] + n_tokens
-        # n_to_end_tokens = token_offset['to'] + n_tokens
-
         yield (
             Equal(
-                offset['from'],
+                offset["from"],
                 None,
-                offset['to'],
+                offset["to"],
                 None,
             ),
             tokens,
             tokens,
         )
 
-        # if update_idx in ["from", "all"]:
-        #     self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
-
-        # if update_idx in ["to", "all"]:
-        #     self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
-
-        # self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
-        # self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
-
-    def doInsert(self, insert_segment, offset, update_idx="all"):
+    def doInsert(self, entry):
+        insert_segment, offset, lineNumber = (
+            entry["text"],
+            entry["offset"],
+            entry["lineNumber"],
+        )
         if isinstance(insert_segment, str):
             insert_bytes = insert_segment.encode()
         elif isinstance(insert_segment, bytes):
@@ -279,23 +257,24 @@
         else:
             raise ValueError(insert_segment)
         tokens = self.tokenize(insert_bytes)
-        # n_tokens = len(tokens)
-        # token_offset = self.get_token_offset(offset)
-        # n_to_end_tokens = token_offset['to'] + n_tokens
+        self.to_linenumber_bytes_map[lineNumber] = offset["to"] + len(insert_bytes)
         yield (
             Insert(
                 None,
                 None,
-                offset['to'],
+                offset["to"],
                 None,
             ),
             [],
             tokens,
         )
-        # We have now used more of the "to" tokens.
-        #self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
 
-    def doDelete(self, delete_segment, offset, update_idx="all", type=str):
+    def doDelete(self, entry):
+        delete_segment, offset, lineNumber = (
+            entry["text"],
+            entry["offset"],
+            entry.get("lineNumber", None),
+        )
         if isinstance(delete_segment, str):
             delete_bytes = delete_segment.encode()
         elif isinstance(delete_segment, bytes):
@@ -303,26 +282,22 @@
         else:
             raise ValueError(delete_segment)
         tokens = self.tokenize(delete_bytes)
-        # n_tokens = len(tokens)
-
-        # token_offset = self.get_token_offset(offset)
-        # n_from_end_tokens = token_offset['from'] + n_tokens
+        if lineNumber is not None:
+            self.from_linenumber_bytes_map[lineNumber] = offset["from"] + len(delete_bytes)
 
         yield (
-            Delete(
-                offset['from'],
-                None,
-                None,
-                None
-            ),
+            Delete(offset["from"], None, None, None),
             tokens,
             [],
         )
 
-        #self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
-
-    def doHighlightRange(
-            self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"):
+    def doHighlightRange(self, entry):
+        highlight_text, highlightRanges, offset, lineNumber = (
+            entry["text"],
+            entry["highlightRanges"],
+            entry["offset"],
+            entry["lineNumber"],
+        )
 
         # The text field is an overlapping mix of both the from and to,
         # so we need to handle it highlight-by-highlight.
@@ -334,15 +309,22 @@
 
         # it's possible for offset['to'] to be null.
         # we can get it from the line number?
-        update_linenumber_map = True
+        # this bit is a little hacky as it deals with ideosyncratic wikidiff2 behavior
         if offset["to"] is None:
-            keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
-            if keyidx > 0:
-                print(self.to_linenumber_bytes_map)
-                key = self.to_linenumber_bytes_map.keys()[keyidx]
-                offset["to"] = self.to_linenumber_bytes_map[key]
+            # if the line already exists, we insert before it.
+            if lineNumber in self.to_linenumber_bytes_map:
+                keyidx = self.to_linenumber_bytes_map.bisect_left(lineNumber) - 1
             else:
+                keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
+            key = None
+            if keyidx == -1:
                 offset["to"] = 0
+            elif len(self.to_linenumber_bytes_map.keys()) > 0:
+                key = self.to_linenumber_bytes_map.keys()[keyidx]
+            else:
+                key = 0
+            if key is not None:
+                offset["to"] = self.to_linenumber_bytes_map.get(key, 0)
 
         highlight_offset = offset
         # note that diffs are token-level, but the indexes are byte-level
@@ -353,13 +335,16 @@
             if highlight_start > highlight_end:
                 equal_bytes = highlight_bytes[highlight_end:highlight_start]
                 n_equal_bytes = len(equal_bytes)
+
                 yield from self.doEqual(
-                    equal_bytes, highlight_offset, update_idx=update_idx
+                    {
+                        "text": equal_bytes,
+                        "offset": highlight_offset,
+                        "lineNumber": lineNumber,
+                    }
                 )
                 highlight_offset["from"] += n_equal_bytes
                 highlight_offset["to"] += n_equal_bytes
-                if update_linenumber_map:
-                    self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
 
             # handle highlighted insert / delete
             highlight_end = highlight_start + highlightRange["length"]
@@ -368,14 +353,20 @@
 
             if highlightRange["type"] == 0:
                 yield from self.doInsert(
-                    range_bytes, highlight_offset, update_idx=update_idx
+                    {
+                        "text": range_bytes,
+                        "offset": highlight_offset,
+                        "lineNumber": lineNumber,
+                    }
                 )
                 highlight_offset["to"] += n_range_bytes
-                if update_linenumber_map:
-                    self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
             elif highlightRange["type"] == 1:
                 yield from self.doDelete(
-                    range_bytes, highlight_offset, update_idx=update_idx
+                    {
+                        "text": range_bytes,
+                        "offset": highlight_offset,
+                        "lineNumber": lineNumber,
+                    }
                 )
                 highlight_offset["from"] += n_range_bytes
             else:
@@ -384,16 +375,25 @@
         # handle the rest of the line which is equal
         if highlight_end < len(highlight_bytes):
             range_bytes = highlight_bytes[highlight_end:]
-            yield from self.doEqual(range_bytes, highlight_offset)
+            yield from self.doEqual(
+                {
+                    "text": range_bytes,
+                    "offset": highlight_offset,
+                    "lineNumber": lineNumber,
+                }
+            )
 
     def doParMove(self, from_diff, to_diff):
-        # the tricky part here is to put the tokens in the right spots.
         from_byte_start = from_diff["offset"]["from"]
         to_byte_start = to_diff["offset"]["to"]
         offset = {"from": from_byte_start, "to": to_byte_start}
-        # we need to cache the indexes; replace them; then restore
         yield from self.doHighlightRange(
-            to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
+            {
+                "text": to_diff["text"],
+                "highlightRanges": to_diff["highlightRanges"],
+                'offset': offset,
+                'lineNumber': to_diff["lineNumber"],
+            }
         )
 
 
@@ -403,9 +403,13 @@ class WikiDiffMatcher:
         texts: list[str] = None,
         tokenizer: Optional[RegexTokenizer] = None,
         url: Optional[str] = "http://127.0.0.1:8000",
+        server=False
     ):
         # Pre-compute diffs to reduce traffic overhead.
-        self.diffs = compute_diffs(url, texts)
+        if server is True:
+            self.diffs = list(compute_diffs_server(list(texts),url))
+        else:
+            self.diffs = list(compute_diffs(list(texts)))
         self.tokenizer = tokenizer or TOKENIZER
 
     class Processor(DiffEngine.Processor):
@@ -429,36 +433,33 @@ class WikiDiffMatcher:
             # this happens when revisions are actually equal.
             if len(diffops) == 0:
                 self.last_tokens = self.tokenizer.tokenize(text)
-                ops = [Equal(0, len(self.last_tokens),
-                             0, len(self.last_tokens))]
+                ops = [Equal(0, len(self.last_tokens), 0, len(self.last_tokens))]
                 return ops, self.last_tokens, self.last_tokens
 
             # we get back the byte indices; now we transform to token indices
 
-            diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1))
-            aorder_ops = []
+            diffops.sort(
+                key=lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1)
+            )
+            aorder_ops = []
             token_offset = 0
-            _, aseq, _ = list(zip( * diffops))
+            _, aseq, _ = list(zip(*diffops))
 
             for op, tokens, _ in diffops:
                 a1 = token_offset
                 if isinstance(op, Equal) or isinstance(op, Delete):
                     token_offset += len(tokens)
                     a2 = token_offset
-                    aorder_ops.append(type(op)(a1,
-                                               a2,
-                                               op.b1,
-                                               op.b1))
+                    aorder_ops.append(type(op)(a1, a2, op.b1, op.b1))
                 else:
-                    aorder_ops.append(Insert(a1,
-                                             a1,
-                                             op.b1,
-                                             op.b1))
+                    aorder_ops.append(Insert(a1, a1, op.b1, op.b1))
 
-            _, aseq, bseq = zip(* diffops)
+            _, aseq, bseq = zip(*diffops)
             diffops = list(zip(aorder_ops, aseq, bseq))
-            diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1))
-            _, _, bseq = list(zip(* diffops))
+            diffops.sort(
+                key=lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1)
+            )
+            _, _, bseq = list(zip(*diffops))
             border_ops = []
             token_offset = 0
             for op, _, tokens in diffops:
@@ -466,16 +467,10 @@ class WikiDiffMatcher:
                 if isinstance(op, Equal) or isinstance(op, Insert):
                     token_offset += len(tokens)
                     b2 = token_offset
-                    border_ops.append(type(op)(op.a1,
-                                               op.a2,
-                                               b1,
-                                               b2))
+                    border_ops.append(type(op)(op.a1, op.a2, b1, b2))
                 else:
-                    border_ops.append(type(op)(op.a1,
-                                               op.a2,
-                                               b1,
-                                               b1))
-            
+                    border_ops.append(type(op)(op.a1, op.a2, b1, b1))
+
             self.previous_text = text
 
             self.last_tokens = list(chain.from_iterable(aseq))
@@ -1,5 +1,9 @@
 <?php
 
+header("Cache-Control: no-store, no-cache, must-revalidate, max-age=0");
+header("Cache-Control: post-check=0, pre-check=0", false);
+header("Pragma: no-cache");
+
 // Launch this server with:
 // php -S localhost:8000 -q -c php.ini
 
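
Taken together, callers now select the diff back-end through the new server flag on WikiDiffMatcher, while the processor interface is unchanged. A short usage sketch mirroring the tests above (the revision strings are placeholders; the server variant assumes the PHP process from wikidiff2_api.php is running on 127.0.0.1:8000):

    from wiki_diff_matcher import WikiDiffMatcher

    revisions = ["first revision\n", "second revision\n"]

    # default: in-process pywikidiff2
    matcher = WikiDiffMatcher(revisions)
    # alternative: POST the revisions to the local wikidiff2_api.php server
    # matcher = WikiDiffMatcher(revisions, server=True)

    diff_processor = matcher.processor()
    for rev in revisions:
        ops, a, b = diff_processor.process(rev)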