Almost there; working out edge cases.
parent cf1fb61a84
commit 4654911533
@@ -12,6 +12,7 @@ dependencies = [
     "mwtypes>=0.4.0",
     "mwxml>=0.3.6",
     "pyarrow>=20.0.0",
+    "sortedcontainers>=2.4.0",
     "yamlconf>=0.2.6",
 ]
 
@@ -22,6 +23,7 @@ deltas = { git = "https://github.com/groceryheist/deltas" }
 
 [dependency-groups]
 dev = [
+    "ipython>=8.18.1",
     "pandas>=2.1.0",
     "pytest>=8.4.1",
     "pytest-asyncio>=1.0.0",
@@ -2,7 +2,7 @@
 import asyncio
 import subprocess
 from functools import partial
-
+import re
 import pytest
 import pytest_asyncio
 from typing import List
@@ -30,11 +30,10 @@ async def start_stop_server():
 def assert_equal_enough(tokens:List[Token], rev):
     # the tokens exclude newlines
     # we allow extra whitespace at the beginning or end
-    token_doc = ''.join(str(t) for t in tokens).strip()
-    while '\n\n' in token_doc:
-        token_doc = token_doc.replace('\n\n','\n')
-    while '\n\n' in rev:
-        rev = rev.replace('\n\n','\n').strip()
+    token_doc = ''.join(str(t) for t in tokens)
+    token_doc = re.sub(r'\s+', ' ', token_doc).strip()
+    rev = re.sub(r'\s+', ' ', rev).strip()
+
     print(token_doc, file = open('token','w'))
     print(rev, file = open('rev','w'))
     assert token_doc == rev
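Note: the new comparison normalizes all whitespace (not just doubled newlines) before asserting equality, so token streams that differ only in spacing still match. A minimal sketch of that normalization (the helper name is illustrative, not from the repo):

    import re

    def normalize_ws(text: str) -> str:
        # collapse every run of whitespace (spaces, tabs, newlines) to a single space
        return re.sub(r'\s+', ' ', text).strip()

    assert normalize_ws("foo\n\nbar  baz ") == normalize_ws("foo bar baz")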
@@ -62,7 +61,6 @@ def assert_correct_equal_section(ops, expected_equal_lines, expected_equal_tokens):
     # if the last line is an equal
     if first_unequal_token is None:
         first_unequal_token = ops[-1].b2
-
     
     assert n_equal_lines == expected_equal_lines
     # check that there are no gaps and the number is as expected
@@ -76,9 +74,8 @@ def test_equality():
     diff_processor = matcher.processor()
     ops, a, b = diff_processor.process(rev1)
     ops, a, b = diff_processor.process(rev1 + " ")
-    assert len(ops) == 258
+    assert len(ops) == 257
     for op in ops[:-2]:
-        print(op)
         assert isinstance(op, Equal)
 
     # note that the whitespace token does not result in a token according to wikitext_split
@@ -152,44 +149,48 @@ def test_delete():
     assert_equal_enough(b, rev2)
     assert_equal_enough(a, rev1)
 
-    initial_equal_tokens = 0
     first_nondelete_token = None
     n_deletes = 0
     n_deleted_tokens = 0
-    last_b2 = initial_equal_tokens
+    initial_equal_lines = 256
+    initial_equal_tokens = 9911
+    for i, op in enumerate(ops):
+        if initial_equal_lines > 0:
+            assert isinstance(op, Equal)
+        else:
+            break
+        initial_equal_lines -= 1
+
+    assert initial_equal_lines == 0
+    assert ops[i-1].a2 - ops[0].a1 == initial_equal_tokens
 
-    initial_equal_lines = 4
-    initial_equal_tokens = 14
-    last_b2 = assert_correct_equal_section(ops,
-                                           expected_equal_lines=initial_equal_lines,
-                                           expected_equal_tokens=initial_equal_tokens)
     first_noninsert_token = initial_equal_tokens
-    
 
-    last_non_delete = False
+    last_delete = False
+    last_insert = False
     idx = 0
+    n_non_delete = 0
+
+    last_delete_idx = 0
     for op in ops[initial_equal_lines:]:
         idx += 1
-        # deletes are interleaved with Equal newlines.
-        if not isinstance(op, Delete):
-            if last_non_delete:
-                first_nondelete_token = op.a1
-                break
-            last_non_delete = True
-        else:
-            last_non_delete = False
-        if last_non_delete:
+        if isinstance(op, Delete):
             n_deletes += 1
-            n_deleted_tokens += op.a2 - last_b2
-            last_b2 = op.a2
-        
-    assert n_deletes == 2
-    assert n_deleted_tokens == last_b2 - initial_equal_tokens == first_nondelete_token - initial_equal_tokens == 317
+            n_deleted_tokens += op.a2 - op.a1
+            last_delete = True
+            last_delete_idx = idx
+        # we need to add back a newline when we have a delete
+        else:
+            n_non_delete += 1
+            if not last_delete and first_nondelete_token is None:
+                first_nondelete_token = op.a1
 
-
-    last_b2 = assert_correct_equal_section(ops[initial_equal_lines + idx:],
-                                           expected_equal_lines=252,
-                                           expected_equal_tokens=9765)
+        if n_non_delete:
+            last_b2 = op.b2
+
+    assert n_deletes == 4
+    assert n_deleted_tokens == 320
+    assert idx == len(ops)
 
 
 # first lets test that we properly build the operations. 
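Note: the rewritten loop counts deleted tokens straight from each operation's own 'from' span (op.a2 - op.a1) instead of tracking a running last_b2. A toy illustration with deltas operations (the token counts are made up):

    from deltas import Delete, Equal

    ops = [Equal(0, 5, 0, 5), Delete(5, 9, 5, 5), Equal(9, 12, 5, 8)]
    n_deleted_tokens = sum(op.a2 - op.a1 for op in ops if isinstance(op, Delete))
    assert n_deleted_tokens == 4  # the single Delete spans 'from' tokens 5..9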
@@ -204,14 +205,8 @@ def test_addition():
     # so they reflect the state of the text according to the diff processor
     ops, a, b = diff_processor.process(rev1)
 
-    even = True
     for op in ops:
-        if even:
-            assert isinstance(op, Insert)
-            even = False
-        else:
-            assert isinstance(op, Equal)
-            even = True
+        assert isinstance(op, Insert)
 
     assert_equal_enough(b, rev1)
     
@@ -221,8 +216,8 @@ def test_addition():
     assert_equal_enough(a, rev1)
     assert_equal_enough(b, rev2)
     ops = list(ops)
-    initial_equal_lines = 256
-    initial_equal_tokens = 9487
+    initial_equal_lines = 255
+    initial_equal_tokens = 9614
     last_b2 = assert_correct_equal_section(ops,
                                            expected_equal_lines=initial_equal_lines,
                                            expected_equal_tokens=initial_equal_tokens)
@@ -232,16 +227,21 @@ def test_addition():
     n_inserted_tokens = 0
     last_b2 = last_insert_b2 = initial_equal_tokens
     idx = 0
-    print(ops[initial_equal_lines:])
 
+    last_insert = False
     for op in ops[initial_equal_lines:]:
         if isinstance(op, Insert):
             n_inserts += 1
             n_inserted_tokens += op.b2 - op.b1
             last_insert_b2 = op.b2
+            last_insert = True
+        elif last_insert:
+            assert isinstance(op, Equal)
+
         last_b2 = op.b2
 
-    assert n_inserted_tokens + 1 == last_insert_b2 - initial_equal_tokens == 293
-    assert n_inserts == 2
+    assert n_inserted_tokens == last_insert_b2 - initial_equal_tokens == 296
+    assert n_inserts == 4
 
 def test_paragraph_move():
     rev1 = open("test/test_diff_revisions/1295229484").read()
@@ -269,6 +269,63 @@ def test_paragraph_move_and_change():
     assert_equal_enough(a, rev1)
     assert_equal_enough(b, rev2)
 
+def test_infobox():
+    rev1 = open("test/test_diff_revisions/test_infobox_from").read()
+    rev2 = open("test/test_diff_revisions/test_infobox_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+
+    # note that a and b are constructed from the diffs.
+    # so they reflect the state of the text according to the diff processor
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
+    assert_equal_enough(a, rev1)
+
+def test_leading_whitespace():
+    rev1 = open("test/test_diff_revisions/test_leading_ws_from").read()
+    rev2 = open("test/test_diff_revisions/test_leading_ws_to").read()
+    matcher = WikiDiffMatcher([rev1,rev2])
+    diff_processor = matcher.processor()
+
+    # note that a and b are constructed from the diffs.
+    # so they reflect the state of the text according to the diff processor
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev2)
+    assert_equal_enough(b, rev2)
+    assert_equal_enough(a, rev1)
+
+# def test_whitespace_2():
+#     rev1 = open("test/test_diff_revisions/test_whitespace_2_from").read()
+#     rev2 = open("test/test_diff_revisions/test_whitespace_2_to").read()
+#     matcher = WikiDiffMatcher([rev1,rev2])
+#     diff_processor = matcher.processor()
+
+#     # note that a and b are constructed from the diffs.
+#     # so they reflect the state of the text according to the diff processor
+#     ops, a, b = diff_processor.process(rev1)
+#     ops, a, b = diff_processor.process(rev2)
+#     assert_equal_enough(b, rev2)
+#     assert_equal_enough(a, rev1)
+
+
+
+def test_actually_equal():
+    rev1 = open("test/test_diff_revisions/1285792388").read()
+    # whitespace is added because exact identity reverts do not result in diffs.
+    matcher = WikiDiffMatcher([rev1,rev1])
+    diff_processor = matcher.processor()
+    ops, a, b = diff_processor.process(rev1)
+    ops, a, b = diff_processor.process(rev1)
+    assert len(ops) == 1
+    assert isinstance(ops[0], Equal)
+
+    # note that the whitespace token does not result in a token according to wikitext_split
+    # compare the tokens based on the diffs to the baseline
+    # whitespace differences are allowed
+    assert_equal_enough(b, rev1)
+    assert_equal_enough(a, rev1)
+
 # slow test
 def test_diff_consistency():
     from mwxml import Dump
@@ -3,25 +3,37 @@ import sys
 from collections import namedtuple
 from itertools import chain
 from typing import Dict, Generator, List, Optional, Tuple
-
+from sortedcontainers import SortedDict
 import requests
-from deltas import (Delete, DiffEngine, Equal, Insert, Operation, Token,
-                    RegexTokenizer, tokenizers)
+from deltas import (Delete, DiffEngine, Equal, Insert, Operation,
+                    RegexTokenizer, Token, tokenizers)
 
 TOKENIZER = tokenizers.wikitext_split
 
+# def find_greatest_le_key(target_key, data_dict):
+#     found_key = None
+#     for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
+#         if key <= target_key:
+#             found_key = (
+#                 key  # This is the largest key found so far that satisfies the condition
+#             )
+#         else:
+#             # Since the dictionary is sorted, if key > target_key,
+#             # all subsequent keys will also be > target_key.
+#             return found_key or key
+
+# def find_smallest_gt_key(target_key, data_dict):
+#     found_key = None
+#     for key in reversed(data_dict):  # Iterates over keys in insertion order (which is sorted)
+#         if key >= target_key:
+#             found_key = (
+#                 key  # This is the largest key found so far that satisfies the condition
+#             )
+#         else:
+#             # Since the dictionary is sorted, if key > target_key,
+#             # all subsequent keys will also be > target_key.
+#             return found_key or key
 
-def find_greatest_le_key(target_key, data_dict):
-    found_key = None
-    for key in data_dict:  # Iterates over keys in insertion order (which is sorted)
-        if key <= target_key:
-            found_key = (
-                key  # This is the largest key found so far that satisfies the condition
-            )
-        else:
-            # Since the dictionary is sorted, if key > target_key,
-            # all subsequent keys will also be > target_key.
-            return found_key or key
 
 
 def compute_diffs(url: str, texts: list[str]) -> list:
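Note: the retired find_greatest_le_key helper scanned keys linearly; the SortedDict maps introduced by this commit can answer the same "largest key <= target" question with a bisect, which is how the lineNumber fallback later in this diff uses them. A minimal sketch, assuming that lookup is all we need (the map contents are made up):

    from sortedcontainers import SortedDict

    offsets = SortedDict({0: 0, 120: 34, 310: 88})  # e.g. byte offset -> token index

    def greatest_le(sd, target):
        idx = sd.bisect_right(target) - 1  # index of the last key <= target
        return sd.keys()[idx] if idx >= 0 else None

    assert greatest_le(offsets, 200) == 120
    assert greatest_le(offsets, -5) is None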
@@ -61,79 +73,114 @@ class DiffToOperationMap:
         self.diff = json.loads(diff)
 
         # the code below is designed to work in bytes because that's how wikidiff2 indexes
-        self.from_last_end_bytes = 0
-        self.from_last_to_bytes = 0
-        self.n_from_start_tokens = 0
-        self.n_from_end_tokens = 0
-        self.n_from_start_tokens = 0
-        self.n_to_start_tokens = 0
-        self.last_to_start_line = 0
-        self.last_from_start_line = 0
-        self.from_last_end_bytes = 0
-        self.to_last_end_bytes = 0
+        # self.from_last_end_bytes = 0
+        # self.from_last_to_bytes = 0
+        # self.n_from_start_tokens = 0
+        # self.n_from_end_tokens = 0
+        # self.n_from_start_tokens = 0
+        # self.n_to_start_tokens = 0
+        # self.from_last_end_bytes = 0
+        # self.to_last_end_bytes = 0
         # keeps track of the number of tokens seen so far
         # to avoid repeated tokenization
-        self.from_byte_token_index_map: Dict[int, int] = {}
-        self.to_byte_token_index_map: Dict[int, int] = {}
+        # self.from_byte_token_index_map: SortedDict[int, int] = SortedDict()
+        # self.to_byte_token_index_map: SortedDict[int, int] = SortedDict()
         self.par_move_dict = {}
 
         # we need to keep track of the bytes of line numbers to recover when wikidiff2 loses offsets.
-        self.to_linenumber_bytes_map = {}
+        self.to_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
+        self.from_linenumber_bytes_map: SortedDict[int, int] = SortedDict()
+    # def get_token_offset(self, byte_offset):
+    #     from_token_start = None
+    #     to_token_start = None
+    #     from_last_end_bytes = self.from_byte_token_index_map.keys()[-1]
+    #     to_last_end_bytes = self.to_byte_token_index_map.keys()[-1]
+    #     if byte_offset['from'] is not None:
+    #         if byte_offset['from'] < self.from_byte_token_index_map.values()[0]:
+    #             from_token_start = 0
+    #         else:
+    #             key = self.from_byte_token_index_map.bisect_key_right(byte_offset['from'])
+    #         # this could be an issue; we assume that the next tokens are inserted at the end, but maybe they could go even further below?
+    #         if key > from_last_end_bytes:
+    #             from_token_start = self.from_byte_token_index_map[from_last_end_bytes]
+    #         else:
+    #             from_token_
+    #     if byte_offset['to'] is not None:
+    #         if byte_offset['to'] < self.to_byte_token_index_map.values()[0]:
+    #             to_token_start = 0
+    #         else:
+    #             key = self.from_byte_token_index_map.bisect_key_right(byte_offset['to'])
+    #         if key >= from
+    #         if len(self.from_byte_token_index_map) > 0 and byte_offset['from'] != 0:
+    #             if (
+    #                     byte_offset['from'] >= self.from_last_end_bytes
+    #             ):  # if the from paragraph is at the end
+    #                 from_token_start = next(
+    #                     reversed(self.from_byte_token_index_map.values())
+    #                 )
+    #             else:
+    #                 key = find_greatest_le_key(
+    #                 byte_offset['from'], self.from_byte_token_index_map
+    #                 )
+    #                 from_token_start = self.from_byte_token_index_map[key]
+    #         else:
+    #             from_token_start = 0
+
+    #     to_offset = None
+    #     if byte_offset['to'] is not None:
+    #         if len(self.to_byte_token_index_map) > 0:
+    #             if to_byte_start >= self.to_last_end_bytes:
+    #                 to_token_start = next(reversed(self.to_byte_token_index_map.values()))
+    #             else:
+    #                 key = find_smallest_gt_key(to_byte_start, self.to_byte_token_index_map)
+    #                 to_token_start = self.to_byte_token_index_map[key]
+    #         else:
+    #             to_token_start = 0
+
+    #     return {'from': from_token_start,
+    #             'to': to_token_start}
 
     def tokenize(self, bytes):
         return self.tokenizer.tokenize(bytes.decode("utf-8"))
 
-    def newline_result(self):
-        self.n_from_end_tokens += 1
-        self.n_from_start_tokens += 1
-        self.n_to_end_tokens += 1
-        self.n_to_start_tokens +=1
-        
-        return (Equal(self.n_from_start_tokens - 1,
-                     self.n_from_end_tokens,
-                     self.n_to_start_tokens - 1,
-                     self.n_from_start_tokens),
-                [Token('\n')],
-                [Token('\n')])
-                      
-    
     def to_operations(self):
-        parmoves = []
-        [print(diff) for diff in self.diff["diff"][0:5]]
+
         for entry in self.diff["diff"]:
+
+            entry["text"] += "\n"
+            text = entry["text"]
             offset = entry["offset"]
+            if offset["from"] and entry.get("lineNumber") is not None :
+                if entry['type'] in [0, 2, 3, 4]:
+                    self.from_linenumber_bytes_map[entry["lineNumber"]] = offset["from"] + len(text.encode())
 
             if offset["to"]:
-                self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"]
+                if entry['type'] in [0, 1, 3, 5]:
+                    self.to_linenumber_bytes_map[entry["lineNumber"]] = offset["to"] + len(text.encode())
+
+
+            # add back the newline
 
-            text = entry["text"]
-            # ignore empty diffs. They don't have any tokens
-            if len(text) == 0:
-                continue
             # this is the first byte of the line in the 'from' revision.
             from_start_line = entry["offset"]["from"]
             # this is the first byte of the line in the 'to' revision.
             to_start_line = entry["offset"]["to"]
 
             if entry["type"] == 0:
                 yield from self.doEqual(text, offset)
-                yield self.newline_result()
 
                 # a line included in the 'to' revision, but not in the 'from' revision
             elif entry["type"] == 1:
                 yield from self.doInsert(text, offset)
-                yield self.newline_result()
 
                 # a line included in the 'from' revision, but not in the 'to' revision
             elif entry["type"] == 2:
                 yield from self.doDelete(text, offset)
-                yield self.newline_result()                
 
             elif entry["type"] == 3:
                 yield from self.doHighlightRange(
                     text, entry["highlightRanges"], offset, entry["lineNumber"]
                 )
-                yield self.newline_result()
 
             elif entry["type"] == 4:
                 self.par_move_dict[entry["moveInfo"]["id"]] = entry
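Note: wikidiff2 reports byte offsets, so the line-number maps above advance by len(text.encode()) rather than len(text); the two differ as soon as a line contains non-ASCII wikitext. For example:

    text = "Δiff line\n"            # wikitext frequently contains non-ASCII characters
    assert len(text) == 10           # characters
    assert len(text.encode()) == 11  # bytes, which is what wikidiff2 offsets count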
@@ -141,34 +188,37 @@ class DiffToOperationMap:
                 linkId = entry["moveInfo"]["linkId"]
                 if linkId in self.par_move_dict:
                     yield from self.doParMove(entry, self.par_move_dict[linkId])
-                    yield self.newline_result()
-                else:
-                    # we need to count the tokens in the from revision so token index is correct
-                    self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
-                    self.n_from_start_tokens += len(
-                        self.tokenize(entry["text"].encode())
-                    )
 
+                # we need to count the tokens in the from revision so token index is correct
+                # self.n_from_end_tokens += len(self.tokenize(entry["text"].encode()))
+                # self.n_from_start_tokens += len(
+                #     self.tokenize(entry["text"].encode())
+                # )
+
             elif entry["type"] == 5:
                 linkId = entry["moveInfo"]["linkId"]
                 if linkId in self.par_move_dict:
                     yield from self.doParMove(self.par_move_dict[linkId], entry)
-                    yield self.newline_result()
                 else:
                     self.par_move_dict[entry["moveInfo"]["id"]] = entry
                     # call doHighlightRange just to update the token indices
-                    offset = {
-                        "from": self.n_from_end_tokens,
-                        "to": entry["offset"]["to"],
-                    }
-                    res = self.doHighlightRange(
-                        entry["text"],
-                        entry["highlightRanges"],
-                        offset,
-                        entry["lineNumber"],
-                        update_idx="to",
-                    )
-                    list(res)
+                    # offset = {
+                    #     "from": self.n_from_end_tokens,
+                    #     "to": entry["offset"]["to"],
+                    # }
+                    # res = self.doHighlightRange(
+                    #     entry["text"],
+                    #     entry["highlightRanges"],
+                    #     offset,
+                    #     entry["lineNumber"],
+                    #     update_idx="to",
+                    # )
+                    # list(res)
+                # self.n_to_end_tokens += len(self.tokenize(entry["text"].encode()))
+                # self.n_to_start_tokens += len(
+                #     self.tokenize(entry["text"].encode())
+                # )
+
             else:
                 # The 'type' isn't one of the known
                 raise ValueError(d)
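Note: wikidiff2 emits a moved paragraph as two linked entries, type 4 on the 'from' side and type 5 on the 'to' side, whose moveInfo ids reference each other; par_move_dict parks whichever half arrives first until its partner shows up. A stripped-down sketch of that pairing (entry payloads and ids are simplified, not the real diff JSON):

    entries = [
        {"type": 4, "moveInfo": {"id": "movedpara_1_lhs", "linkId": "movedpara_1_rhs"}},
        {"type": 5, "moveInfo": {"id": "movedpara_1_rhs", "linkId": "movedpara_1_lhs"}},
    ]

    par_move_dict = {}
    pairs = []
    for entry in entries:
        link_id = entry["moveInfo"]["linkId"]
        if link_id in par_move_dict:
            # the partner was already seen: emit the (from, to) pair
            pairs.append((par_move_dict[link_id], entry))
        else:
            # park this half until its partner arrives
            par_move_dict[entry["moveInfo"]["id"]] = entry

    assert len(pairs) == 1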
@@ -180,99 +230,100 @@ class DiffToOperationMap:
         # strictly increasing, while the "from" segments should merely be
         # non-overlapping.
 
-    def doEqual(self, equal_segment, offset, update_idx="all", type=str):
-        if type is str:
+    def doEqual(self, equal_segment, offset, update_idx="all"):
+        # if from_token_start is None:
+        #     from_token_start = self.n_from_start_tokens
+        # if to_token_start is None:
+        #     to_token_start = self.n_to_start_tokens
+
+        if isinstance(equal_segment, str):
             equal_bytes = equal_segment.encode()
-        elif type is bytes:
+        elif isinstance(equal_segment, bytes):
             equal_bytes = equal_segment
         else:
             raise ValueError(equal_segment)
 
         tokens = self.tokenize(equal_bytes)
         n_tokens = len(tokens)
-        n_from_end_tokens = self.n_from_start_tokens + n_tokens
-        n_to_end_tokens = self.n_to_start_tokens + n_tokens
-        # we need to keep track of the to and from last end bytes
-        self.from_last_end_bytes = offset["from"] + len(equal_bytes)
-        self.to_last_end_bytes = offset["to"] + len(equal_bytes)
+
+        # token_offset = self.get_token_offset(offset)
+
+        # n_from_end_tokens = token_offset['from'] + n_tokens
+        # n_to_end_tokens = token_offset['to'] + n_tokens
+
         yield (
             Equal(
-                self.n_from_start_tokens,
-                n_from_end_tokens,
-                self.n_to_start_tokens,
-                n_to_end_tokens,
+                offset['from'],
+                None,
+                offset['to'],
+                None,
             ),
             tokens,
             tokens,
         )
 
-        if update_idx in ["from", "all"]:
-            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
+        # if update_idx in ["from", "all"]:
+        #     self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
 
-        if update_idx in ["to", "all"]:
-            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
+        # if update_idx in ["to", "all"]:
+        #     self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
 
-        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
-        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
+        # self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
+        # self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
 
-    def doInsert(self, insert_segment, offset, update_idx="all", type=str):
-        if type is str:
+    def doInsert(self, insert_segment, offset, update_idx="all"):
+        if isinstance(insert_segment, str):
             insert_bytes = insert_segment.encode()
-        elif type is bytes:
+        elif isinstance(insert_segment, bytes):
             insert_bytes = insert_segment
         else:
             raise ValueError(insert_segment)
         tokens = self.tokenize(insert_bytes)
-        n_tokens = len(tokens)
-        n_to_end_tokens = self.n_to_start_tokens + n_tokens
-        self.to_last_end_bytes = offset["to"] + len(insert_bytes)
+        # n_tokens = len(tokens)
+        # token_offset = self.get_token_offset(offset)
+        # n_to_end_tokens = token_offset['to'] + n_tokens
         yield (
             Insert(
-                self.n_from_start_tokens,
-                self.n_from_start_tokens,
-                self.n_to_start_tokens,
-                n_to_end_tokens,
+                None,
+                None,
+                offset['to'],
+                None,
             ),
             [],
             tokens,
         )
         # We have now used more of the "to" tokens.
-        if update_idx in ["to", "all"]:
-            self.n_to_end_tokens = self.n_to_start_tokens = n_to_end_tokens
-
-        self.to_byte_token_index_map[offset["to"]] = self.n_to_end_tokens
+        #self.to_byte_token_index_map[offset["to"]] = n_to_end_tokens
 
     def doDelete(self, delete_segment, offset, update_idx="all", type=str):
-        if type is str:
+        if isinstance(delete_segment, str):
             delete_bytes = delete_segment.encode()
-        elif type is bytes:
+        elif isinstance(delete_segment, bytes):
             delete_bytes = delete_segment
         else:
             raise ValueError(delete_segment)
         tokens = self.tokenize(delete_bytes)
-        n_tokens = len(tokens)
+        # n_tokens = len(tokens)
+
+        # token_offset = self.get_token_offset(offset)
+        # n_from_end_tokens = token_offset['from'] + n_tokens
 
-        n_from_end_tokens = self.n_from_start_tokens + n_tokens
-        self.from_last_end_bytes = offset["from"] + len(delete_bytes)
         yield (
             Delete(
-                self.n_from_start_tokens,
-                n_from_end_tokens,
-                self.n_to_start_tokens,
-                self.n_to_start_tokens,
+                offset['from'],
+                None,
+                None,
+                None
             ),
             tokens,
             [],
         )
-        # We have now used more of the "from" tokens.
-        if update_idx in ["from", "all"]:
-            self.n_from_end_tokens = self.n_from_start_tokens = n_from_end_tokens
 
-        self.from_byte_token_index_map[offset["from"]] = self.n_from_end_tokens
+        #self.from_byte_token_index_map[offset["from"]] = n_from_end_tokens
 
     def doHighlightRange(
-        self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"
-    ):
+            self, highlight_text, highlightRanges, offset, lineNumber, update_idx="all"):
+
         # The text field is an overlapping mix of both the from and to,
         # so we need to handle it highlight-by-highlight.
         # there can be gaps between highlight segments.
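Note: the segment helpers now branch on isinstance rather than a type=str keyword (which shadowed the built-in and had to be passed explicitly for bytes), so each one accepts either str or bytes. A minimal sketch of the dispatch (the helper name is illustrative):

    def to_bytes(segment):
        if isinstance(segment, str):
            return segment.encode()
        elif isinstance(segment, bytes):
            return segment
        raise ValueError(segment)

    assert to_bytes("abc") == b"abc"
    assert to_bytes(b"abc") == b"abc"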
@@ -283,11 +334,15 @@ class DiffToOperationMap:
 
         # it's possible for offset['to'] to be null.
         # we can get it from the line number?
-
+        update_linenumber_map = True
         if offset["to"] is None:
-            offset["to"] = self.from_byte_token_index_map[
-                find_greatest_le_key(lineNumber, self.from_byte_token_index_map)
-            ]
+            keyidx = self.to_linenumber_bytes_map.bisect_right(lineNumber) - 1
+            if keyidx > 0:
+                print(self.to_linenumber_bytes_map)
+                key = self.to_linenumber_bytes_map.keys()[keyidx]
+                offset["to"] = self.to_linenumber_bytes_map[key]
+            else:
+                offset["to"] = 0
 
         highlight_offset = offset
         # note that diffs are token-level, but the indexes are byte-level
@@ -299,10 +354,12 @@ class DiffToOperationMap:
                 equal_bytes = highlight_bytes[highlight_end:highlight_start]
                 n_equal_bytes = len(equal_bytes)
                 yield from self.doEqual(
-                    equal_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                    equal_bytes, highlight_offset, update_idx=update_idx
                 )
                 highlight_offset["from"] += n_equal_bytes
                 highlight_offset["to"] += n_equal_bytes
+                if update_linenumber_map:
+                    self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
 
             # handle highlighted insert / delete
             highlight_end = highlight_start + highlightRange["length"]
@@ -311,12 +368,14 @@ class DiffToOperationMap:
 
             if highlightRange["type"] == 0:
                 yield from self.doInsert(
-                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                    range_bytes, highlight_offset, update_idx=update_idx
                 )
                 highlight_offset["to"] += n_range_bytes
+                if update_linenumber_map:
+                    self.to_linenumber_bytes_map[lineNumber] = highlight_offset['to']
             elif highlightRange["type"] == 1:
                 yield from self.doDelete(
-                    range_bytes, highlight_offset, update_idx=update_idx, type=bytes
+                    range_bytes, highlight_offset, update_idx=update_idx
                 )
                 highlight_offset["from"] += n_range_bytes
             else:
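Note: within a changed (type 3) line, highlightRanges mark the edited spans by byte start and length, with range type 0 meaning inserted text and type 1 meaning deleted text; everything between ranges is equal. A toy walk over such a line (the payload is simplified to just the fields used here):

    line = "the quick brown fox".encode()
    highlight_ranges = [{"start": 4, "length": 6, "type": 0}]  # "quick " marked as inserted

    segments = []
    cursor = 0
    for hr in highlight_ranges:
        if hr["start"] > cursor:
            segments.append(("equal", line[cursor:hr["start"]]))  # gap before the range is equal text
        end = hr["start"] + hr["length"]
        segments.append(("insert" if hr["type"] == 0 else "delete", line[hr["start"]:end]))
        cursor = end
    if cursor < len(line):
        segments.append(("equal", line[cursor:]))                 # trailing equal text

    assert segments == [("equal", b"the "), ("insert", b"quick "), ("equal", b"brown fox")]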
@@ -325,46 +384,14 @@ class DiffToOperationMap:
         # handle the rest of the line which is equal
         if highlight_end < len(highlight_bytes):
             range_bytes = highlight_bytes[highlight_end:]
-            yield from self.doEqual(range_bytes, highlight_offset, type=bytes)
+            yield from self.doEqual(range_bytes, highlight_offset)
 
     def doParMove(self, from_diff, to_diff):
         # the tricky part here is to put the tokens in the right spots.
         from_byte_start = from_diff["offset"]["from"]
-        # as of python 3.7 dictionaries are in insertion order. So
-        # we can just find the first key that's greater
-
-        # since the paragraph is removed in the "from" version, the index it is removed from
-        # will be *after* the
-        if len(self.from_byte_token_index_map) > 0:
-            if (
-                from_byte_start >= self.from_last_end_bytes
-            ):  # if the from paragraph is at the end
-                from_token_start = next(
-                    reversed(self.from_byte_token_index_map.values())
-                )
-            else:
-                key = find_greatest_le_key(
-                    from_byte_start, self.from_byte_token_index_map
-                )
-                from_token_start = self.from_byte_token_index_map[key]
-        else:
-            from_token_start = 0
-
-        if len(self.to_byte_token_index_map) > 0:
-            # get the to token index
-            to_byte_start = to_diff["offset"]["to"]
-            if to_byte_start >= self.to_last_end_bytes:
-                to_token_start = next(reversed(self.to_byte_token_index_map.values()))
-            else:
-                key = find_greatest_le_key(to_byte_start, self.to_byte_token_index_map)
-                to_token_start = self.to_byte_token_index_map[key]
-        else:
-            to_token_start = 0
-
-        # now we set the state and apply the highlights
-        self.n_from_start_tokens = self.n_from_end_tokens = from_token_start
-        self.n_to_start_tokens = self.n_to_end_tokens = to_token_start
+        to_byte_start = to_diff["offset"]["to"]
         offset = {"from": from_byte_start, "to": to_byte_start}
+        # we need to cache the indexes; replace them; then restore
         yield from self.doHighlightRange(
             to_diff["text"], to_diff["highlightRanges"], offset, to_diff["lineNumber"]
         )
@@ -397,35 +424,63 @@ class WikiDiffMatcher:
             diff = next(self.diffs)
             diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)
 
-            diffops = list(zip(*diffToOperationsMapper.to_operations()))
+            diffops = list(diffToOperationsMapper.to_operations())
 
-            if not diffops:
-                self.last_tokens = []
-                return [], [], []
+            # this happens when revisions are actually equal.
+            if len(diffops) == 0:
+                self.last_tokens = self.tokenizer.tokenize(text)
+                ops = [Equal(0, len(self.last_tokens),
+                             0, len(self.last_tokens))]
+                return ops, self.last_tokens, self.last_tokens
 
-            diffops = (
-                operations,
-                aseq,
-                bseq,
-            ) = diffops
+            # we get back the byte indices; now we transform to token indices
 
-            aseq = list(aseq)
+            diffops.sort(key = lambda t: (t[0].a1 if t[0].a1 is not None else 1e32, t[0].b1))
+            aorder_ops = []
+            token_offset = 0
+            _, aseq, _ = list(zip( * diffops))
 
-            # aseq/bseq can be out of order, we need to sort it by a1/b1 index.
-            indices = list(range(len(aseq)))
-            indices.sort(key=lambda i: operations[i].a1)
-            aseq = [aseq[i] for i in indices]
+            for op, tokens, _ in diffops:
+                a1 = token_offset
+                if isinstance(op, Equal) or isinstance(op, Delete):
+                    token_offset += len(tokens)
+                    a2 = token_offset
+                    aorder_ops.append(type(op)(a1,
+                                               a2,
+                                               op.b1,
+                                               op.b1))
+                else:
+                    aorder_ops.append(Insert(a1,
+                                             a1,
+                                             op.b1,
+                                             op.b1))
 
-            bseq = list(bseq)
-            indices = list(range(len(bseq)))
-            indices.sort(key=lambda i: operations[i].b1)
-            bseq = [bseq[i] for i in indices]
+            _, aseq, bseq = zip(* diffops)
+            diffops = list(zip(aorder_ops, aseq, bseq))
+            diffops.sort(key = lambda t: (t[0].b1 if t[0].b1 is not None else 1e32, t[0].a1))
+            _, _, bseq = list(zip(* diffops))
+            border_ops = []
+            token_offset = 0
+            for op, _, tokens in diffops:
+                b1 = token_offset
+                if isinstance(op, Equal) or isinstance(op, Insert):
+                    token_offset += len(tokens)
+                    b2 = token_offset
+                    border_ops.append(type(op)(op.a1,
+                                               op.a2,
+                                               b1,
+                                               b2))
+                else:
+                    border_ops.append(type(op)(op.a1,
+                                               op.a2,
+                                               b1,
+                                               b1))
+
+            self.previous_text = text
 
             self.last_tokens = list(chain.from_iterable(aseq))
             tokens = list(chain.from_iterable(bseq))
-            self.previous_text = text
-
-            return operations, self.last_tokens, tokens
+            return border_ops, self.last_tokens, tokens
 
     def processor(self, *args, **kwargs):
         return self.Processor(self.diffs, self.tokenizer)
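Note: process() now reindexes operations from byte offsets to token offsets in two passes: sort by the 'from' side and assign a1/a2 from a running token count, then sort by the 'to' side and assign b1/b2 the same way, so Equal spans advance both counters while Insert and Delete advance only one. A stripped-down sketch of the idea (toy tokens, already in order, so the sorts are omitted):

    from deltas import Delete, Equal, Insert

    # (operation, 'from'-side tokens, 'to'-side tokens); index fields start as byte offsets or None
    diffops = [
        (Equal(0, None, 0, None), ["a", "b"], ["a", "b"]),
        (Delete(10, None, None, None), ["c"], []),
        (Insert(None, None, 12, None), [], ["d", "e"]),
    ]

    # pass 1: walk in 'from' order, assigning token-level a1/a2
    offset = 0
    reindexed = []
    for op, a_tokens, b_tokens in diffops:
        a1 = offset
        offset += len(a_tokens)              # Inserts contribute no 'from' tokens
        reindexed.append((type(op)(a1, offset, op.b1, op.b2), a_tokens, b_tokens))

    # pass 2: walk in 'to' order, assigning token-level b1/b2
    offset = 0
    final_ops = []
    for op, a_tokens, b_tokens in reindexed:
        b1 = offset
        offset += len(b_tokens)              # Deletes contribute no 'to' tokens
        final_ops.append(type(op)(op.a1, op.a2, b1, offset))

    assert [(o.a1, o.a2, o.b1, o.b2) for o in final_ops] == [(0, 2, 0, 2), (2, 3, 2, 2), (3, 3, 2, 4)]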