got wikidiff2 persistence working except for paragraph moves.
This commit is contained in:
parent 186cb82fb8
commit 5a3e4102b5

.gitmodules (vendored, 3 changes)
							| @ -0,0 +1,3 @@ | |||||||
|  | [submodule "mediawiki-php-wikidiff2"] | ||||||
|  | 	path = mediawiki-php-wikidiff2 | ||||||
|  | 	url = https://github.com/wikimedia/mediawiki-php-wikidiff2/ | ||||||
| @ -20,6 +20,13 @@ associated tests to work. | |||||||
| - 7zip | - 7zip | ||||||
| - ffmpeg | - ffmpeg | ||||||
| 
 | 
 | ||||||
|  | A new diff engine based on `wikidiff2`_ can be used for word persistence. Wikiq can also output the diffs between each page revision. This requires installing wikidiff2 on your system. On Debian or Ubuntu Linux this can be done via: | ||||||
|  | 
 | ||||||
|  | ``apt-get install php-wikidiff2`` | ||||||
|  | 
 | ||||||
|  | You may also have to run: | ||||||
|  | ``sudo phpenmod wikidiff2`` | ||||||
|  | 
 | ||||||
| Tests | Tests | ||||||
| ---- | ---- | ||||||
| To run tests:: | To run tests:: | ||||||
| @ -30,3 +37,5 @@ TODO: | |||||||
| _______________ | _______________ | ||||||
| 1. [] Output metadata about the run. What parameters were used? What versions of deltas? | 1. [] Output metadata about the run. What parameters were used? What versions of deltas? | ||||||
| 2. [] Url encoding by default | 2. [] Url encoding by default | ||||||
|  | 
 | ||||||
|  | .. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2 | ||||||
|  | |||||||
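A quick way to confirm the diff helper is reachable is to post a couple of toy revisions to it from Python. This is a minimal sketch: the URL is the default ``http://127.0.0.1:8000`` used elsewhere in this commit, and it assumes the PHP helper accepts a JSON array of revision texts in the request body, which is how ``compute_diffs`` talks to it::

    # Minimal connectivity check for the wikidiff2 helper service.
    # URL and request shape are assumptions based on the code in this commit.
    import json

    import requests

    revisions = ["foo bar", "foo baz bar"]
    resp = requests.post("http://127.0.0.1:8000", json=revisions)
    resp.raise_for_status()
    for diff in resp.json():
        # each element is a wikidiff2 inline JSON diff against the previous revision
        print(json.loads(diff)["diff"][:3])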
| @ -3,7 +3,7 @@ name = "mediawiki-dump-tools" | |||||||
| version = "0.1.0" | version = "0.1.0" | ||||||
| description = "Add your description here" | description = "Add your description here" | ||||||
| readme = "README.md" | readme = "README.md" | ||||||
| requires-python = "~=3.9" | requires-python = ">=3.9" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|     "deltas>=0.7.0", |     "deltas>=0.7.0", | ||||||
|     "mediawiki-utilities>=0.4.18", |     "mediawiki-utilities>=0.4.18", | ||||||
| @ -18,8 +18,11 @@ dependencies = [ | |||||||
| [tool.uv.sources] | [tool.uv.sources] | ||||||
| yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | ||||||
| mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | ||||||
|  | deltas = { git = "https://github.com/groceryheist/deltas" } | ||||||
| 
 | 
 | ||||||
| [dependency-groups] | [dependency-groups] | ||||||
| dev = [ | dev = [ | ||||||
|     "pandas>=2.1.0" |     "pandas>=2.1.0", | ||||||
|  |     "pytest>=8.4.1", | ||||||
|  |     "pytest-asyncio>=1.0.0", | ||||||
| ] | ] | ||||||
|  | |||||||
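With ``pytest`` now in the dev group, the diff-to-operations mapping can be exercised without the PHP service by feeding a hand-written diff to the ``DiffToOperationMap`` class added below. A sketch, where the module name ``wikidiff_matcher`` is only a placeholder for wherever that class actually lives::

    import json

    from deltas import Insert, tokenizers
    from wikidiff_matcher import DiffToOperationMap  # placeholder module name


    def test_insert_only_diff():
        # a single type-1 entry: a line present only in the 'to' revision
        diff = json.dumps({"diff": [
            {"type": 1, "text": "new line", "offset": {"from": 0, "to": 0}},
        ]})
        mapper = DiffToOperationMap("", "new line", diff, tokenizers.wikitext_split)
        ops = [op for op, _, _ in mapper.to_operations()]
        assert isinstance(ops[0], Insert)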
| @ -1,8 +1,11 @@ | |||||||
| import json | import json | ||||||
| import sys | import sys | ||||||
|  | from itertools import chain | ||||||
|  | from typing import Generator, List, Optional, Tuple | ||||||
| 
 | 
 | ||||||
| import requests | import requests | ||||||
| from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete | from deltas import (Delete, DiffEngine, Equal, Insert, Operation, | ||||||
|  |                     RegexTokenizer, Token, tokenizers) | ||||||
| 
 | 
 | ||||||
| TOKENIZER = tokenizers.wikitext_split | TOKENIZER = tokenizers.wikitext_split | ||||||
| 
 | 
 | ||||||
| @ -15,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list: | |||||||
|         incremental_diffs = response.json() |         incremental_diffs = response.json() | ||||||
|     except requests.exceptions.ConnectionError as e: |     except requests.exceptions.ConnectionError as e: | ||||||
|         print( |         print( | ||||||
|             f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.") |             f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running." | ||||||
|  |         ) | ||||||
|         print(e) |         print(e) | ||||||
|         raise e |         raise e | ||||||
|     except requests.exceptions.HTTPError as e: |     except requests.exceptions.HTTPError as e: | ||||||
| @ -34,193 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list: | |||||||
|         print(f"An unexpected error occurred: {e}") |         print(f"An unexpected error occurred: {e}") | ||||||
|         raise e |         raise e | ||||||
| 
 | 
 | ||||||
|  |     # for diff in incremental_diffs: | ||||||
|  |     #     for wikidiffop in json.loads(diff)["diff"][0:5]: | ||||||
|  |     #         print(wikidiffop) | ||||||
|  | 
 | ||||||
|     return incremental_diffs |     return incremental_diffs | ||||||
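For orientation, each element returned by the helper is a wikidiff2 inline JSON diff. The field names below mirror what the parser in this file reads (``type``, ``text``, ``offset``, ``highlightRanges``, ``moveInfo``); the concrete values are made up for illustration::

    # Illustrative shape of one parsed wikidiff2 inline JSON diff (made-up values).
    example_diff = {
        "diff": [
            # type 0: a context line present in both revisions; only emitted when
            # the helper is called with a non-zero number of context lines
            {"type": 0, "text": "Lorem ipsum.", "offset": {"from": 0, "to": 0}},
            # type 1: line only in the 'to' revision; type 2: only in the 'from'
            {"type": 1, "text": "A new line.", "offset": {"from": 13, "to": 13}},
            # type 3: changed line; highlightRanges mark byte spans within text
            # that were inserted (type 0) or deleted (type 1)
            {
                "type": 3,
                "text": "Lorem ipsum dolor.",
                "offset": {"from": 13, "to": 26},
                "highlightRanges": [{"start": 12, "length": 6, "type": 0}],
            },
            # types 4 and 5 are the two halves of a paragraph move, linked via
            # moveInfo["id"] / moveInfo["linkId"] (not yet handled by this commit)
        ]
    }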
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def to_operations(from_text:str, to_text:str, diff:str, tokenizer: RegexTokenizer) -> list: | class DiffToOperationMap: | ||||||
|     d = json.loads(diff) |  | ||||||
| 
 | 
 | ||||||
|     # the code below is designed to work in bytes because that's how wikidiff2 indexes |     def __init__(self, from_text, to_text, diff, tokenizer): | ||||||
|     from_text = from_text.encode('utf-8') |  | ||||||
|     to_text = to_text.encode('utf-8') |  | ||||||
| 
 | 
 | ||||||
|     # convenient function for tokenizing bytes |         self.tokenizer = tokenizer | ||||||
|     def tokenize(bytes): |         # parse the wikidiff2 JSON payload once | ||||||
|         return tokenizer.tokenize(bytes.decode('utf-8')) |         self.diff = json.loads(diff) | ||||||
| 
 | 
 | ||||||
|     # Keep track of the last difference we saw in order to notice unaccounted-for |         # the code below is designed to work in bytes because that's how wikidiff2 indexes | ||||||
|     # tokens. Each token at the end of "to" which is skipped for the next diff |         self.from_bytes = from_text.encode("utf-8") | ||||||
|     # must be represented as an "Equal()" segment. |         self.to_bytes = to_text.encode("utf-8") | ||||||
|     from_last_end_bytes = 0 |  | ||||||
|     to_last_end_bytes = 0 |  | ||||||
| 
 | 
 | ||||||
|     result = [] |  | ||||||
|     # DiffState expects differences to be represented in order from the |  | ||||||
|     # result's perspective ("to"), not the previous text. Thus, if a line |  | ||||||
|     # is moved earlier then its insertion should appear before its deletion. |  | ||||||
|     # As a rule of thumb, the "to" segments should be non-overlapping and |  | ||||||
|     # strictly increasing, while the "from" segments should merely be |  | ||||||
|     # non-overlapping. |  | ||||||
|      |  | ||||||
|     # wikidiff2 appears to follow this same convention, but this behavior |  | ||||||
|     # is not documented. |  | ||||||
| 
 | 
 | ||||||
|     # Note that, confusingly for Insert operations only the "to" indexes matter |         self.from_last_end_bytes = 0 | ||||||
|     # and for the Delete and Equal operations only the "from" indexes matter. |         self.from_last_to_bytes = 0 | ||||||
|     # This is clear from reading state.py in `mwpersistence` and operations.py in `deltas` |         self.n_from_start_tokens = 0 | ||||||
|  |         self.n_to_start_tokens = 0 | ||||||
|  |         self.last_to_start_line = 0 | ||||||
|  |         self.last_from_start_line = 0 | ||||||
|  |         self.from_last_end_bytes = 0 | ||||||
|  |         self.to_last_end_bytes = 0 | ||||||
|  |          | ||||||
|  |     def tokenize(self, bytes): | ||||||
|  |         return self.tokenizer.tokenize(bytes.decode("utf-8")) | ||||||
| 
 | 
 | ||||||
|     parmove_from_dict = {} # lookup move diffs based on moveinfo id. |     def to_operations(self): | ||||||
|     parmove_to_dict = {}  |         parmove_from_dict = {}  # lookup move diffs based on moveinfo id. | ||||||
|      |         parmove_to_dict = {} | ||||||
|     for entry in d['diff']: |         for entry in self.diff["diff"]: | ||||||
|         linebytes = entry['text'].encode('utf-8') |             offset = entry['offset'] | ||||||
|         from_start_line = entry['offset']['from'] # this is the first byte of the line in the 'from' revision. |             linebytes = entry["text"].encode("utf-8") | ||||||
|         to_start_line = entry['offset']['to'] # this is the first byte of the line in the 'to' revision. |  | ||||||
| 
 | 
 | ||||||
|         from_start_tokens = len(tokenize(from_text[:from_start_line])) |             # ignore empty diffs. They don't have any tokens | ||||||
|         to_start_tokens = len(tokenize(to_text[:to_start_line])) |             if len(linebytes) == 0: | ||||||
|         # These constant calls to tokenizer.tokenize can definitely be optimized |                 continue | ||||||
|         # as tokenization is currently a bottleneck. Ideally tokenization would |             # this is the first byte of the line in the 'from' revision. | ||||||
|         # happen incrementally where possible, or somehow be cached, but this |             from_start_line = entry["offset"]["from"] | ||||||
|         # would be more complex. N: I think it's okay. CPU is cheap. |             # this is the first byte of the line in the 'to' revision. | ||||||
|  |             to_start_line = entry["offset"]["to"] | ||||||
| 
 | 
 | ||||||
|         if entry['type'] == 0: # wikidiff2 can emit this if it is called with numContextLines != 0. |             if entry["type"] == 0: | ||||||
| 
 |                 yield from self.doEqual(linebytes, offset) | ||||||
|             line_tokens = len(tokenize(linebytes)) |  | ||||||
|             from_end_tokens = from_start_tokens + line_tokens |  | ||||||
|             to_end_tokens = to_start_tokens + line_tokens |  | ||||||
| 
 |  | ||||||
|             result.append(Equal(from_start_tokens, from_end_tokens, |  | ||||||
|                                 to_start_tokens, to_end_tokens)) |  | ||||||
| 
 |  | ||||||
|             # we need to keep track of the to and from last end bytes |  | ||||||
|             from_last_end_bytes += len(linebytes) |  | ||||||
|             to_last_end_bytes  += len(linebytes) |  | ||||||
| 
 |  | ||||||
|             continue |  | ||||||
|         else: |  | ||||||
|             # These do not appear to be generated by wikidiff2, and so must be |  | ||||||
|             # inferred. |  | ||||||
|             equal_tokens = to_start_tokens - to_last_end_bytes |  | ||||||
|             # If we notice that the next non-zero segment (which must be a |  | ||||||
|             # change, given that its type is non-zero), begins after the end |  | ||||||
|             # of the previous segment, we must add an Equal segment. |  | ||||||
|             # TODO: While the "to" token ranges are correct, |  | ||||||
|             # the "from" |  | ||||||
|             #  ranges are likely not, particularly in histories with paragraph |  | ||||||
|             #  moves. they can be corrected. |  | ||||||
|             if equal_tokens > 0: |  | ||||||
|                 # only the 'from' indexes matter |  | ||||||
|                 result.append(Equal(from_last_end_bytes, from_start_line, |  | ||||||
|                                     to_last_end_bytes, to_start_line)) |  | ||||||
| 
 |  | ||||||
|         if entry['type'] == 1: # a line included in the 'to' revision, but not in the 'from' revision |  | ||||||
|             line_tokens = len(tokenize(linebytes)) |  | ||||||
|             to_end_tokens = to_start_tokens + line_tokens |  | ||||||
| 
 |  | ||||||
|             result.append(Insert(from_start_tokens, from_start_tokens, |  | ||||||
|                                  to_start_tokens, to_end_tokens, |  | ||||||
|                                  )) |  | ||||||
| 
 |  | ||||||
|             # We have now used more of the "to" tokens. |  | ||||||
|             to_start_end = to_end_tokens |  | ||||||
|              |              | ||||||
|         elif entry['type'] == 2: # a line included in the 'from' revision, but not in the 'to' revision |                 # a line included in the 'to' revision, but not in the 'from' revision | ||||||
|             line_tokens = len(tokenize(linebytes)) |             elif entry["type"] == 1: | ||||||
|             from_end_tokens = from_start_tokens + line_tokens |                 yield from self.doInsert(linebytes, offset) | ||||||
| 
 | 
 | ||||||
|             result.append(Delete(from_start_tokens, from_end_tokens, |                 # a line included in the 'from' revision, but not in the 'to' revision | ||||||
|                                  to_start_tokens, to_start_tokens, |             elif entry["type"] == 2: | ||||||
|                                  )) |                 yield from self.doDelete(linebytes, offset) | ||||||
| 
 |      | ||||||
|             # We have now used more of the "from" tokens. |             elif entry["type"] == 3: | ||||||
|             from_last_end_bytes = from_end_tokens |                 yield from self.doHighlightRange(linebytes, entry['highlightRanges'], offset) | ||||||
|              |              | ||||||
|         elif entry['type'] == 3: |             elif entry["type"] == 4: | ||||||
|             # The text field is an overlapping mix of both the from and to, |                 parmove_from_dict[entry["moveInfo"]["id"]] = entry | ||||||
|             # so we need to handle it highlight-by-highlight. |  | ||||||
|             # there can be gaps between highlight segments. |  | ||||||
|             # for instance, if a word is deleted from the middle of a line. |  | ||||||
|             # we need to track that.  |  | ||||||
|             highlight_last_end = 0 |  | ||||||
| 
 | 
 | ||||||
|             # note that diffs are token-level, but the indexes are byte-level |             elif entry["type"] == 5: | ||||||
|             for highlightRange in entry['highlightRanges']: |                 # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff. | ||||||
|                 if highlightRange['start'] > highlight_last_end: |                 parmove_to_dict[entry["moveInfo"]["id"]] = entry | ||||||
|  |             # for deletions and equality report the token indexes from the 'from' revision. | ||||||
| 
 | 
 | ||||||
|                     equal_bytes = linebytes[highlight_last_end:highlightRange['start']] |             else: | ||||||
|                     equal_tokens = len(tokenize(equal_bytes)) |                 # The 'type' isn't one of the known wikidiff2 entry types. | ||||||
|                     from_end_tokens = from_start_tokens + equal_tokens |                 raise ValueError(entry) | ||||||
|                     to_end_tokens = to_end_tokens + equal_tokens |  | ||||||
|                     result.append(Equal(from_start_tokens, from_end_tokens, |  | ||||||
|                                         to_start_tokens, to_end_tokens |  | ||||||
|                                         )) |  | ||||||
| 
 | 
 | ||||||
|                     from_start_tokens = from_end_tokens |          | ||||||
|                     to_start_tokens = to_end_tokens |         # mwpersistence expects differences to be represented in order from the | ||||||
|                      |         # result's perspective ("to"), not the previous text. Thus, if a line | ||||||
|                 rangeStart = highlightRange['start'] |         # is moved earlier then its insertion should appear before its deletion. | ||||||
|                 rangeEnd = rangeStart + highlightRange['length'] |         # As a rule of thumb, the "to" segments should be non-overlapping and | ||||||
|                 range_bytes = linebytes[rangeStart:rangeEnd] |         # strictly increasing, while the "from" segments should merely be | ||||||
|                 range_tokens = len(tokenize(range_bytes)) |         # non-overlapping. | ||||||
|                 if highlightRange['type'] == 0: |  | ||||||
|                     # Insertion |  | ||||||
|                     to_end_tokens = to_start_tokens + range_tokens |  | ||||||
|                     result.append(Insert(from_start_tokens, from_end_tokens, |  | ||||||
|                                          to_start_tokens, to_end_tokens)) |  | ||||||
| 
 | 
 | ||||||
|                     to_start_tokens = to_end_tokens |         # now we go through the parmoves | ||||||
|                 elif highlightRange['type'] == 1: |         for move_id, from_diff in parmove_from_dict.items(): | ||||||
|                     # Deletion |             to_diff = parmove_to_dict[from_diff["moveInfo"]["linkId"]] | ||||||
|                     from_end_tokens = from_start_tokens + range_tokens |             # TODO: emit operations for the moved paragraphs; not yet working. | ||||||
|                     result.append(Delete(from_start_tokens, from_end_tokens, |  | ||||||
|                                          to_start_tokens, to_end_tokens)) |  | ||||||
|                     from_start_tokens = from_end_tokens |  | ||||||
|                                           |  | ||||||
|                 else: |  | ||||||
|                     raise Exception(entry) |  | ||||||
|                  |  | ||||||
|                 highlight_last_end = highlightRange['start'] + highlightRange['length'] |  | ||||||
| 
 | 
 | ||||||
|         elif entry['type'] == 4: |     def doEqual(self, equal_bytes, offset): | ||||||
|  |         tokens = self.tokenize(equal_bytes) | ||||||
|  |         n_tokens = len(tokens) | ||||||
|  |         self.n_from_end_tokens = self.n_from_start_tokens + n_tokens | ||||||
|  |         self.n_to_end_tokens = self.n_to_start_tokens + n_tokens | ||||||
|  |         yield ( | ||||||
|  |             Equal( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_end_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_end_tokens, | ||||||
|  |             ), | ||||||
|  |             tokens, | ||||||
|  |             tokens, | ||||||
|  |         ) | ||||||
|  |         # we need to keep track of the to and from last end bytes | ||||||
|  |         self.from_last_end_bytes = offset["from"] + len(equal_bytes) | ||||||
|  |         self.to_last_end_bytes = offset["to"] + len(equal_bytes) | ||||||
|  |         self.n_from_start_tokens += n_tokens | ||||||
|  |         self.n_to_start_tokens += n_tokens | ||||||
| 
 | 
 | ||||||
|             parmove_from_dict['moveInfo']['id'] = diff |  | ||||||
| 
 | 
 | ||||||
|         elif entry['type'] == 5: |     def doInsert(self, insert_bytes, offset): | ||||||
|  |         tokens = self.tokenize(insert_bytes) | ||||||
|  |         n_tokens = len(tokens) | ||||||
|  |         self.n_to_end_tokens = self.n_to_start_tokens + n_tokens | ||||||
|  |         yield ( | ||||||
|  |             Insert( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_end_tokens, | ||||||
|  |             ), | ||||||
|  |             [], | ||||||
|  |             tokens, | ||||||
|  |         ) | ||||||
|  |         # We have now used more of the "to" tokens. | ||||||
|  |         self.n_to_start_tokens += n_tokens | ||||||
|  |         self.to_last_end_bytes = offset["to"] + len(insert_bytes) | ||||||
| 
 | 
 | ||||||
|             parmove_to_dict['moveInfo']['id'] = diff |     def doDelete(self, delete_bytes, offset): | ||||||
|             # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff. |         tokens = self.tokenize(delete_bytes) | ||||||
|             # for deletions and equality report the token indexes from the 'from' revision.  |         n_tokens = len(tokens) | ||||||
|         else: |         self.n_from_end_tokens = self.n_from_start_tokens + n_tokens | ||||||
|             # The 'type' isn't one of the known |         yield ( | ||||||
|             raise ValueError(d) |             Delete( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_end_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |             ), | ||||||
|  |             tokens, | ||||||
|  |             [], | ||||||
|  |         ) | ||||||
|  |         # We have now used more of the "from" tokens. | ||||||
|  |         self.n_from_start_tokens += n_tokens | ||||||
|  |         self.from_last_end_bytes = offset["from"] + len(delete_bytes) | ||||||
| 
 | 
 | ||||||
|     # now we go through the parmoves |     def doHighlightRange(self, highlight_bytes, highlightRanges, offset): | ||||||
|     for id, from_diff in parmove_from_dict.items(): |         # The text field is an overlapping mix of both the from and to, | ||||||
|         to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']] |         # so we need to handle it highlight-by-highlight. | ||||||
|         ### TODO calculate the correct token indexes. |         # there can be gaps between highlight segments. | ||||||
|  |         # for instance, if a word is deleted from the middle of a line. | ||||||
|  |         # we need to track that. | ||||||
|  |         highlight_end = 0 | ||||||
|  |         highlight_offset = offset | ||||||
|  |         # note that diffs are token-level, but the indexes are byte-level | ||||||
| 
 | 
 | ||||||
|     # TODO: Handle trailing tokens |         for highlightRange in highlightRanges: | ||||||
|  |             highlight_start = highlightRange["start"] | ||||||
|  |             # equal bytes in between highlights | ||||||
|  |             if highlight_start > highlight_end: | ||||||
|  | 
 | ||||||
|  |                 equal_bytes = highlight_bytes[ | ||||||
|  |                     highlight_end : highlight_start | ||||||
|  |                 ] | ||||||
|  |                 n_equal_bytes = len(equal_bytes) | ||||||
|  |                 yield from self.doEqual(equal_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['from'] += n_equal_bytes | ||||||
|  |                 highlight_offset['to'] += n_equal_bytes | ||||||
|  | 
 | ||||||
|  |             # handle highlighted insert / delete | ||||||
|  |             highlight_end = highlight_start + highlightRange["length"] | ||||||
|  |             range_bytes = highlight_bytes[highlight_start:highlight_end] | ||||||
|  |             n_range_bytes = len(range_bytes) | ||||||
|  |             if highlightRange["type"] == 0: | ||||||
|  |                 yield from self.doInsert(range_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['to'] += n_range_bytes | ||||||
|  |             elif highlightRange["type"] == 1: | ||||||
|  |                 yield from self.doDelete(range_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['from'] += n_range_bytes | ||||||
|  |             else: | ||||||
|  |                 raise Exception(highlightRange) | ||||||
|  | 
 | ||||||
|  |         # handle the rest of the line which is equal | ||||||
|  |         if highlight_end < len(highlight_bytes): | ||||||
|  |             range_bytes = highlight_bytes[highlight_end:] | ||||||
|  |             yield from self.doEqual(range_bytes, highlight_offset) | ||||||
| 
 | 
 | ||||||
|     # raise Exception(result) |  | ||||||
|     return result |  | ||||||
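To see how a single type-3 entry is decomposed, the sketch below runs one hand-written change line through ``DiffToOperationMap``. The module name ``wikidiff_matcher`` is a placeholder, and the entry is hand-constructed rather than produced by wikidiff2::

    import json

    from deltas import tokenizers
    from wikidiff_matcher import DiffToOperationMap  # placeholder module name

    from_text = "foo baz"
    to_text = "foo bar baz"
    # one changed line: "bar " (bytes 4..8 of the 'to' text) was inserted
    diff = json.dumps({"diff": [{
        "type": 3,
        "text": to_text,
        "offset": {"from": 0, "to": 0},
        "highlightRanges": [{"start": 4, "length": 4, "type": 0}],
    }]})

    mapper = DiffToOperationMap(from_text, to_text, diff, tokenizers.wikitext_split)
    for op, a_tokens, b_tokens in mapper.to_operations():
        # expected: Equal("foo "), Insert("bar "), Equal("baz")
        print(type(op).__name__, a_tokens, b_tokens)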
| 
 | 
 | ||||||
| class WikiDiffMatcher: | class WikiDiffMatcher: | ||||||
|     def __init__(self, |     def __init__( | ||||||
|                  url: str, |         self, | ||||||
|                  texts: list[str], |         texts: list[str] = None, | ||||||
|                  tokenizer: RegexTokenizer = None, |         tokenizer: Optional[RegexTokenizer] = None, | ||||||
|                  ): |         url: Optional[str] = "http://127.0.0.1:8000", | ||||||
|  |     ): | ||||||
|         # Pre-compute diffs to reduce traffic overhead. |         # Pre-compute diffs to reduce traffic overhead. | ||||||
|         self.diffs = compute_diffs(url, texts) |         self.diffs = compute_diffs(url, texts) | ||||||
|         self.tokenizer = tokenizer or TOKENIZER |         self.tokenizer = tokenizer or TOKENIZER | ||||||
| 
 | 
 | ||||||
|     class Processor(DiffEngine.Processor): |     class Processor(DiffEngine.Processor): | ||||||
|         def __init__(self, |         def __init__(self, diffs, tokenizer=None): | ||||||
|                      diffs, |             self.diffs = iter(diffs) | ||||||
|                      tokenizer=None |  | ||||||
|                      ): |  | ||||||
|             self.diffs = iter(diffs) |  | ||||||
|             self.tokenizer = tokenizer or TOKENIZER |             self.tokenizer = tokenizer or TOKENIZER | ||||||
|             self.last_tokens = [] |             self.last_tokens = [] | ||||||
|             self.previous_text = "" |             self.previous_text = "" | ||||||
| @ -229,28 +246,27 @@ class WikiDiffMatcher: | |||||||
|             self.last_tokens = last_tokens |             self.last_tokens = last_tokens | ||||||
| 
 | 
 | ||||||
|         def process(self, text, token_class=None): |         def process(self, text, token_class=None): | ||||||
|             # IDEs will report the method signature as incorrect, but this is |  | ||||||
|             # expected. The DiffEngine.Processor class must be inherited from, |  | ||||||
|             # and its process definition incorrectly excludes a "self" argument. |  | ||||||
| 
 |  | ||||||
|             # The diff has already been computed, but we need to incrementally |             # The diff has already been computed, but we need to incrementally | ||||||
|             # retrieve it to recreate the behavior DiffState expects. |             # retrieve it to recreate the behavior DiffState expects. | ||||||
|             diff = next(self.diffs) |             diff = next(self.diffs) | ||||||
|  |             diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer) | ||||||
|  |             ( | ||||||
|  |                 operations, | ||||||
|  |                 aseq, | ||||||
|  |                 bseq, | ||||||
|  |             ) = list( | ||||||
|  |                 zip(*diffToOperationsMapper.to_operations()) | ||||||
|  |             ) | ||||||
| 
 | 
 | ||||||
|             tokens = self.tokenizer.tokenize(text, token_class=token_class) |             self.last_tokens = list(chain.from_iterable(aseq)) | ||||||
|             operations = to_operations(self.previous_text, text, diff, self.tokenizer) |             tokens = list(chain.from_iterable(bseq)) | ||||||
| 
 |  | ||||||
|             a = self.last_tokens |  | ||||||
|             b = tokens |  | ||||||
|             self.last_tokens = tokens |  | ||||||
|             self.previous_text = text |             self.previous_text = text | ||||||
| 
 | 
 | ||||||
|             return operations, a, b |             return operations, self.last_tokens, tokens | ||||||
| 
 | 
 | ||||||
|     def processor(self, *args, **kwargs): |     def processor(self, *args, **kwargs): | ||||||
|         return self.Processor(self.diffs, self.tokenizer) |         return self.Processor(self.diffs, self.tokenizer) | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|     def process(self): |     def process(self): | ||||||
|         # DiffState checks for this method even though it is not called. |         # DiffState checks for this method even though it is not called. | ||||||
|         raise Exception("Unnecessary implementation") |         raise Exception("Unnecessary implementation") | ||||||
|  | |||||||
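Downstream, wikiq wires the matcher into ``mwpersistence.DiffState`` (see the wikiq hunk further down). Below is a stand-alone sketch of that wiring; it assumes the class is importable as ``wikidiff_matcher.WikiDiffMatcher``, that the helper service is running locally, and that ``DiffState.update()`` returns ``(tokens, tokens_added, tokens_removed)``::

    import mwpersistence
    from deltas import tokenizers

    from wikidiff_matcher import WikiDiffMatcher  # placeholder module name

    revision_texts = ["foo baz", "foo bar baz", "foo bar baz qux"]

    matcher = WikiDiffMatcher(revision_texts,
                              tokenizer=tokenizers.wikitext_split,
                              url="http://127.0.0.1:8000")
    state = mwpersistence.DiffState(matcher, revert_radius=15)

    for text in revision_texts:
        # word persistence relative to the previous revision
        tokens, added, removed = state.update(text)
        print(len(tokens), len(added), len(removed))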
| @ -17,7 +17,7 @@ $data = json_decode($rawData, true); | |||||||
| $previous = ''; | $previous = ''; | ||||||
| $result = []; | $result = []; | ||||||
| foreach ($data as $i => $value) { | foreach ($data as $i => $value) { | ||||||
|     $result[] = wikidiff2_inline_json_diff($previous, $value, 0); |     $result[] = wikidiff2_inline_json_diff($previous, $value, 5000000); | ||||||
|     $previous = $value; |     $previous = $value; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
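The helper diffs each revision against the one before it, starting from the empty string, and returns one diff per input revision. The third argument, per the numContextLines comment in the matcher, was raised from 0 to 5000000 so that unchanged lines come back as type-0 context entries. A rough Python rendering of the loop, with ``difflib`` standing in for ``wikidiff2_inline_json_diff`` purely to show the pairing::

    import difflib
    import json


    def consecutive_diffs(revision_texts):
        """Diff each revision against its predecessor, starting from the
        empty string; one result per revision (mirrors the PHP loop)."""
        previous = ""
        results = []
        for value in revision_texts:
            # the PHP helper calls wikidiff2_inline_json_diff(previous, value, 5000000)
            diff = "\n".join(difflib.unified_diff(previous.splitlines(),
                                                  value.splitlines(),
                                                  lineterm=""))
            results.append(diff)
            previous = value
        return results


    print(json.dumps(consecutive_diffs(["foo", "foo bar"]), indent=2))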
							
								
								
									
wikiq (9 changes)
							| @ -140,7 +140,6 @@ The pattern can include capture groups.  If it does then each capture group will | |||||||
| If the pattern does not include a capture group, then only one output column will result. | If the pattern does not include a capture group, then only one output column will result. | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| class RegexPair(object): | class RegexPair(object): | ||||||
|     def __init__(self, pattern, label): |     def __init__(self, pattern, label): | ||||||
|         self.pattern = re.compile(pattern) |         self.pattern = re.compile(pattern) | ||||||
| @ -219,7 +218,7 @@ class WikiqParser: | |||||||
|                  revert_radius: int = 15, |                  revert_radius: int = 15, | ||||||
|                  output_parquet: bool = True, |                  output_parquet: bool = True, | ||||||
|                  parquet_buffer_size: int = 2000, |                  parquet_buffer_size: int = 2000, | ||||||
|                  wikidiff_url: str = "", |                  wikidiff_url: str = "http://127.0.0.1:8000", | ||||||
|                  ): |                  ): | ||||||
| 
 | 
 | ||||||
|         """  |         """  | ||||||
| @ -450,9 +449,9 @@ class WikiqParser: | |||||||
|                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), |                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), | ||||||
|                                                     revert_radius=PERSISTENCE_RADIUS) |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 elif self.persist == PersistMethod.wikidiff: |                 elif self.persist == PersistMethod.wikidiff: | ||||||
|                     state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url, |                     state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts, | ||||||
|                                                                     revision_texts, |                                                                     tokenizer=wikitext_split, | ||||||
|                                                                     tokenizer=wikitext_split), |                                                                     url=self.wikidiff_url), | ||||||
|                                                     revert_radius=PERSISTENCE_RADIUS) |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 else: |                 else: | ||||||
|                     from mw.lib import persistence |                     from mw.lib import persistence | ||||||
|  | |||||||