Merge branch 'tmp' into compute-diffs
commit 20de5b93f9

.gitmodules (vendored)
							| @ -0,0 +1,3 @@ | |||||||
|  | [submodule "mediawiki-php-wikidiff2"] | ||||||
|  | 	path = mediawiki-php-wikidiff2 | ||||||
|  | 	url = https://github.com/wikimedia/mediawiki-php-wikidiff2/ | ||||||
| @ -20,6 +20,13 @@ associated tests to work. | |||||||
| - 7zip | - 7zip | ||||||
| - ffmpeg | - ffmpeg | ||||||
| 
 | 
 | ||||||
|  | A new diff engine based on wikidiff2_ can be used for word persistence. Wikiq can also output the diffs between each page revision. This requires installing wikidiff2 on your system. On Debian or Ubuntu Linux this can be done via: | ||||||
|  | 
 | ||||||
|  | ``apt-get install php-wikidiff2`` | ||||||
|  | 
 | ||||||
|  | You may also have to run: | ||||||
|  | ``sudo phpenmod wikidiff2`` | ||||||
|  | 
 | ||||||
| Tests | Tests | ||||||
| ---- | ---- | ||||||
| To run tests:: | To run tests:: | ||||||
| @ -30,3 +37,5 @@ TODO: | |||||||
| _______________ | _______________ | ||||||
| 1. [] Output metadata about the run. What parameters were used? What versions of deltas? | 1. [] Output metadata about the run. What parameters were used? What versions of deltas? | ||||||
| 2. [] Url encoding by default | 2. [] Url encoding by default | ||||||
|  | 
 | ||||||
|  | .. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2 | ||||||
|  | |||||||
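For orientation, here is a rough sketch of the inline JSON shape that the new diff code consumes. The structure is inferred from the fields the matcher below actually reads (`diff`, `type`, `text`, `offset`, `highlightRanges`, `moveInfo`); the concrete values are only illustrative.

```python
# Illustrative only: field names mirror what DiffToOperationMap reads below.
# "type": 0 = unchanged line, 1 = added, 2 = deleted,
#         3 = changed line (with highlightRanges), 4/5 = halves of a paragraph move.
# Offsets are byte offsets into the "from"/"to" revision texts.
example_diff = {
    "diff": [
        {"type": 0, "text": "An unchanged line.", "offset": {"from": 0, "to": 0}},
        {
            "type": 3,
            "text": "A changed line with one inserted word.",
            "offset": {"from": 19, "to": 19},
            # highlightRange "type": 0 = insertion, 1 = deletion
            "highlightRanges": [{"start": 24, "length": 9, "type": 0}],
        },
    ]
}
```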
| @ -3,7 +3,7 @@ name = "mediawiki-dump-tools" | |||||||
| version = "0.1.0" | version = "0.1.0" | ||||||
| description = "Add your description here" | description = "Add your description here" | ||||||
| readme = "README.md" | readme = "README.md" | ||||||
| requires-python = "~=3.9" | requires-python = ">=3.9" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|     "deltas>=0.7.0", |     "deltas>=0.7.0", | ||||||
|     "mediawiki-utilities>=0.4.18", |     "mediawiki-utilities>=0.4.18", | ||||||
| @ -18,8 +18,11 @@ dependencies = [ | |||||||
| [tool.uv.sources] | [tool.uv.sources] | ||||||
| yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | ||||||
| mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | ||||||
|  | deltas = { git = "https://github.com/groceryheist/deltas" } | ||||||
| 
 | 
 | ||||||
| [dependency-groups] | [dependency-groups] | ||||||
| dev = [ | dev = [ | ||||||
|     "pandas>=2.1.0" |     "pandas>=2.1.0", | ||||||
|  |     "pytest>=8.4.1", | ||||||
|  |     "pytest-asyncio>=1.0.0", | ||||||
| ] | ] | ||||||
|  | |||||||
| @ -1,10 +1,14 @@ | |||||||
| import json | import json | ||||||
| import sys | import sys | ||||||
|  | from itertools import chain | ||||||
|  | from typing import Generator, List, Optional, Tuple | ||||||
| 
 | 
 | ||||||
| import requests | import requests | ||||||
| from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete | from deltas import (Delete, DiffEngine, Equal, Insert, Operation, | ||||||
|  |                     RegexTokenizer, Token, tokenizers) | ||||||
|  | 
 | ||||||
|  | TOKENIZER = tokenizers.wikitext_split | ||||||
| 
 | 
 | ||||||
| TOKENIZER = tokenizers.text_split |  | ||||||
| 
 | 
 | ||||||
| def compute_diffs(url: str, texts: list[str]) -> list: | def compute_diffs(url: str, texts: list[str]) -> list: | ||||||
|     response = None |     response = None | ||||||
| @ -14,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list: | |||||||
|         incremental_diffs = response.json() |         incremental_diffs = response.json() | ||||||
|     except requests.exceptions.ConnectionError as e: |     except requests.exceptions.ConnectionError as e: | ||||||
|         print( |         print( | ||||||
|             f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.") |             f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running." | ||||||
|  |         ) | ||||||
|         print(e) |         print(e) | ||||||
|         raise e |         raise e | ||||||
|     except requests.exceptions.HTTPError as e: |     except requests.exceptions.HTTPError as e: | ||||||
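As a usage sketch (the endpoint and texts here are assumptions; the defaults elsewhere in this commit point at a local server on port 8000), `compute_diffs` sends the full list of revision texts and receives one wikidiff2 JSON document per revision, each diffed against the previous revision, with the first diffed against the empty string:

```python
import json

# Hypothetical usage; assumes the bundled PHP helper is serving on localhost:8000.
texts = ["First revision.", "First revision, now edited."]
diffs = compute_diffs("http://127.0.0.1:8000", texts)

# Each element is a JSON string produced by wikidiff2_inline_json_diff.
for raw in diffs:
    print(json.loads(raw)["diff"][:3])
```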
| @ -33,161 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list: | |||||||
|         print(f"An unexpected error occurred: {e}") |         print(f"An unexpected error occurred: {e}") | ||||||
|         raise e |         raise e | ||||||
| 
 | 
 | ||||||
|  |     # for diff in incremental_diffs: | ||||||
|  |     #     for wikidiffop in json.loads(diff)["diff"][0:5]: | ||||||
|  |     #         print(wikidiffop) | ||||||
|  | 
 | ||||||
|     return incremental_diffs |     return incremental_diffs | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def to_operations(previous_text, next_text, diff, tokenizer: RegexTokenizer) -> list: | class DiffToOperationMap: | ||||||
|     d = json.loads(diff) |  | ||||||
| 
 | 
 | ||||||
|     # Keep track of the last difference we saw in order to notice unaccounted-for |     def __init__(self, from_text, to_text, diff, tokenizer): | ||||||
|     # tokens. Each token at the end of "to" which is skipped for the next diff |  | ||||||
|     # must be represented as an "Equal()" segment. |  | ||||||
|     from_last_end = 0 |  | ||||||
|     to_last_end = 0 |  | ||||||
| 
 | 
 | ||||||
|     result = [] |         self.diff = diff | ||||||
|     # DiffState expects differences to be represented in order from the |         self.tokenizer = tokenizer | ||||||
|     # result's perspective ("to"), not the previous text. Thus, if a line |         self.diff = json.loads(diff) | ||||||
|     # is moved earlier then its insertion should appear before its deletion. |  | ||||||
|     # As a rule of thumb, the "to" segments should be non-overlapping and |  | ||||||
|     # strictly increasing, while the "from" segments should merely be |  | ||||||
|     # non-overlapping. |  | ||||||
|     # |  | ||||||
|     # wikidiff2 appears to follow this same convention, but this behavior |  | ||||||
|     # is not documented. |  | ||||||
| 
 | 
 | ||||||
|     for entry in d['diff']: |         # the code below is designed to work in bytes because that's how wikidiff2 indexes | ||||||
|         from_start_line = entry['offset']['from'] |         self.from_bytes = from_text.encode("utf-8") | ||||||
|         to_start_line = entry['offset']['to'] |         self.to_bytes = to_text.encode("utf-8") | ||||||
|         # Per above, to_start_line appears to be nondecreasing, but |  | ||||||
|         # from_start_line may sometimes decrease for detected paragraph moves. |  | ||||||
| 
 |  | ||||||
|         from_start_tokens = len(tokenizer.tokenize(previous_text[:from_start_line])) |  | ||||||
|         to_start_tokens = len(tokenizer.tokenize(next_text[:to_start_line])) |  | ||||||
|         # These constant calls to tokenizer.tokenize can definitely be optimized |  | ||||||
|         # as tokenization is currently a bottleneck. Ideally tokenization would |  | ||||||
|         # happen incrementally where possible, or somehow be cached, but this |  | ||||||
|         # would be more complex. |  | ||||||
| 
 |  | ||||||
|         if entry['type'] == 0: |  | ||||||
|             # wikidiff2 doesn't appear to emit diffs of this type, but cover anyway. |  | ||||||
|             line_tokens = len(tokenizer.tokenize(entry['text'])) |  | ||||||
|             from_end_tokens = from_start_tokens + line_tokens |  | ||||||
|             to_end_tokens = to_start_tokens + line_tokens |  | ||||||
| 
 |  | ||||||
|             result.append(Equal(from_start_tokens, from_end_tokens, |  | ||||||
|                                 to_start_tokens, to_end_tokens)) |  | ||||||
| 
 |  | ||||||
|             from_last_end = from_end_tokens |  | ||||||
|             to_last_end  = to_end_tokens |  | ||||||
| 
 |  | ||||||
|             continue |  | ||||||
|         else: |  | ||||||
|             # These do not appear to be generated by wikidiff2, and so must be |  | ||||||
|             # inferred. |  | ||||||
|             equal_tokens = to_start_tokens - to_last_end |  | ||||||
|             # If we notice that the next non-zero segment (which must be a |  | ||||||
|             # change, given that its type is non-zero), begins after the end |  | ||||||
|             # of the previous segment, we must add an Equal segment. |  | ||||||
|             # TODO: While the "to" token ranges are correct, the "from" |  | ||||||
|             #  ranges are likely not, particularly in histories with paragraph |  | ||||||
|             #  moves. |  | ||||||
|             if equal_tokens > 0: |  | ||||||
|                 result.append(Equal(from_last_end, from_start_line, |  | ||||||
|                                     to_last_end, to_start_line)) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|         if entry['type'] == 1 or entry['type'] == 4: |         self.from_last_end_bytes = 0 | ||||||
|             # TODO: Separate out type 4 to recognize this is the insertion |         self.from_last_to_bytes = 0 | ||||||
|             #  part of a paragraph move. Note that for paragraph moves |         self.n_from_start_tokens = 0 | ||||||
|             #  the text is not necessarily identical, just similar. |         self.n_to_start_tokens = 0 | ||||||
|             line_tokens = len(tokenizer.tokenize(entry['text'])) |         self.last_to_start_line = 0 | ||||||
|             to_end_tokens = to_start_tokens + line_tokens |         self.last_from_start_line = 0 | ||||||
|  |         self.from_last_end_bytes = 0 | ||||||
|  |         self.to_last_end_bytes = 0 | ||||||
|          |          | ||||||
|             result.append(Insert(from_start_tokens, from_start_tokens, |     def tokenize(self, bytes): | ||||||
|                                  to_start_tokens, to_end_tokens, |         return self.tokenizer.tokenize(bytes.decode("utf-8")) | ||||||
|                                  )) |  | ||||||
| 
 | 
 | ||||||
|             # We have now used more of the "to" tokens. |     def to_operations(self): | ||||||
|             to_last_end = to_end_tokens |         parmove_from_dict = {}  # lookup move diffs based on moveinfo id. | ||||||
|         elif entry['type'] == 2 or entry['type'] == 5: |         parmove_to_dict = {} | ||||||
|             # TODO: Separate out type 5 to recognize this is the deletion |         for entry in self.diff["diff"]: | ||||||
|             #  part of a paragraph move. Note that for paragraph moves |             offset = entry['offset'] | ||||||
|             #  the text is not necessarily identical, just similar. |             linebytes = entry["text"].encode("utf-8") | ||||||
|             line_tokens = len(tokenizer.tokenize(entry['text'])) |  | ||||||
|             from_end_tokens = from_start_tokens + line_tokens |  | ||||||
| 
 | 
 | ||||||
|             result.append(Delete(from_start_tokens, from_end_tokens, |             # ignore empty diffs. They don't have any tokens | ||||||
|                                  to_start_tokens, to_start_tokens, |             if len(linebytes) == 0: | ||||||
|                                  )) |                 continue | ||||||
|  |             # this is the first byte of the line in the 'from' revision. | ||||||
|  |             from_start_line = entry["offset"]["from"] | ||||||
|  |             # this is the first byte of the line in the 'to' revision. | ||||||
|  |             to_start_line = entry["offset"]["to"] | ||||||
| 
 | 
 | ||||||
|             # We have not used more of the "from" tokens. |             if entry["type"] == 0: | ||||||
|             from_last_end = from_end_tokens |                 yield from self.doEqual(linebytes, offset) | ||||||
|         elif entry['type'] == 3: |  | ||||||
|             # The text field is an overlapping mix of both the previous and next |  | ||||||
|             # lines, and so we can't directly tokenize it. |  | ||||||
|              |              | ||||||
|             text = entry['text'] |                 # a line included in the 'to' revision, but not in the 'from' revision | ||||||
|  |             elif entry["type"] == 1: | ||||||
|  |                 yield from self.doInsert(linebytes, offset) | ||||||
| 
 | 
 | ||||||
|             last_end = 0 |                 # a line included in the 'from' revision, but not in the 'to' revision | ||||||
|             previous_line = "" |             elif entry["type"] == 2: | ||||||
|             next_line = "" |                 yield from self.doDelete(linebytes, offset) | ||||||
|      |      | ||||||
|             # A line will have one or more highlightRanges. |             elif entry["type"] == 3: | ||||||
|             # It is not guaranteed that insertions/deletions are matched, |                 yield from self.doHighlightRange(linebytes, entry['highlightRanges'], offset) | ||||||
|             # for instance, if a word is deleted from the middle of a line. |  | ||||||
|             for highlightRange in entry['highlightRanges']: |  | ||||||
|                 if highlightRange['start'] > last_end: |  | ||||||
|                     previous_line += text[last_end:highlightRange['start']] |  | ||||||
|                     next_line += text[last_end:highlightRange['start']] |  | ||||||
|                     # Add an Equal segment. |  | ||||||
|              |              | ||||||
|                 rangeStart = highlightRange['start'] |             elif entry["type"] == 4: | ||||||
|                 rangeEnd = rangeStart + highlightRange['length'] |                 parmove_from_dict[entry["moveInfo"]["id"]] = entry | ||||||
| 
 | 
 | ||||||
|                 if highlightRange['type'] == 0: |             elif entry["type"] == 5: | ||||||
|                     # Insertion |                 # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff. | ||||||
|                     next_line += text[rangeStart:rangeEnd] |                 parmove_to_dict[entry["moveInfo"]["id"]] = entry | ||||||
|  |             # for deletions and equality report the token indexes from the 'from' revision. | ||||||
| 
 | 
 | ||||||
|                     # Add an Insert segment. |             else: | ||||||
|                 elif highlightRange['type'] == 1: |                 # The 'type' isn't one of the known values. | ||||||
|                     # Deletion |                 raise ValueError(entry) | ||||||
|                     previous_line += text[rangeStart:rangeEnd] |  | ||||||
| 
 | 
 | ||||||
|                     # Add a Delete segment. |  | ||||||
|                 else: |  | ||||||
|                     raise Exception(entry) |  | ||||||
|          |          | ||||||
|             from_tokens = len(tokenizer.tokenize(previous_line)) |         # mwpersistence expects differences to be represented in order from the | ||||||
|             to_tokens = len(tokenizer.tokenize(next_line)) |         # result's perspective ("to"), not the previous text. Thus, if a line | ||||||
|  |         # is moved earlier then its insertion should appear before its deletion. | ||||||
|  |         # As a rule of thumb, the "to" segments should be non-overlapping and | ||||||
|  |         # strictly increasing, while the "from" segments should merely be | ||||||
|  |         # non-overlapping. | ||||||
| 
 | 
 | ||||||
|             from_start_tokens += from_tokens |         # now we go through the parmoves | ||||||
|             to_start_tokens += to_tokens |         for id, from_diff in parmove_from_dict.items(): | ||||||
|         else: |             to_diff = parmove_to_dict[from_diff["moveInfo"]["linkId"]] | ||||||
|             # The 'type' isn't one of the known |  | ||||||
|             raise ValueError(d) |  | ||||||
|              |              | ||||||
|     # TODO: Handle trailing tokens |  | ||||||
| 
 | 
 | ||||||
|     # raise Exception(result) |     def doEqual(self, equal_bytes, offset): | ||||||
|     return result |         tokens = self.tokenize(equal_bytes) | ||||||
|  |         n_tokens = len(tokens) | ||||||
|  |         self.n_from_end_tokens = self.n_from_start_tokens + n_tokens | ||||||
|  |         self.n_to_end_tokens = self.n_to_start_tokens + n_tokens | ||||||
|  |         yield ( | ||||||
|  |             Equal( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_end_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_end_tokens, | ||||||
|  |             ), | ||||||
|  |             tokens, | ||||||
|  |             tokens, | ||||||
|  |         ) | ||||||
|  |         # we need to keep track of the to and from last end bytes | ||||||
|  |         self.from_last_end_bytes = offset["from"] + len(equal_bytes) | ||||||
|  |         self.to_last_end_bytes = offset["to"] + len(equal_bytes) | ||||||
|  |         self.n_from_start_tokens += n_tokens | ||||||
|  |         self.n_to_start_tokens += n_tokens | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     def doInsert(self, insert_bytes, offset): | ||||||
|  |         tokens = self.tokenize(insert_bytes) | ||||||
|  |         n_tokens = len(tokens) | ||||||
|  |         self.n_to_end_tokens = self.n_to_start_tokens + n_tokens | ||||||
|  |         yield ( | ||||||
|  |             Insert( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_end_tokens, | ||||||
|  |             ), | ||||||
|  |             [], | ||||||
|  |             tokens, | ||||||
|  |         ) | ||||||
|  |         # We have now used more of the "to" tokens. | ||||||
|  |         self.n_to_start_tokens += n_tokens | ||||||
|  |         self.to_last_end_bytes = offset["to"] + len(insert_bytes) | ||||||
|  | 
 | ||||||
|  |     def doDelete(self, delete_bytes, offset): | ||||||
|  |         tokens = self.tokenize(delete_bytes) | ||||||
|  |         n_tokens = len(tokens) | ||||||
|  |         self.n_from_end_tokens = self.n_from_start_tokens + n_tokens | ||||||
|  |         yield ( | ||||||
|  |             Delete( | ||||||
|  |                 self.n_from_start_tokens, | ||||||
|  |                 self.n_from_end_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |                 self.n_to_start_tokens, | ||||||
|  |             ), | ||||||
|  |             tokens, | ||||||
|  |             [], | ||||||
|  |         ) | ||||||
|  |         # We have now used more of the "from" tokens. | ||||||
|  |         self.n_from_start_tokens += n_tokens | ||||||
|  |         self.from_last_end_bytes = offset["from"] + len(delete_bytes) | ||||||
|  | 
 | ||||||
|  |     def doHighlightRange(self, highlight_bytes, highlightRanges, offset): | ||||||
|  |         # The text field is an overlapping mix of both the from and to, | ||||||
|  |         # so we need to handle it highlight-by-highlight. | ||||||
|  |         # There can be gaps between highlight segments, for instance | ||||||
|  |         # when a word is deleted from the middle of a line, so we track | ||||||
|  |         # the end of the previous highlight to emit the equal bytes in between. | ||||||
|  |         highlight_end = 0 | ||||||
|  |         highlight_offset = offset | ||||||
|  |         # note that diffs are token-level, but the indexes are byte-level | ||||||
|  | 
 | ||||||
|  |         for highlightRange in highlightRanges: | ||||||
|  |             highlight_start = highlightRange["start"] | ||||||
|  |             # equal bytes in between highlights | ||||||
|  |             if highlight_start > highlight_end: | ||||||
|  | 
 | ||||||
|  |                 equal_bytes = highlight_bytes[ | ||||||
|  |                     highlight_end : highlight_start | ||||||
|  |                 ] | ||||||
|  |                 n_equal_bytes = len(equal_bytes) | ||||||
|  |                 yield from self.doEqual(equal_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['from'] += n_equal_bytes | ||||||
|  |                 highlight_offset['to'] += n_equal_bytes | ||||||
|  | 
 | ||||||
|  |             # handle highlighted insert / delete | ||||||
|  |             highlight_end = highlight_start + highlightRange["length"] | ||||||
|  |             range_bytes = highlight_bytes[highlight_start:highlight_end] | ||||||
|  |             n_range_bytes = len(range_bytes) | ||||||
|  |             if highlightRange["type"] == 0: | ||||||
|  |                 yield from self.doInsert(range_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['to'] += n_range_bytes | ||||||
|  |             elif highlightRange["type"] == 1: | ||||||
|  |                 yield from self.doDelete(range_bytes, highlight_offset) | ||||||
|  |                 highlight_offset['from'] += n_range_bytes | ||||||
|  |             else: | ||||||
|  |                 raise Exception(highlightRange) | ||||||
|  | 
 | ||||||
|  |         # handle the rest of the line which is equal | ||||||
|  |         if highlight_end < len(highlight_bytes): | ||||||
|  |             range_bytes = highlight_bytes[highlight_end:] | ||||||
|  |             yield from self.doEqual(range_bytes, highlight_offset) | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class WikiDiffMatcher: | class WikiDiffMatcher: | ||||||
|     def __init__(self, |     def __init__( | ||||||
|                  url: str, |         self, | ||||||
|                  texts: list[str], |         texts: Optional[list[str]] = None, | ||||||
|                  tokenizer: RegexTokenizer = None, |         tokenizer: Optional[RegexTokenizer] = None, | ||||||
|                  ): |         url: str = "http://127.0.0.1:8000", | ||||||
|  |     ): | ||||||
|         # Pre-compute diffs to reduce traffic overhead. |         # Pre-compute diffs to reduce traffic overhead. | ||||||
|         self.diffs = compute_diffs(url, texts) |         self.diffs = compute_diffs(url, texts) | ||||||
|         self.tokenizer = tokenizer or TOKENIZER |         self.tokenizer = tokenizer or TOKENIZER | ||||||
| 
 | 
 | ||||||
|     class Processor(DiffEngine.Processor): |     class Processor(DiffEngine.Processor): | ||||||
|         def __init__(self, |         def __init__(self, diffs, tokenizer=None): | ||||||
|                      diffs, |             self.diffs = iter(diffs) | ||||||
|                      tokenizer=None |  | ||||||
|                      ): |  | ||||||
|             self.diffs = iter(diffs) |  | ||||||
|             self.tokenizer = tokenizer or TOKENIZER |             self.tokenizer = tokenizer or TOKENIZER | ||||||
|             self.last_tokens = [] |             self.last_tokens = [] | ||||||
|             self.previous_text = "" |             self.previous_text = "" | ||||||
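The ordering convention spelled out in the `to_operations` comment above (segments ordered by the "to" side) matters most for paragraph moves. Below is a minimal sketch of a conforming operation list, with made-up token indexes, for a five-token paragraph that moves from the end of the old revision to the front of the new one; this illustrates the convention only, not the output of the still-incomplete paragraph-move handling:

```python
from deltas import Delete, Equal, Insert

# Arguments are (from_start, from_end, to_start, to_end) token indexes.
# The "to" ranges are non-overlapping and increasing, so the Insert for the
# moved paragraph comes before the Delete that removes it from its old spot.
ops = [
    Insert(0, 0, 0, 5),      # moved paragraph appears first in the new text
    Equal(0, 10, 5, 15),     # ten unchanged tokens
    Delete(10, 15, 15, 15),  # moved paragraph removed from the end of the old text
]
```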
| @ -196,28 +246,27 @@ class WikiDiffMatcher: | |||||||
|             self.last_tokens = last_tokens |             self.last_tokens = last_tokens | ||||||
| 
 | 
 | ||||||
|         def process(self, text, token_class=None): |         def process(self, text, token_class=None): | ||||||
|             # IDEs will report the method signature as incorrect, but this is |  | ||||||
|             # expected. The DiffEngine.Processor class must be inherited from, |  | ||||||
|             # and its process definition incorrectly excludes a "self" argument. |  | ||||||
| 
 |  | ||||||
|             # The diff has already been computed, but we need to incrementally |             # The diff has already been computed, but we need to incrementally | ||||||
|             # retrieve it to recreate the behavior DiffState expects. |             # retrieve it to recreate the behavior DiffState expects. | ||||||
|             diff = next(self.diffs) |             diff = next(self.diffs) | ||||||
|  |             diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer) | ||||||
|  |             ( | ||||||
|  |                 operations, | ||||||
|  |                 aseq, | ||||||
|  |                 bseq, | ||||||
|  |             ) = list( | ||||||
|  |                 zip(*diffToOperationsMapper.to_operations()) | ||||||
|  |             ) | ||||||
| 
 | 
 | ||||||
|             tokens = self.tokenizer.tokenize(text, token_class=token_class) |             self.last_tokens = list(chain.from_iterable(aseq)) | ||||||
|             operations = to_operations(self.previous_text, text, diff, self.tokenizer) |             tokens = list(chain.from_iterable(bseq)) | ||||||
| 
 |  | ||||||
|             a = self.last_tokens |  | ||||||
|             b = tokens |  | ||||||
|             self.last_tokens = tokens |  | ||||||
|             self.previous_text = text |             self.previous_text = text | ||||||
| 
 | 
 | ||||||
|             return operations, a, b |             return operations, self.last_tokens, tokens | ||||||
| 
 | 
 | ||||||
|     def processor(self, *args, **kwargs): |     def processor(self, *args, **kwargs): | ||||||
|         return self.Processor(self.diffs, self.tokenizer) |         return self.Processor(self.diffs, self.tokenizer) | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|     def process(self): |     def process(self): | ||||||
|         # DiffState checks for this method even though it is not called. |         # DiffState checks for this method even though it is not called. | ||||||
|         raise Exception("Unnecessary implementation") |         raise Exception("Unnecessary implementation") | ||||||
|  | |||||||
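For context, a sketch of how the matcher is meant to be driven, mirroring the wikiq change further down. The revision texts, server URL, and revert radius are placeholder assumptions; `DiffState.update` is the mwpersistence entry point that ends up calling `Processor.process` for each revision in order:

```python
import mwpersistence
from deltas import tokenizers

revision_texts = ["First revision.", "First revision, now edited."]  # placeholder data

matcher = WikiDiffMatcher(revision_texts,
                          tokenizer=tokenizers.wikitext_split,
                          url="http://127.0.0.1:8000")
state = mwpersistence.DiffState(matcher, revert_radius=15)

for text in revision_texts:
    # update() returns (tokens, tokens_added, tokens_removed) for each revision.
    tokens, tokens_added, tokens_removed = state.update(text)
```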
| @ -17,7 +17,7 @@ $data = json_decode($rawData, true); | |||||||
| $previous = ''; | $previous = ''; | ||||||
| $result = []; | $result = []; | ||||||
| foreach ($data as $i => $value) { | foreach ($data as $i => $value) { | ||||||
|     $result[] = wikidiff2_inline_json_diff($previous, $value, 0); |     $result[] = wikidiff2_inline_json_diff($previous, $value, 5000000); | ||||||
|     $previous = $value; |     $previous = $value; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
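The client/server contract implied by `compute_diffs` and this helper appears to be: POST a JSON array of revision texts, receive a JSON array with one wikidiff2 inline-JSON document per revision. The third argument to `wikidiff2_inline_json_diff` is, as far as I can tell, the number of context lines, so raising it from 0 to 5000000 makes unchanged lines come back as type-0 entries, which the token accounting in `DiffToOperationMap` appears to rely on. A hedged client-side sketch (endpoint, port, and the JSON request body are assumptions):

```python
import json
import requests

texts = ["First revision.", "First revision, now edited."]

# Assumption: the PHP helper reads a JSON array of texts from the request body.
resp = requests.post("http://127.0.0.1:8000", json=texts)
resp.raise_for_status()

for raw in resp.json():  # one wikidiff2 document per revision
    print(json.loads(raw)["diff"][:2])
```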
							
								
								
									
wikiq
							| @ -140,7 +140,6 @@ The pattern can include capture groups.  If it does then each capture group will | |||||||
| If the pattern does not include a capture group, then only one output column will result. | If the pattern does not include a capture group, then only one output column will result. | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
| class RegexPair(object): | class RegexPair(object): | ||||||
|     def __init__(self, pattern, label): |     def __init__(self, pattern, label): | ||||||
|         self.pattern = re.compile(pattern) |         self.pattern = re.compile(pattern) | ||||||
| @ -219,7 +218,7 @@ class WikiqParser: | |||||||
|                  revert_radius: int = 15, |                  revert_radius: int = 15, | ||||||
|                  output_parquet: bool = True, |                  output_parquet: bool = True, | ||||||
|                  parquet_buffer_size: int = 2000, |                  parquet_buffer_size: int = 2000, | ||||||
|                  wikidiff_url: str = "", |                  wikidiff_url: str = "http://127.0.0.1:8000", | ||||||
|                  ): |                  ): | ||||||
| 
 | 
 | ||||||
|         """  |         """  | ||||||
| @ -450,9 +449,9 @@ class WikiqParser: | |||||||
|                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), |                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), | ||||||
|                                                     revert_radius=PERSISTENCE_RADIUS) |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 elif self.persist == PersistMethod.wikidiff: |                 elif self.persist == PersistMethod.wikidiff: | ||||||
|                     state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url, |                     state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts, | ||||||
|                                                                     revision_texts, |                                                                     tokenizer=wikitext_split, | ||||||
|                                                                     tokenizer=wikitext_split), |                                                                     url=self.wikidiff_url), | ||||||
|                                                     revert_radius=PERSISTENCE_RADIUS) |                                                     revert_radius=PERSISTENCE_RADIUS) | ||||||
|                 else: |                 else: | ||||||
|                     from mw.lib import persistence |                     from mw.lib import persistence | ||||||
|  | |||||||