got wikidiff2 persistence working except for paragraph moves.

parent 186cb82fb8
commit 5a3e4102b5

.gitmodules (vendored, 3 lines changed)
							| @ -0,0 +1,3 @@ | ||||
| [submodule "mediawiki-php-wikidiff2"] | ||||
| 	path = mediawiki-php-wikidiff2 | ||||
| 	url = https://github.com/wikimedia/mediawiki-php-wikidiff2/ | ||||
| @ -20,6 +20,13 @@ associated tests to work. | ||||
| - 7zip | ||||
| - ffmpeg | ||||
| 
 | ||||
| A new diff engine based on `wikidiff2`_ can be used for word persistence. Wikiq can also output the diffs between each page revision. This requires installing wikidiff2 on your system. On Debian or Ubuntu this can be done with: | ||||
| 
 | ||||
| ``apt-get install php-wikidiff2`` | ||||
| 
 | ||||
| You may also have to run: | ||||
| ``sudo phpenmod wikidiff2`` | ||||
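
Once wikidiff2 is installed and the local diff endpoint is running, the new engine is driven through ``mwpersistence``, which is how wikiq uses it internally. The sketch below makes two assumptions not spelled out in this commit: that the endpoint is served at the default ``http://127.0.0.1:8000`` used in the code, and that ``WikiDiffMatcher`` is imported from a hypothetical module name::

    import mwpersistence
    from deltas import tokenizers

    # Hypothetical import path; the matcher's file name is not shown in this diff.
    from wikidiff_matcher import WikiDiffMatcher

    revision_texts = ["First revision.", "First revision, now edited."]

    matcher = WikiDiffMatcher(revision_texts,
                              tokenizer=tokenizers.wikitext_split,
                              url="http://127.0.0.1:8000")
    state = mwpersistence.DiffState(matcher, revert_radius=15)

    for text in revision_texts:
        # update() returns persistence information for this revision; see the
        # mwpersistence documentation for the exact return structure.
        persistence_data = state.update(text)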
| 
 | ||||
| Tests | ||||
| ----- | ||||
| To run tests:: | ||||
| @ -30,3 +37,5 @@ TODO: | ||||
| _______________ | ||||
| 1. [] Output metadata about the run. What parameters were used? What versions of deltas? | ||||
| 2. [] URL encoding by default | ||||
| 
 | ||||
| .. _wikidiff2: https://www.mediawiki.org/wiki/Wikidiff2 | ||||
|  | ||||
| @ -3,7 +3,7 @@ name = "mediawiki-dump-tools" | ||||
| version = "0.1.0" | ||||
| description = "Add your description here" | ||||
| readme = "README.md" | ||||
| requires-python = "~=3.9" | ||||
| requires-python = ">=3.9" | ||||
| dependencies = [ | ||||
|     "deltas>=0.7.0", | ||||
|     "mediawiki-utilities>=0.4.18", | ||||
| @ -18,8 +18,11 @@ dependencies = [ | ||||
| [tool.uv.sources] | ||||
| yamlconf = { git = "https://github.com/groceryheist/yamlconf" } | ||||
| mwxml = { git = "https://github.com/groceryheist/python-mwxml" } | ||||
| deltas = { git = "https://github.com/groceryheist/deltas" } | ||||
| 
 | ||||
| [dependency-groups] | ||||
| dev = [ | ||||
|     "pandas>=2.1.0" | ||||
|     "pandas>=2.1.0", | ||||
|     "pytest>=8.4.1", | ||||
|     "pytest-asyncio>=1.0.0", | ||||
| ] | ||||
|  | ||||
| @ -1,8 +1,11 @@ | ||||
| import json | ||||
| import sys | ||||
| from itertools import chain | ||||
| from typing import Generator, List, Optional, Tuple | ||||
| 
 | ||||
| import requests | ||||
| from deltas import tokenizers, RegexTokenizer, DiffEngine, Equal, Insert, Delete | ||||
| from deltas import (Delete, DiffEngine, Equal, Insert, Operation, | ||||
|                     RegexTokenizer, Token, tokenizers) | ||||
| 
 | ||||
| TOKENIZER = tokenizers.wikitext_split | ||||
| 
 | ||||
| @ -15,7 +18,8 @@ def compute_diffs(url: str, texts: list[str]) -> list: | ||||
|         incremental_diffs = response.json() | ||||
|     except requests.exceptions.ConnectionError as e: | ||||
|         print( | ||||
|             f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running.") | ||||
|             f"Connection Error: Could not connect to the server at {url}. Make sure your local server is running." | ||||
|         ) | ||||
|         print(e) | ||||
|         raise e | ||||
|     except requests.exceptions.HTTPError as e: | ||||
| @ -34,193 +38,206 @@ def compute_diffs(url: str, texts: list[str]) -> list: | ||||
|         print(f"An unexpected error occurred: {e}") | ||||
|         raise e | ||||
| 
 | ||||
|     # for diff in incremental_diffs: | ||||
|     #     for wikidiffop in json.loads(diff)["diff"][0:5]: | ||||
|     #         print(wikidiffop) | ||||
| 
 | ||||
|     return incremental_diffs | ||||
| 
 | ||||
| 
 | ||||
| def to_operations(from_text:str, to_text:str, diff:str, tokenizer: RegexTokenizer) -> list: | ||||
|     d = json.loads(diff) | ||||
| class DiffToOperationMap: | ||||
| 
 | ||||
|     def __init__(self, from_text, to_text, diff, tokenizer): | ||||
| 
 | ||||
|         self.diff = diff | ||||
|         self.tokenizer = tokenizer | ||||
|         self.diff = json.loads(diff) | ||||
| 
 | ||||
|         # the code below is designed to work in bytes because that's how wikidiff2 indexes | ||||
|     from_text = from_text.encode('utf-8') | ||||
|     to_text = to_text.encode('utf-8') | ||||
|         self.from_bytes = from_text.encode("utf-8") | ||||
|         self.to_bytes = to_text.encode("utf-8") | ||||
| 
 | ||||
|     # convenient function for tokenizing bytes | ||||
|     def tokenize(bytes): | ||||
|         return tokenizer.tokenize(bytes.decode('utf-8')) | ||||
| 
 | ||||
|     # Keep track of the last difference we saw in order to notice unaccounted-for | ||||
|     # tokens. Each token at the end of "to" which is skipped for the next diff | ||||
|     # must be represented as an "Equal()" segment. | ||||
|     from_last_end_bytes = 0 | ||||
|     to_last_end_bytes = 0 | ||||
|         self.from_last_end_bytes = 0 | ||||
|         self.from_last_to_bytes = 0 | ||||
|         self.n_from_start_tokens = 0 | ||||
|         self.n_to_start_tokens = 0 | ||||
|         self.last_to_start_line = 0 | ||||
|         self.last_from_start_line = 0 | ||||
|         self.from_last_end_bytes = 0 | ||||
|         self.to_last_end_bytes = 0 | ||||
|          | ||||
|     result = [] | ||||
|     # DiffState expects differences to be represented in order from the | ||||
|     def tokenize(self, bytes): | ||||
|         return self.tokenizer.tokenize(bytes.decode("utf-8")) | ||||
| 
 | ||||
|     def to_operations(self): | ||||
|         parmove_from_dict = {}  # lookup move diffs based on moveinfo id. | ||||
|         parmove_to_dict = {} | ||||
|         for entry in self.diff["diff"]: | ||||
|             offset = entry['offset'] | ||||
|             linebytes = entry["text"].encode("utf-8") | ||||
| 
 | ||||
|             # ignore empty diffs. They don't have any tokens | ||||
|             if len(linebytes) == 0: | ||||
|                 continue | ||||
|             # this is the first byte of the line in the 'from' revision. | ||||
|             from_start_line = entry["offset"]["from"] | ||||
|             # this is the first byte of the line in the 'to' revision. | ||||
|             to_start_line = entry["offset"]["to"] | ||||
| 
 | ||||
|             if entry["type"] == 0: | ||||
|                 yield from self.doEqual(linebytes, offset) | ||||
|              | ||||
|                 # a line included in the 'to' revision, but not in the 'from' revision | ||||
|             elif entry["type"] == 1: | ||||
|                 yield from self.doInsert(linebytes, offset) | ||||
| 
 | ||||
|                 # a line included in the 'from' revision, but not in the 'to' revision | ||||
|             elif entry["type"] == 2: | ||||
|                 yield from self.doDelete(linebytes, offset) | ||||
|      | ||||
|             elif entry["type"] == 3: | ||||
|                 yield from self.doHighlightRange(linebytes, entry['highlightRanges'], offset) | ||||
|              | ||||
|             elif entry["type"] == 4: | ||||
|                 parmove_from_dict["moveInfo"]["id"] = diff | ||||
| 
 | ||||
|             elif entry["type"] == 5: | ||||
|                 # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff. | ||||
|                 parmove_to_dict["moveInfo"]["id"] = diff | ||||
|             # for deletions and equality report the token indexes from the 'from' revision. | ||||
| 
 | ||||
|             else: | ||||
|                 # The 'type' isn't one of the known wikidiff2 types. | ||||
|                 raise ValueError(entry) | ||||
| 
 | ||||
|          | ||||
|         # mwpersistence expects differences to be represented in order from the | ||||
|         # result's perspective ("to"), not the previous text. Thus, if a line | ||||
|         # is moved earlier then its insertion should appear before its deletion. | ||||
|         # As a rule of thumb, the "to" segments should be non-overlapping and | ||||
|         # strictly increasing, while the "from" segments should merely be | ||||
|         # non-overlapping. | ||||
| 
 | ||||
|     # wikidiff2 appears to follow this same convention, but this behavior | ||||
|     # is not documented. | ||||
|         # now we go through the parmoves | ||||
|         for id, from_diff in parmove_from_dict.items(): | ||||
|             to_diff = parmove_from_dict[from_diff["moveInfo"]["linkId"]] | ||||
|              | ||||
|     # Note that, confusingly for Insert operations only the "to" indexes matter | ||||
|     # and for the Delete and Equal operations only the "from" indexes matter. | ||||
|     # This is clear from reading state.py in `mwpersistence` and operations.py in `deltas` | ||||
| 
 | ||||
|     parmove_from_dict = {} # lookup move diffs based on moveinfo id. | ||||
|     parmove_to_dict = {}  | ||||
|      | ||||
|     for entry in d['diff']: | ||||
|         linebytes = entry['text'].encode('utf-8') | ||||
|         from_start_line = entry['offset']['from'] # this is the first byte of the line in the 'from' revision. | ||||
|         to_start_line = entry['offset']['to'] # this is the first byte of the line in the 'to' revision. | ||||
| 
 | ||||
|         from_start_tokens = len(tokenize(from_text[:from_start_line])) | ||||
|         to_start_tokens = len(tokenize(to_text[:to_start_line])) | ||||
|         # These constant calls to tokenizer.tokenize can definitely be optimized | ||||
|         # as tokenization is currently a bottleneck. Ideally tokenization would | ||||
|         # happen incrementally where possible, or somehow be cached, but this | ||||
|         # would be more complex. N: I think it's okay. CPU is cheap. | ||||
| 
 | ||||
|         if entry['type'] == 0: # wikidiff2 can emit this if it is called with numContextLines != 0. | ||||
| 
 | ||||
|             line_tokens = len(tokenize(linebytes)) | ||||
|             from_end_tokens = from_start_tokens + line_tokens | ||||
|             to_end_tokens = to_start_tokens + line_tokens | ||||
| 
 | ||||
|             result.append(Equal(from_start_tokens, from_end_tokens, | ||||
|                                 to_start_tokens, to_end_tokens)) | ||||
| 
 | ||||
|     def doEqual(self, equal_bytes, offset): | ||||
|         tokens = self.tokenize(equal_bytes) | ||||
|         n_tokens = len(tokens) | ||||
|         self.n_from_end_tokens = self.n_from_start_tokens + n_tokens | ||||
|         self.n_to_end_tokens = self.n_to_start_tokens + n_tokens | ||||
|         yield ( | ||||
|             Equal( | ||||
|                 self.n_from_start_tokens, | ||||
|                 self.n_from_end_tokens, | ||||
|                 self.n_to_start_tokens, | ||||
|                 self.n_to_end_tokens, | ||||
|             ), | ||||
|             tokens, | ||||
|             tokens, | ||||
|         ) | ||||
|         # we need to keep track of the to and from last end bytes | ||||
|             from_last_end_bytes += len(linebytes) | ||||
|             to_last_end_bytes  += len(linebytes) | ||||
|         self.from_last_end_bytes = offset["from"] + len(equal_bytes) | ||||
|         self.to_last_end_bytes = offset["to"] + len(equal_bytes) | ||||
|         self.n_from_start_tokens += n_tokens | ||||
|         self.n_to_start_tokens += n_tokens | ||||
| 
 | ||||
|             continue | ||||
|         else: | ||||
|             # These do not appear to be generated by wikidiff2, and so must be | ||||
|             # inferred. | ||||
|             equal_tokens = to_start_tokens - to_last_end_bytes | ||||
|             # If we notice that the next non-zero segment (which must be a | ||||
|             # change, given that its type is non-zero), begins after the end | ||||
|             # of the previous segment, we must add an Equal segment. | ||||
|             # TODO: While the "to" token ranges are correct, | ||||
|             # the "from" | ||||
|             #  ranges are likely not, particularly in histories with paragraph | ||||
|             #  moves. they can be corrected. | ||||
|             if equal_tokens > 0: | ||||
|                 # only the 'from' indexes matter | ||||
|                 result.append(Equal(from_last_end_bytes, from_start_line, | ||||
|                                     to_last_end_bytes, to_start_line)) | ||||
| 
 | ||||
|         if entry['type'] == 1: # a line included in the 'to' revision, but not in the 'from' revision | ||||
|             line_tokens = len(tokenize(linebytes)) | ||||
|             to_end_tokens = to_start_tokens + line_tokens | ||||
| 
 | ||||
|             result.append(Insert(from_start_tokens, from_start_tokens, | ||||
|                                  to_start_tokens, to_end_tokens, | ||||
|                                  )) | ||||
| 
 | ||||
|     def doInsert(self, insert_bytes, offset): | ||||
|         tokens = self.tokenize(insert_bytes) | ||||
|         n_tokens = len(tokens) | ||||
|         self.n_to_end_tokens = self.n_to_start_tokens + n_tokens | ||||
|         yield ( | ||||
|             Insert( | ||||
|                 self.n_from_start_tokens, | ||||
|                 self.n_from_start_tokens, | ||||
|                 self.n_to_start_tokens, | ||||
|                 self.n_to_end_tokens, | ||||
|             ), | ||||
|             [], | ||||
|             tokens, | ||||
|         ) | ||||
|         # We have now used more of the "to" tokens. | ||||
|             to_start_end = to_end_tokens | ||||
|              | ||||
|         elif entry['type'] == 2: # a line included in the 'from' revision, but not in the 'to' revision | ||||
|             line_tokens = len(tokenize(linebytes)) | ||||
|             from_end_tokens = from_start_tokens + line_tokens | ||||
| 
 | ||||
|             result.append(Delete(from_start_tokens, from_end_tokens, | ||||
|                                  to_start_tokens, to_start_tokens, | ||||
|                                  )) | ||||
|         self.n_to_start_tokens += n_tokens | ||||
|         self.to_last_end_bytes = offset["to"] + len(insert_bytes) | ||||
| 
 | ||||
|     def doDelete(self, delete_bytes, offset): | ||||
|         tokens = self.tokenize(delete_bytes) | ||||
|         n_tokens = len(tokens) | ||||
|         self.n_from_end_tokens = self.n_from_start_tokens + n_tokens | ||||
|         yield ( | ||||
|             Delete( | ||||
|                 self.n_from_start_tokens, | ||||
|                 self.n_from_end_tokens, | ||||
|                 self.n_to_start_tokens, | ||||
|                 self.n_to_start_tokens, | ||||
|             ), | ||||
|             tokens, | ||||
|             [], | ||||
|         ) | ||||
|         # We have now used more of the "from" tokens. | ||||
|             from_last_end_bytes = from_end_tokens | ||||
|         self.n_from_start_tokens += n_tokens | ||||
|         self.from_last_end_bytes = offset["from"] + len(delete_bytes) | ||||
| 
 | ||||
|         elif entry['type'] == 3: | ||||
|     def doHighlightRange(self, highlight_bytes, highlightRanges, offset): | ||||
|         # The text field is an overlapping mix of both the from and to, | ||||
|         # so we need to handle it highlight-by-highlight. | ||||
|         # There can be gaps between highlight segments, for instance when a | ||||
|         # word is deleted from the middle of a line; those gaps are treated | ||||
|         # as equal text. | ||||
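|         # A hypothetical type-3 entry (not taken from real wikidiff2 output) | ||||
|         # illustrating the fields handled below: | ||||
|         #   {"text": "a x c", "offset": {"from": 0, "to": 0}, | ||||
|         #    "highlightRanges": [{"start": 2, "length": 1, "type": 0}]} | ||||
|         # would mean bytes 2..3 ("x") were inserted, while the bytes before | ||||
|         # and after the highlight count as equal text. | ||||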
|             highlight_last_end = 0 | ||||
| 
 | ||||
|         highlight_end = 0 | ||||
|         highlight_offset = offset | ||||
|         # note that diffs are token-level, but the indexes are byte-level | ||||
|             for highlightRange in entry['highlightRanges']: | ||||
|                 if highlightRange['start'] > highlight_last_end: | ||||
| 
 | ||||
|                     equal_bytes = linebytes[highlight_last_end:highlightRange['start']] | ||||
|                     equal_tokens = len(tokenize(equal_bytes)) | ||||
|                     from_end_tokens = from_start_tokens + equal_tokens | ||||
|                     to_end_tokens = to_end_tokens + equal_tokens | ||||
|                     result.append(Equal(from_start_tokens, from_end_tokens, | ||||
|                                         to_start_tokens, to_end_tokens | ||||
|                                         )) | ||||
|         for highlightRange in highlightRanges: | ||||
|             highlight_start = highlightRange["start"] | ||||
|             # equal bytes in between highlights | ||||
|             if highlight_start > highlight_end: | ||||
| 
 | ||||
|                     from_start_tokens = from_end_tokens | ||||
|                     to_start_tokens = to_end_tokens | ||||
|                      | ||||
|                 rangeStart = highlightRange['start'] | ||||
|                 rangeEnd = rangeStart + highlightRange['length'] | ||||
|                 range_bytes = linebytes[rangeStart:rangeEnd] | ||||
|                 range_tokens = len(tokenize(range_bytes)) | ||||
|                 if highlightRange['type'] == 0: | ||||
|                     # Insertion | ||||
|                     to_end_tokens = to_start_tokens + range_tokens | ||||
|                     result.append(Insert(from_start_tokens, from_end_tokens, | ||||
|                                          to_start_tokens, to_end_tokens)) | ||||
| 
 | ||||
|                     to_start_tokens = to_end_tokens | ||||
|                 elif highlightRange['type'] == 1: | ||||
|                     # Deletion | ||||
|                     from_end_tokens = from_start_tokens + range_tokens | ||||
|                     result.append(Delete(from_start_tokens, from_end_tokens, | ||||
|                                          to_start_tokens, to_end_tokens)) | ||||
|                     from_start_tokens = from_end_tokens | ||||
|                 equal_bytes = highlight_bytes[ | ||||
|                     highlight_end : highlight_start | ||||
|                 ] | ||||
|                 n_equal_bytes = len(equal_bytes) | ||||
|                 yield from self.doEqual(equal_bytes, highlight_offset) | ||||
|                 highlight_offset['from'] += n_equal_bytes | ||||
|                 highlight_offset['to'] += n_equal_bytes | ||||
| 
 | ||||
|             # handle highlighted insert / delete | ||||
|             highlight_end = highlight_start + highlightRange["length"] | ||||
|             range_bytes = highlight_bytes[highlight_start:highlight_end] | ||||
|             n_range_bytes = len(range_bytes) | ||||
|             if highlightRange["type"] == 0: | ||||
|                 yield from self.doInsert(range_bytes, highlight_offset) | ||||
|                 highlight_offset['to'] += n_range_bytes | ||||
|             elif highlightRange["type"] == 1: | ||||
|                 yield from self.doDelete(range_bytes, highlight_offset) | ||||
|                 highlight_offset['from'] += n_range_bytes | ||||
|             else: | ||||
|                 raise ValueError(highlightRange) | ||||
| 
 | ||||
|                 highlight_last_end = highlightRange['start'] + highlightRange['length'] | ||||
|         # handle the rest of the line which is equal | ||||
|         if highlight_end < len(highlight_bytes): | ||||
|             range_bytes = highlight_bytes[highlight_end:] | ||||
|             yield from self.doEqual(range_bytes, highlight_offset) | ||||
| 
 | ||||
|         elif entry['type'] == 4: | ||||
| 
 | ||||
|             parmove_from_dict['moveInfo']['id'] = diff | ||||
| 
 | ||||
|         elif entry['type'] == 5: | ||||
| 
 | ||||
|             parmove_to_dict['moveInfo']['id'] = diff | ||||
|             # for type 4 diffs (paragraph moved in the from revision) we need to find a matching type 5 diff. | ||||
|             # for deletions and equality report the token indexes from the 'from' revision.  | ||||
|         else: | ||||
|             # The 'type' isn't one of the known wikidiff2 types. | ||||
|             raise ValueError(d) | ||||
| 
 | ||||
|     # now we go through the parmoves | ||||
|     for id, from_diff in parmove_from_dict.items(): | ||||
|         to_diff = parmove_from_dict[from_diff['moveInfo']['linkId']] | ||||
|         ### TODO calculate the correct token indexes. | ||||
| 
 | ||||
|     # TODO: Handle trailing tokens | ||||
| 
 | ||||
|     # raise Exception(result) | ||||
|     return result | ||||
| 
 | ||||
| class WikiDiffMatcher: | ||||
|     def __init__(self, | ||||
|                  url: str, | ||||
|                  texts: list[str], | ||||
|                  tokenizer: RegexTokenizer = None, | ||||
|     def __init__( | ||||
|         self, | ||||
|         texts: list[str] = None, | ||||
|         tokenizer: Optional[RegexTokenizer] = None, | ||||
|         url: Optional[str] = "http://127.0.0.1:8000", | ||||
|     ): | ||||
|         # Pre-compute diffs to reduce traffic overhead. | ||||
|         self.diffs = compute_diffs(url, texts) | ||||
|         self.tokenizer = tokenizer or TOKENIZER | ||||
| 
 | ||||
|     class Processor(DiffEngine.Processor): | ||||
|         def __init__(self, | ||||
|                      diffs, | ||||
|                      tokenizer=None | ||||
|                      ): | ||||
|             self.diffs = iter(diffs) | ||||
|         def __init__(self, texts, tokenizer=None): | ||||
|             self.diffs = iter(texts) | ||||
|             self.tokenizer = tokenizer or TOKENIZER | ||||
|             self.last_tokens = [] | ||||
|             self.previous_text = "" | ||||
| @ -229,28 +246,27 @@ class WikiDiffMatcher: | ||||
|             self.last_tokens = last_tokens | ||||
| 
 | ||||
|         def process(self, text, token_class=None): | ||||
|             # IDEs will report the method signature as incorrect, but this is | ||||
|             # expected. The DiffEngine.Processor class must be inherited from, | ||||
|             # and its process definition incorrectly excludes a "self" argument. | ||||
| 
 | ||||
|             # The diff has already been computed, but we need to incrementally | ||||
|             # retrieve it to recreate the behavior DiffState expects. | ||||
|             diff = next(self.diffs) | ||||
|             diffToOperationsMapper = DiffToOperationMap(self.previous_text, text, diff, self.tokenizer) | ||||
|             ( | ||||
|                 operations, | ||||
|                 aseq, | ||||
|                 bseq, | ||||
|             ) = list( | ||||
|                 zip(*diffToOperationsMapper.to_operations()) | ||||
|             ) | ||||
| 
 | ||||
|             tokens = self.tokenizer.tokenize(text, token_class=token_class) | ||||
|             operations = to_operations(self.previous_text, text, diff, self.tokenizer) | ||||
| 
 | ||||
|             a = self.last_tokens | ||||
|             b = tokens | ||||
|             self.last_tokens = tokens | ||||
|             self.last_tokens = list(chain.from_iterable(aseq)) | ||||
|             tokens = list(chain.from_iterable(bseq)) | ||||
|             self.previous_text = text | ||||
| 
 | ||||
|             return operations, a, b | ||||
|             return operations, self.last_tokens, tokens | ||||
| 
 | ||||
|     def processor(self, *args, **kwargs): | ||||
|         return self.Processor(self.diffs, self.tokenizer) | ||||
| 
 | ||||
| 
 | ||||
|     def process(self): | ||||
|         # DiffState checks for this method even though it is not called. | ||||
|         raise Exception("Unnecessary implementation") | ||||
|  | ||||
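
The index conventions described in the comments above (operations ordered from the "to" revision's perspective; Insert ranges matter only on the "to" side, Delete and Equal only on the "from" side) can be illustrated with a small, hypothetical example built from the same ``deltas`` types the matcher imports::

    from deltas import Delete, Equal, Insert, tokenizers

    tokenize = tokenizers.wikitext_split.tokenize
    from_tokens = tokenize("a b c")   # word and whitespace tokens: a, ' ', b, ' ', c
    to_tokens = tokenize("a x c")     # a, ' ', x, ' ', c

    # Replacing 'b' with 'x', expressed as (from_start, from_end, to_start, to_end)
    # token-index ranges:
    operations = [
        Equal(0, 2, 0, 2),   # "a " is unchanged in both revisions
        Delete(2, 3, 2, 2),  # "b" removed; only the 'from' range is non-empty
        Insert(2, 2, 2, 3),  # "x" added; only the 'to' range is non-empty
        Equal(3, 5, 3, 5),   # " c" is unchanged
    ]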
| @ -17,7 +17,7 @@ $data = json_decode($rawData, true); | ||||
| $previous = ''; | ||||
| $result = []; | ||||
| foreach ($data as $i => $value) { | ||||
|     $result[] = wikidiff2_inline_json_diff($previous, $value, 0); | ||||
|     $result[] = wikidiff2_inline_json_diff($previous, $value, 5000000); | ||||
|     $previous = $value; | ||||
| } | ||||
| 
 | ||||
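
The round trip that this PHP endpoint implements, and that ``compute_diffs`` consumes, looks roughly like the following sketch. The request shape and the endpoint URL are assumptions based on the code above; how the PHP script is actually served is not shown in this commit::

    import json

    import requests

    revision_texts = ["First revision.", "First revision, now edited."]

    # Assumed: the full list of revision texts is posted as JSON, and the
    # endpoint returns one wikidiff2 inline-JSON diff string per revision,
    # each computed against the previous revision (the first against "").
    response = requests.post("http://127.0.0.1:8000", json=revision_texts)
    response.raise_for_status()

    for diff in response.json():
        entries = json.loads(diff)["diff"]
        print([entry["type"] for entry in entries])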
wikiq (9 lines changed)
							| @ -140,7 +140,6 @@ The pattern can include capture groups.  If it does then each capture group will | ||||
| If the pattern does not include a capture group, then only one output column will result. | ||||
| """ | ||||
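| # Illustration of the behavior described above (hypothetical patterns, not | ||||
| # part of the committed code): a pattern such as r"(foo) (bar)" has two | ||||
| # capture groups and so yields two output columns, while a pattern with no | ||||
| # capture groups, such as r"foo bar", yields a single column. | ||||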
| 
 | ||||
| 
 | ||||
| class RegexPair(object): | ||||
|     def __init__(self, pattern, label): | ||||
|         self.pattern = re.compile(pattern) | ||||
| @ -219,7 +218,7 @@ class WikiqParser: | ||||
|                  revert_radius: int = 15, | ||||
|                  output_parquet: bool = True, | ||||
|                  parquet_buffer_size: int = 2000, | ||||
|                  wikidiff_url: str = "", | ||||
|                  wikidiff_url: str = "http://127.0.0.1:8000", | ||||
|                  ): | ||||
| 
 | ||||
|         """  | ||||
| @ -450,9 +449,9 @@ class WikiqParser: | ||||
|                     state = mwpersistence.DiffState(SegmentMatcher(tokenizer=wikitext_split), | ||||
|                                                     revert_radius=PERSISTENCE_RADIUS) | ||||
|                 elif self.persist == PersistMethod.wikidiff: | ||||
|                     state = mwpersistence.DiffState(WikiDiffMatcher(self.wikidiff_url, | ||||
|                                                                     revision_texts, | ||||
|                                                                     tokenizer=wikitext_split), | ||||
|                     state = mwpersistence.DiffState(WikiDiffMatcher(revision_texts, | ||||
|                                                                     tokenizer=wikitext_split, | ||||
|                                                                     url=self.wikidiff_url), | ||||
|                                                     revert_radius=PERSISTENCE_RADIUS) | ||||
|                 else: | ||||
|                     from mw.lib import persistence | ||||
|  | ||||