make wikiq memory efficient again via batch processing.
This commit is contained in:
@@ -96,48 +96,6 @@ class DiffToOperationMap:
|
||||
del self.from_par_move_dict[rkey]
|
||||
break
|
||||
|
||||
# if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
|
||||
# print("Couldn't find exact matches for all parmoves!")
|
||||
# # we couldn't find all the matches via exact match
|
||||
# # let's try matching based on line number instead
|
||||
# lkeys_to_remove = []
|
||||
# for lkey, from_diff in self.from_par_move_dict.items():
|
||||
# from_linenum = from_diff["moveInfo"]["linkId"].split("_")[2]
|
||||
# rkey_to_remove = None
|
||||
# for rkey, to_diff in self.to_par_move_dict.items():
|
||||
# to_linenum = rkey.split("_")[2]
|
||||
# if from_linenum == to_linenum:
|
||||
# print("Matching on line number")
|
||||
# yield from self.doParMove(from_diff, to_diff)
|
||||
# rkey_to_remove = rkey
|
||||
# lkeys_to_remove.append(lkey)
|
||||
# break
|
||||
# if rkey_to_remove is not None:
|
||||
# del self.to_par_move_dict[rkey_to_remove]
|
||||
# for lkey in lkeys_to_remove:
|
||||
# del self.from_par_move_dict[lkey]
|
||||
|
||||
# if len(self.from_par_move_dict) > 0 or len(self.to_par_move_dict) > 0:
|
||||
# print("Couldn't find exact matches for all parmoves!")
|
||||
# # we couldn't find all the matches via exact match or line number
|
||||
# # let's try matching based on opIndex instead
|
||||
# lkeys_to_remove = []
|
||||
# for lkey, from_diff in self.from_par_move_dict.items():
|
||||
# rkey_to_remove = None
|
||||
# from_idx = from_diff["moveInfo"]["linkId"].split("_")[1]
|
||||
# for rkey, to_diff in self.to_par_move_dict.items():
|
||||
# to_idx = rkey.split("_")[1]
|
||||
# print(from_idx)
|
||||
# print(to_idx)
|
||||
# if from_idx == to_idx:
|
||||
# yield from self.doParMove(from_diff, to_diff)
|
||||
# rkey_to_remove = rkey
|
||||
# lkeys_to_remove.append(lkey)
|
||||
# if rkey_to_remove is not None:
|
||||
# del self.to_par_move_dict[rkey_to_remove]
|
||||
# for lkey in lkeys_to_remove:
|
||||
# del self.from_par_move_dict[lkey]
|
||||
|
||||
# we couldn't find matches. treat type 4 as removal and type 5 as highlight.
|
||||
for from_diff in self.from_par_move_dict.values():
|
||||
yield from self.doDelete(from_diff)
|
||||
@@ -368,22 +326,21 @@ class DiffToOperationMap:
|
||||
class WikiDiffMatcher:
|
||||
def __init__(
|
||||
self,
|
||||
texts: list[str] = None,
|
||||
tokenizer: Optional[RegexTokenizer] = None,
|
||||
):
|
||||
differ = pywikidiff2.pywikidiff2(
|
||||
numContextLines=1000000, moved_paragraph_detection_cutoff=200000
|
||||
)
|
||||
# Pre-compute diffs to reduce traffic overhead.
|
||||
self.diffs = [json.loads(diff) for diff in differ.inline_json_diff_sequence(list(texts))]
|
||||
self.tokenizer = tokenizer or TOKENIZER
|
||||
|
||||
|
||||
|
||||
class Processor(DiffEngine.Processor):
|
||||
def __init__(self, texts, tokenizer=None):
|
||||
self.diffs = iter(texts)
|
||||
def __init__(self, tokenizer=None):
|
||||
self.tokenizer = tokenizer or TOKENIZER
|
||||
self.last_tokens = []
|
||||
self.previous_text = ""
|
||||
self.differ = pywikidiff2.pywikidiff2(
|
||||
numContextLines=1000000, moved_paragraph_detection_cutoff=200000
|
||||
)
|
||||
self.last_diff = None
|
||||
|
||||
def update(self, last_tokens):
|
||||
self.last_tokens = last_tokens
|
||||
@@ -391,7 +348,8 @@ class WikiDiffMatcher:
|
||||
def process(self, text, token_class=None):
|
||||
# The diff has already been computed, but we need to incrementally
|
||||
# retrieve it to recreate the behavior DiffState expects.
|
||||
diff = next(self.diffs)
|
||||
diff = json.loads(self.differ.inline_json_diff(self.previous_text, text))
|
||||
self.last_diff = diff
|
||||
diffToOperationsMapper = DiffToOperationMap(diff, self.tokenizer)
|
||||
|
||||
diffops = list(diffToOperationsMapper.to_operations())
|
||||
@@ -444,7 +402,7 @@ class WikiDiffMatcher:
|
||||
return border_ops, self.last_tokens, tokens
|
||||
|
||||
def processor(self, *args, **kwargs):
|
||||
return self.Processor(self.diffs, self.tokenizer)
|
||||
return self.Processor(self.tokenizer)
|
||||
|
||||
def process(self):
|
||||
# DiffState checks for this method even though it is not called.
|
||||
|
||||
Reference in New Issue
Block a user