configure pywikidiff2 cache limits.
This commit is contained in:
parent
83c92d1a37
commit
54e996b910
@@ -3,8 +3,8 @@
|
||||
# original wikiq headers are: title articleid revid date_time anon
|
||||
# editor editor_id minor text_size text_entropy text_md5 reversion
|
||||
# additions_size deletions_size
|
||||
import gc
|
||||
import argparse
|
||||
import gc
|
||||
import json
|
||||
import os.path
|
||||
import re
|
||||
@@ -465,7 +465,6 @@ class WikiqParser:
|
||||
next_batch = {}
|
||||
diff_dict = {}
|
||||
|
||||
|
||||
if self.persist != PersistMethod.none:
|
||||
window = deque(maxlen=PERSISTENCE_RADIUS)
|
||||
if self.persist != PersistMethod.none:
|
||||
@@ -488,11 +487,14 @@ class WikiqParser:
|
||||
from mw.lib import persistence
|
||||
|
||||
persist_state = persistence.State()
|
||||
|
||||
|
||||
q
|
||||
if self.diff:
|
||||
differ = pywikidiff2.pywikidiff2(
|
||||
numContextLines=1000000, moved_paragraph_detection_cutoff=2000
|
||||
numContextLines=1000000,
|
||||
moved_paragraph_detection_cutoff=200000,
|
||||
words_cache_capacity=10000,
|
||||
diff_cache_capacity=10000,
|
||||
stats_cache_capacity=100000,
|
||||
)
|
||||
|
||||
while not on_last_batch:
|
||||
@@ -595,7 +597,6 @@ class WikiqParser:
|
||||
row_buffer[k] = v
|
||||
regex_matches = {}
|
||||
|
||||
|
||||
# begin persistence logic
|
||||
if self.persist != PersistMethod.none:
|
||||
row_buffer["token_revs"] = []
|
||||
@@ -937,7 +938,7 @@ def main():
|
||||
diff=args.diff,
|
||||
output_parquet=output_parquet,
|
||||
partition_namespaces=args.partition_namespaces,
|
||||
batch_size = args.batch_size
|
||||
batch_size=args.batch_size,
|
||||
)
|
||||
|
||||
wikiq.process()
|
||||
@@ -960,7 +961,7 @@ def main():
|
||||
regex_comment_label=args.regex_comment_label,
|
||||
diff=args.diff,
|
||||
text=args.text,
|
||||
batch_size=args.batch_size
|
||||
batch_size=args.batch_size,
|
||||
)
|
||||
|
||||
wikiq.process()
|
||||
|
@@ -330,15 +330,17 @@ class WikiDiffMatcher:
|
||||
):
|
||||
self.tokenizer = tokenizer or TOKENIZER
|
||||
|
||||
|
||||
|
||||
class Processor(DiffEngine.Processor):
|
||||
def __init__(self, tokenizer=None):
|
||||
self.tokenizer = tokenizer or TOKENIZER
|
||||
self.last_tokens = []
|
||||
self.previous_text = ""
|
||||
self.differ = pywikidiff2.pywikidiff2(
|
||||
numContextLines=1000000, moved_paragraph_detection_cutoff=200000
|
||||
numContextLines=1000000,
|
||||
moved_paragraph_detection_cutoff=200000,
|
||||
words_cache_capacity=10000,
|
||||
diff_cache_capacity=10000,
|
||||
stats_cache_capacity=100000,
|
||||
)
|
||||
self.last_diff = None
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user