configure pywikidiff2 cache limits.
This commit is contained in:
parent
83c92d1a37
commit
54e996b910
@ -3,8 +3,8 @@
|
|||||||
# original wikiq headers are: title articleid revid date_time anon
|
# original wikiq headers are: title articleid revid date_time anon
|
||||||
# editor editor_id minor text_size text_entropy text_md5 reversion
|
# editor editor_id minor text_size text_entropy text_md5 reversion
|
||||||
# additions_size deletions_size
|
# additions_size deletions_size
|
||||||
import gc
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import gc
|
||||||
import json
|
import json
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
@ -465,7 +465,6 @@ class WikiqParser:
|
|||||||
next_batch = {}
|
next_batch = {}
|
||||||
diff_dict = {}
|
diff_dict = {}
|
||||||
|
|
||||||
|
|
||||||
if self.persist != PersistMethod.none:
|
if self.persist != PersistMethod.none:
|
||||||
window = deque(maxlen=PERSISTENCE_RADIUS)
|
window = deque(maxlen=PERSISTENCE_RADIUS)
|
||||||
if self.persist != PersistMethod.none:
|
if self.persist != PersistMethod.none:
|
||||||
@ -488,11 +487,14 @@ class WikiqParser:
|
|||||||
from mw.lib import persistence
|
from mw.lib import persistence
|
||||||
|
|
||||||
persist_state = persistence.State()
|
persist_state = persistence.State()
|
||||||
|
q
|
||||||
|
|
||||||
if self.diff:
|
if self.diff:
|
||||||
differ = pywikidiff2.pywikidiff2(
|
differ = pywikidiff2.pywikidiff2(
|
||||||
numContextLines=1000000, moved_paragraph_detection_cutoff=2000
|
numContextLines=1000000,
|
||||||
|
moved_paragraph_detection_cutoff=200000,
|
||||||
|
words_cache_capacity=10000,
|
||||||
|
diff_cache_capacity=10000,
|
||||||
|
stats_cache_capacity=100000,
|
||||||
)
|
)
|
||||||
|
|
||||||
while not on_last_batch:
|
while not on_last_batch:
|
||||||
@ -595,7 +597,6 @@ class WikiqParser:
|
|||||||
row_buffer[k] = v
|
row_buffer[k] = v
|
||||||
regex_matches = {}
|
regex_matches = {}
|
||||||
|
|
||||||
|
|
||||||
# begin persistence logic
|
# begin persistence logic
|
||||||
if self.persist != PersistMethod.none:
|
if self.persist != PersistMethod.none:
|
||||||
row_buffer["token_revs"] = []
|
row_buffer["token_revs"] = []
|
||||||
@ -937,7 +938,7 @@ def main():
|
|||||||
diff=args.diff,
|
diff=args.diff,
|
||||||
output_parquet=output_parquet,
|
output_parquet=output_parquet,
|
||||||
partition_namespaces=args.partition_namespaces,
|
partition_namespaces=args.partition_namespaces,
|
||||||
batch_size = args.batch_size
|
batch_size=args.batch_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
wikiq.process()
|
wikiq.process()
|
||||||
@ -960,7 +961,7 @@ def main():
|
|||||||
regex_comment_label=args.regex_comment_label,
|
regex_comment_label=args.regex_comment_label,
|
||||||
diff=args.diff,
|
diff=args.diff,
|
||||||
text=args.text,
|
text=args.text,
|
||||||
batch_size=args.batch_size
|
batch_size=args.batch_size,
|
||||||
)
|
)
|
||||||
|
|
||||||
wikiq.process()
|
wikiq.process()
|
||||||
|
@ -330,15 +330,17 @@ class WikiDiffMatcher:
|
|||||||
):
|
):
|
||||||
self.tokenizer = tokenizer or TOKENIZER
|
self.tokenizer = tokenizer or TOKENIZER
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Processor(DiffEngine.Processor):
|
class Processor(DiffEngine.Processor):
|
||||||
def __init__(self, tokenizer=None):
|
def __init__(self, tokenizer=None):
|
||||||
self.tokenizer = tokenizer or TOKENIZER
|
self.tokenizer = tokenizer or TOKENIZER
|
||||||
self.last_tokens = []
|
self.last_tokens = []
|
||||||
self.previous_text = ""
|
self.previous_text = ""
|
||||||
self.differ = pywikidiff2.pywikidiff2(
|
self.differ = pywikidiff2.pywikidiff2(
|
||||||
numContextLines=1000000, moved_paragraph_detection_cutoff=200000
|
numContextLines=1000000,
|
||||||
|
moved_paragraph_detection_cutoff=200000,
|
||||||
|
words_cache_capacity=10000,
|
||||||
|
diff_cache_capacity=10000,
|
||||||
|
stats_cache_capacity=100000,
|
||||||
)
|
)
|
||||||
self.last_diff = None
|
self.last_diff = None
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user