diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py index 9dcc64b..37f2e22 100755 --- a/src/wikiq/__init__.py +++ b/src/wikiq/__init__.py @@ -3,8 +3,8 @@ # original wikiq headers are: title articleid revid date_time anon # editor editor_id minor text_size text_entropy text_md5 reversion # additions_size deletions_size -import gc import argparse +import gc import json import os.path import re @@ -465,7 +465,6 @@ class WikiqParser: next_batch = {} diff_dict = {} - if self.persist != PersistMethod.none: window = deque(maxlen=PERSISTENCE_RADIUS) if self.persist != PersistMethod.none: @@ -488,11 +487,14 @@ class WikiqParser: from mw.lib import persistence persist_state = persistence.State() - - + if self.diff: differ = pywikidiff2.pywikidiff2( - numContextLines=1000000, moved_paragraph_detection_cutoff=2000 + numContextLines=1000000, + moved_paragraph_detection_cutoff=200000, + words_cache_capacity=10000, + diff_cache_capacity=10000, + stats_cache_capacity=100000, ) while not on_last_batch: @@ -595,7 +597,6 @@ class WikiqParser: row_buffer[k] = v regex_matches = {} - # begin persistence logic if self.persist != PersistMethod.none: row_buffer["token_revs"] = [] @@ -937,7 +938,7 @@ def main(): diff=args.diff, output_parquet=output_parquet, partition_namespaces=args.partition_namespaces, - batch_size = args.batch_size + batch_size=args.batch_size, ) wikiq.process() @@ -960,7 +961,7 @@ def main(): regex_comment_label=args.regex_comment_label, diff=args.diff, text=args.text, - batch_size=args.batch_size + batch_size=args.batch_size, ) wikiq.process() diff --git a/src/wikiq/wiki_diff_matcher.py b/src/wikiq/wiki_diff_matcher.py index 6dfdab6..f9b8234 100644 --- a/src/wikiq/wiki_diff_matcher.py +++ b/src/wikiq/wiki_diff_matcher.py @@ -330,15 +330,17 @@ class WikiDiffMatcher: ): self.tokenizer = tokenizer or TOKENIZER - - class Processor(DiffEngine.Processor): def __init__(self, tokenizer=None): self.tokenizer = tokenizer or TOKENIZER self.last_tokens = []
self.previous_text = "" self.differ = pywikidiff2.pywikidiff2( - numContextLines=1000000, moved_paragraph_detection_cutoff=200000 + numContextLines=1000000, + moved_paragraph_detection_cutoff=200000, + words_cache_capacity=10000, + diff_cache_capacity=10000, + stats_cache_capacity=100000, ) self.last_diff = None