configure pywikidiff2 cache limits.

This commit is contained in:
Nathan TeBlunthuis 2025-08-01 09:24:54 -07:00
parent 83c92d1a37
commit 54e996b910
2 changed files with 14 additions and 11 deletions

View File

@@ -3,8 +3,8 @@
# original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size
-import gc
 import argparse
+import gc
import json
import os.path
import re
@@ -465,7 +465,6 @@ class WikiqParser:
next_batch = {}
diff_dict = {}
if self.persist != PersistMethod.none:
window = deque(maxlen=PERSISTENCE_RADIUS)
if self.persist != PersistMethod.none:
@@ -488,11 +487,14 @@ class WikiqParser:
from mw.lib import persistence
persist_state = persistence.State()
-q
 if self.diff:
 differ = pywikidiff2.pywikidiff2(
-numContextLines=1000000, moved_paragraph_detection_cutoff=2000
+numContextLines=1000000,
+moved_paragraph_detection_cutoff=200000,
+words_cache_capacity=10000,
+diff_cache_capacity=10000,
+stats_cache_capacity=100000,
 )
while not on_last_batch:
@@ -595,7 +597,6 @@ class WikiqParser:
row_buffer[k] = v
regex_matches = {}
# begin persistence logic
if self.persist != PersistMethod.none:
row_buffer["token_revs"] = []
@@ -937,7 +938,7 @@ def main():
diff=args.diff,
output_parquet=output_parquet,
partition_namespaces=args.partition_namespaces,
-batch_size = args.batch_size
+batch_size=args.batch_size,
)
wikiq.process()
@@ -960,7 +961,7 @@ def main():
regex_comment_label=args.regex_comment_label,
diff=args.diff,
text=args.text,
-batch_size=args.batch_size
+batch_size=args.batch_size,
)
wikiq.process()

View File

@@ -330,15 +330,17 @@ class WikiDiffMatcher:
):
self.tokenizer = tokenizer or TOKENIZER
class Processor(DiffEngine.Processor):
def __init__(self, tokenizer=None):
self.tokenizer = tokenizer or TOKENIZER
self.last_tokens = []
self.previous_text = ""
self.differ = pywikidiff2.pywikidiff2(
-numContextLines=1000000, moved_paragraph_detection_cutoff=200000
+numContextLines=1000000,
+moved_paragraph_detection_cutoff=200000,
+words_cache_capacity=10000,
+diff_cache_capacity=10000,
+stats_cache_capacity=100000,
)
self.last_diff = None