Configure pywikidiff2 cache limits.

This commit is contained in:
Nathan TeBlunthuis 2025-08-01 09:24:54 -07:00
parent 83c92d1a37
commit 54e996b910
2 changed files with 14 additions and 11 deletions

View File

@ -3,8 +3,8 @@
# original wikiq headers are: title articleid revid date_time anon # original wikiq headers are: title articleid revid date_time anon
# editor editor_id minor text_size text_entropy text_md5 reversion # editor editor_id minor text_size text_entropy text_md5 reversion
# additions_size deletions_size # additions_size deletions_size
import gc
import argparse import argparse
import gc
import json import json
import os.path import os.path
import re import re
@ -465,7 +465,6 @@ class WikiqParser:
next_batch = {} next_batch = {}
diff_dict = {} diff_dict = {}
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none:
window = deque(maxlen=PERSISTENCE_RADIUS) window = deque(maxlen=PERSISTENCE_RADIUS)
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none:
@ -488,11 +487,14 @@ class WikiqParser:
from mw.lib import persistence from mw.lib import persistence
persist_state = persistence.State() persist_state = persistence.State()
q
if self.diff: if self.diff:
differ = pywikidiff2.pywikidiff2( differ = pywikidiff2.pywikidiff2(
numContextLines=1000000, moved_paragraph_detection_cutoff=2000 numContextLines=1000000,
moved_paragraph_detection_cutoff=200000,
words_cache_capacity=10000,
diff_cache_capacity=10000,
stats_cache_capacity=100000,
) )
while not on_last_batch: while not on_last_batch:
@ -595,7 +597,6 @@ class WikiqParser:
row_buffer[k] = v row_buffer[k] = v
regex_matches = {} regex_matches = {}
# begin persistence logic # begin persistence logic
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none:
row_buffer["token_revs"] = [] row_buffer["token_revs"] = []
@ -937,7 +938,7 @@ def main():
diff=args.diff, diff=args.diff,
output_parquet=output_parquet, output_parquet=output_parquet,
partition_namespaces=args.partition_namespaces, partition_namespaces=args.partition_namespaces,
batch_size = args.batch_size batch_size=args.batch_size,
) )
wikiq.process() wikiq.process()
@ -960,7 +961,7 @@ def main():
regex_comment_label=args.regex_comment_label, regex_comment_label=args.regex_comment_label,
diff=args.diff, diff=args.diff,
text=args.text, text=args.text,
batch_size=args.batch_size batch_size=args.batch_size,
) )
wikiq.process() wikiq.process()

View File

@ -330,15 +330,17 @@ class WikiDiffMatcher:
): ):
self.tokenizer = tokenizer or TOKENIZER self.tokenizer = tokenizer or TOKENIZER
class Processor(DiffEngine.Processor): class Processor(DiffEngine.Processor):
def __init__(self, tokenizer=None): def __init__(self, tokenizer=None):
self.tokenizer = tokenizer or TOKENIZER self.tokenizer = tokenizer or TOKENIZER
self.last_tokens = [] self.last_tokens = []
self.previous_text = "" self.previous_text = ""
self.differ = pywikidiff2.pywikidiff2( self.differ = pywikidiff2.pywikidiff2(
numContextLines=1000000, moved_paragraph_detection_cutoff=200000 numContextLines=1000000,
moved_paragraph_detection_cutoff=200000,
words_cache_capacity=10000,
diff_cache_capacity=10000,
stats_cache_capacity=100000,
) )
self.last_diff = None self.last_diff = None