Decrease the moved-paragraph detection cutoff to see if that fixes the memory issue.

This commit is contained in:
Nathan TeBlunthuis 2025-07-22 13:29:01 -07:00
parent 076df15740
commit 83c92d1a37

View File

@ -489,6 +489,12 @@ class WikiqParser:
persist_state = persistence.State()
if self.diff:
differ = pywikidiff2.pywikidiff2(
numContextLines=1000000, moved_paragraph_detection_cutoff=2000
)
while not on_last_batch:
# first loop: next_batch <- batch;
# second loop: next_batch <- batch; evaluate next_batch.
@ -649,11 +655,6 @@ class WikiqParser:
last_text = last_rev_text
new_diffs = []
for text in row_buffer["text"]:
differ = pywikidiff2.pywikidiff2(
numContextLines=1000000, moved_paragraph_detection_cutoff=200000
)
new_diffs.append(differ.inline_json_diff(last_text, text))
last_text = text
row_buffer["diff"] = [