From 83c92d1a375b70ab74d4597cfa01636f491275f1 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Tue, 22 Jul 2025 13:29:01 -0700 Subject: [PATCH] decrease moved paragraph detection cutoff from 200000 to 2000 and construct the differ once, outside the per-text loop, to see if that fixes the memory issue. --- src/wikiq/__init__.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py index 5047658..9dcc64b 100755 --- a/src/wikiq/__init__.py +++ b/src/wikiq/__init__.py @@ -489,6 +489,12 @@ class WikiqParser: persist_state = persistence.State() + + if self.diff: + differ = pywikidiff2.pywikidiff2( + numContextLines=1000000, moved_paragraph_detection_cutoff=2000 + ) + while not on_last_batch: # first loop: next_batch <- batch; # second loop: next_batch <- batch; evaluate next_batch. @@ -649,11 +655,6 @@ class WikiqParser: last_text = last_rev_text new_diffs = [] for text in row_buffer["text"]: - - differ = pywikidiff2.pywikidiff2( - numContextLines=1000000, moved_paragraph_detection_cutoff=200000 - ) - new_diffs.append(differ.inline_json_diff(last_text, text)) last_text = text row_buffer["diff"] = [