Make a new pywikidiff2 object for each revision to reduce memory usage.

This commit is contained in:
Nathan TeBlunthuis 2025-07-22 09:50:30 -07:00
parent d20075b323
commit 6557e25af7

View File

@@ -464,10 +464,6 @@ class WikiqParser:
next_batch = {} next_batch = {}
diff_dict = {} diff_dict = {}
if self.diff:
differ = pywikidiff2.pywikidiff2(
numContextLines=1000000, moved_paragraph_detection_cutoff=200000
)
if self.persist != PersistMethod.none: if self.persist != PersistMethod.none:
window = deque(maxlen=PERSISTENCE_RADIUS) window = deque(maxlen=PERSISTENCE_RADIUS)
@@ -652,6 +648,11 @@ class WikiqParser:
last_text = last_rev_text last_text = last_rev_text
new_diffs = [] new_diffs = []
for text in row_buffer["text"]: for text in row_buffer["text"]:
differ = pywikidiff2.pywikidiff2(
numContextLines=1000000, moved_paragraph_detection_cutoff=200000
)
new_diffs.append(differ.inline_json_diff(last_text, text)) new_diffs.append(differ.inline_json_diff(last_text, text))
last_text = text last_text = text
row_buffer["diff"] = [ row_buffer["diff"] = [