Timeout diffs.

This commit is contained in:
Nathan TeBlunthuis 2025-08-03 20:02:18 -07:00
parent 730c678f51
commit a563eaf6fc

View File

@ -23,13 +23,14 @@ import pywikidiff2
from deltas.tokenizers import wikitext_split from deltas.tokenizers import wikitext_split
from more_itertools import ichunked from more_itertools import ichunked
from mwxml import Dump from mwxml import Dump
import asyncio
import wikiq.tables as tables import wikiq.tables as tables
from wikiq.tables import RevisionTable from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher from wikiq.wiki_diff_matcher import WikiDiffMatcher
TO_ENCODE = ("title", "editor") TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7 PERSISTENCE_RADIUS = 7
DIFF_TIMEOUT = 60*20
from pathlib import Path from pathlib import Path
import pyarrow as pa import pyarrow as pa
@ -46,6 +47,15 @@ class PersistMethod:
wikidiff2 = 4 wikidiff2 = 4
async def diff_async(differ, last_text, text):
    """Compute an inline JSON diff of two revisions, bounded by DIFF_TIMEOUT.

    Args:
        differ: a pywikidiff2 differ exposing ``inline_json_diff(old, new)``.
        last_text: the previous revision's wikitext.
        text: the current revision's wikitext.

    Returns:
        Whatever ``differ.inline_json_diff`` returns (a JSON diff string/obj).

    Raises:
        asyncio.TimeoutError: if the diff does not finish within DIFF_TIMEOUT
            seconds; the caller falls back to a faster differ configuration.
    """
    # The diff call is synchronous C code. Awaiting it directly on the event
    # loop would block the loop, so asyncio.wait_for could never fire and the
    # timeout would be dead code. Running it in a worker thread lets wait_for
    # actually raise after DIFF_TIMEOUT seconds.
    # NOTE(review): the worker thread keeps running after a timeout (it cannot
    # be cancelled); the caller only abandons its result.
    return await asyncio.wait_for(
        asyncio.to_thread(differ.inline_json_diff, last_text, text),
        DIFF_TIMEOUT,
    )
def calculate_persistence(tokens_added): def calculate_persistence(tokens_added):
return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added)) return (sum([(len(x.revisions) - 1) for x in tokens_added]), len(tokens_added))
@ -493,11 +503,21 @@ class WikiqParser:
num_context_lines=1000000, num_context_lines=1000000,
max_word_level_diff_complexity=-1, max_word_level_diff_complexity=-1,
moved_paragraph_detection_cutoff=-1, moved_paragraph_detection_cutoff=-1,
words_cache_capacity=10000,
diff_cache_capacity=10000,
stats_cache_capacity=10000,
)
fast_differ = pywikidiff2.pywikidiff2(
num_context_lines=1000000,
max_word_level_diff_complexity=40000000,
moved_paragraph_detection_cutoff=100,
words_cache_capacity=-1, words_cache_capacity=-1,
diff_cache_capacity=-1, diff_cache_capacity=-1,
stats_cache_capacity=-1, stats_cache_capacity=-1,
) )
while not on_last_batch: while not on_last_batch:
# first loop: next_batch <- batch; # first loop: next_batch <- batch;
# second loop: next_batch <- batch; evaluate next_batch. # second loop: next_batch <- batch; evaluate next_batch.
@ -657,7 +677,12 @@ class WikiqParser:
last_text = last_rev_text last_text = last_rev_text
new_diffs = [] new_diffs = []
for text in row_buffer["text"]: for text in row_buffer["text"]:
new_diffs.append(differ.inline_json_diff(last_text, text)) try:
diff = asyncio.run(diff_async(differ, last_text, text))
except asyncio.TimeoutError:
print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid']}. Falling back to default limits.", file=sys.stderr)
diff = fast_differ.inline_json_diff(last_text, text)
new_diffs.append(diff)
last_text = text last_text = text
row_buffer["diff"] = [ row_buffer["diff"] = [
[ [