Timeout diffs.

This commit is contained in:
Nathan TeBlunthuis 2025-08-03 20:02:18 -07:00
parent 730c678f51
commit a563eaf6fc

View File

@ -23,13 +23,14 @@ import pywikidiff2
from deltas.tokenizers import wikitext_split
from more_itertools import ichunked
from mwxml import Dump
import asyncio
import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher
TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7
# Time limit in seconds (60*20 = 20 minutes) that diff_async passes to
# asyncio.wait_for for a single inline_json_diff call.
DIFF_TIMEOUT = 60*20
from pathlib import Path
import pyarrow as pa
@ -46,6 +47,15 @@ class PersistMethod:
wikidiff2 = 4
async def diff_async(differ, last_text, text):
    """Run ``differ.inline_json_diff(last_text, text)`` under a time limit.

    ``inline_json_diff`` is an ordinary blocking call. Calling it directly
    inside a coroutine never hands control back to the event loop, so a
    surrounding ``asyncio.wait_for`` has no opportunity to expire. The diff
    is therefore computed on a separate daemon thread and only the future
    carrying its outcome is awaited.

    Args:
        differ: Object exposing ``inline_json_diff(old, new)``.
        last_text: Text of the previous revision.
        text: Text of the current revision.

    Returns:
        Whatever ``differ.inline_json_diff(last_text, text)`` returns.

    Raises:
        asyncio.TimeoutError: No result arrived within ``DIFF_TIMEOUT``
            seconds.
        Exception: Any exception raised by the differ is re-raised here.

    NOTE(review): a timed-out worker thread cannot be interrupted; it keeps
    running (and keeps using ``differ``) until the diff returns. Confirm the
    differ tolerates a later call overlapping with an abandoned one.
    NOTE(review): the timeout can only fire if the differ releases the GIL
    while it works -- confirm against the pywikidiff2 extension.
    """
    import threading  # local import: nothing else in this block needs it

    loop = asyncio.get_running_loop()
    outcome = loop.create_future()

    def _deliver(value, error):
        # Runs on the event-loop thread. After a timeout wait_for has already
        # cancelled the future, so a late outcome is simply dropped.
        if outcome.done():
            return
        if error is not None:
            outcome.set_exception(error)
        else:
            outcome.set_result(value)

    def _worker():
        value, error = None, None
        try:
            value = differ.inline_json_diff(last_text, text)
        except Exception as exc:
            error = exc
        try:
            loop.call_soon_threadsafe(_deliver, value, error)
        except RuntimeError:
            # The loop was closed after a timeout; nobody is waiting any more.
            pass

    # Daemon thread: an abandoned diff must not keep the interpreter alive,
    # and (unlike the loop's default executor) it is not joined on shutdown.
    threading.Thread(target=_worker, daemon=True).start()
    return await asyncio.wait_for(outcome, DIFF_TIMEOUT)
def calculate_persistence(tokens_added):
    """Summarise the revision counts of a collection of tokens.

    Args:
        tokens_added: Sized iterable of token objects, each exposing a sized
            ``revisions`` attribute.

    Returns:
        A ``(persisted, token_count)`` tuple. ``persisted`` is the sum over
        all tokens of ``len(token.revisions) - 1`` (presumably discounting
        the revision that introduced the token -- confirm against caller);
        ``token_count`` is ``len(tokens_added)``.
    """
    # Generator expression: no intermediate list is built just to be summed.
    persisted = sum(len(token.revisions) - 1 for token in tokens_added)
    return (persisted, len(tokens_added))
@ -493,11 +503,21 @@ class WikiqParser:
num_context_lines=1000000,
max_word_level_diff_complexity=-1,
moved_paragraph_detection_cutoff=-1,
words_cache_capacity=10000,
diff_cache_capacity=10000,
stats_cache_capacity=10000,
)
# Second differ used as a fallback: when the primary differ's diff raises
# asyncio.TimeoutError, that revision is diffed again with this instance.
# NOTE(review): the timeout warning calls these "default limits" --
# presumably they mirror wikidiff2's stock settings; confirm against
# pywikidiff2.
fast_differ = pywikidiff2.pywikidiff2(
num_context_lines=1000000,
max_word_level_diff_complexity=40000000,
moved_paragraph_detection_cutoff=100,
words_cache_capacity=-1,
diff_cache_capacity=-1,
stats_cache_capacity=-1,
)
while not on_last_batch:
# first loop: next_batch <- batch;
# second loop: next_batch <- batch; evaluate next_batch.
@ -657,7 +677,12 @@ class WikiqParser:
last_text = last_rev_text
new_diffs = []
for text in row_buffer["text"]:
    # Diff each revision text against its predecessor. The primary differ is
    # tried first under diff_async's time limit; on timeout the same pair is
    # diffed again with fast_differ so every revision still gets a diff.
    try:
        diff = asyncio.run(diff_async(differ, last_text, text))
    except asyncio.TimeoutError:
        # NOTE(review): this prints row_buffer['revid'] as a whole rather
        # than the id of the revision that timed out -- confirm intent.
        print(
            f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid']}. Falling back to default limits.",
            file=sys.stderr,
        )
        diff = fast_differ.inline_json_diff(last_text, text)
    # Exactly one diff is appended per revision.
    new_diffs.append(diff)
    last_text = text
row_buffer["diff"] = [
[