use the wikidiff2 diff timeout instead of async.

This commit is contained in:
Nathan TeBlunthuis 2025-12-13 14:29:16 -08:00
parent 5d1a246898
commit 2c54425726

View File

@ -10,6 +10,7 @@ import os.path
import re
import signal
import sys
import threading
import time
from collections import deque
from hashlib import sha1
@ -25,7 +26,6 @@ import pywikidiff2
from deltas.tokenizers import wikitext_split
from more_itertools import ichunked
from mwxml import Dump
import asyncio
import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher
@ -40,7 +40,7 @@ from wikiq.resume import (
TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7
DIFF_TIMEOUT = 60
DIFF_TIMEOUT_MS = 60000
from pathlib import Path
import pyarrow as pa
@ -57,17 +57,10 @@ class PersistMethod:
wikidiff2 = 4
async def diff_async(differ, last_text, text):
"""Returns (result, timed_out) tuple."""
try:
loop = asyncio.get_running_loop()
result = await asyncio.wait_for(
asyncio.to_thread(differ.inline_json_diff, last_text, text),
timeout=DIFF_TIMEOUT
)
return result, False
except TimeoutError as e:
return None, True
def diff_with_timeout(differ, last_text, text):
"""Returns (result, timed_out) tuple using native pywikidiff2 timeout."""
result = differ.inline_json_diff(last_text, text, timeout_ms=DIFF_TIMEOUT_MS)
return result, differ.timed_out()
def calculate_persistence(tokens_added):
@ -286,7 +279,6 @@ class WikiqParser:
self.headings = headings
self.shutdown_requested = False
self.time_limit_seconds = time_limit_seconds
self.start_time = None
if namespaces is not None:
self.namespace_filter = set(namespaces)
else:
@ -323,16 +315,26 @@ class WikiqParser:
"""Request graceful shutdown. The process() method will exit after completing the current batch."""
self.shutdown_requested = True
def _check_time_limit(self):
"""Check if the time limit has been exceeded. If so, request shutdown."""
if self.time_limit_seconds is None or self.start_time is None:
return
elapsed = time.time() - self.start_time
if elapsed >= self.time_limit_seconds:
hours = elapsed / 3600
print(f"Time limit of {self.time_limit_seconds/3600:.2f} hours reached ({hours:.2f} hours elapsed), requesting shutdown...", file=sys.stderr)
def _time_limit_expired(self):
"""Timer callback when time limit is reached."""
hours = self.time_limit_seconds / 3600
print(f"Time limit of {hours:.2f} hours reached, requesting shutdown...", file=sys.stderr)
self.request_shutdown()
def _start_time_limit_timer(self):
"""Start a background timer to trigger shutdown when time limit is reached."""
if self.time_limit_seconds is None:
return None
timer = threading.Timer(self.time_limit_seconds, self._time_limit_expired)
timer.daemon = True
timer.start()
return timer
def _cancel_time_limit_timer(self, timer):
"""Cancel the time limit timer if it's still running."""
if timer is not None:
timer.cancel()
def _open_checkpoint(self, output_file):
"""Open checkpoint file for writing. Keeps file open for performance."""
if not self.output_parquet or output_file == sys.stdout.buffer:
@ -438,8 +440,8 @@ class WikiqParser:
return default_ns
def process(self):
# Record start time for time limit checking
self.start_time = time.time()
# Start time limit timer if configured
time_limit_timer = self._start_time_limit_timer()
# Track whether we've passed the resume point
# For partitioned output, this is a dict mapping namespace -> bool
@ -753,9 +755,6 @@ class WikiqParser:
if self.shutdown_requested:
break
# Check time limit after each batch
self._check_time_limit()
# If shutdown requested, skip all remaining processing and close writers
if self.shutdown_requested:
print("Shutdown requested, closing writers...", file=sys.stderr)
@ -876,7 +875,7 @@ class WikiqParser:
new_diffs = []
diff_timeouts = []
for i, text in enumerate(row_buffer["text"]):
diff, timed_out = asyncio.run(diff_async(differ, last_text, text))
diff, timed_out = diff_with_timeout(differ, last_text, text)
if timed_out:
print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid'][i]}. Falling back to default limits.", file=sys.stderr)
diff = fast_differ.inline_json_diff(last_text, text)
@ -940,6 +939,9 @@ class WikiqParser:
break
page_count += 1
# Cancel time limit timer
self._cancel_time_limit_timer(time_limit_timer)
print(
"Done: %s revisions and %s pages." % (rev_count, page_count),
file=sys.stderr,