Use a SIGALRM signal to time out mwparserfromhell parsing instead of asyncio.

This commit is contained in:
Nathan TeBlunthuis
2026-01-07 12:42:37 -08:00
parent 4b8288c016
commit c7eb374ceb

View File

@@ -1,7 +1,8 @@
"""Shared wikitext parser with caching to avoid duplicate parsing."""
from __future__ import annotations
import asyncio
import signal
import mwparserfromhell
PARSER_TIMEOUT = 60 # seconds
@@ -22,22 +23,28 @@ class WikitextParser:
self._cached_wikicode = None
self.last_parse_timed_out: bool = False
async def _parse_async(self, text: str):
    """Parse wikitext in a worker thread with timeout protection.

    Args:
        text: Raw wikitext handed to ``mwparserfromhell.parse``.

    Returns:
        A ``(wikicode, timed_out)`` tuple: ``(result, False)`` when parsing
        finishes in time, ``(None, True)`` when it exceeds
        ``PARSER_TIMEOUT`` seconds.
    """
    try:
        result = await asyncio.wait_for(
            asyncio.to_thread(mwparserfromhell.parse, text),
            timeout=PARSER_TIMEOUT,
        )
    except asyncio.TimeoutError:
        # asyncio.TimeoutError is only an alias of the builtin TimeoutError
        # from Python 3.11 onwards; on older interpreters catching the
        # builtin would let the wait_for timeout escape uncaught.
        # NOTE(review): the timeout stops the wait, but the worker thread
        # presumably keeps parsing in the background -- confirm.
        return None, True
    return result, False
def _timeout_handler(self, signum, frame):
raise TimeoutError("mwparserfromhell parse exceeded timeout")
def _get_wikicode(self, text: str):
"""Parse text and cache result. Returns cached result if text unchanged."""
if text != self._cached_text:
if text == self._cached_text:
return self._cached_wikicode
old_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
signal.alarm(PARSER_TIMEOUT)
try:
self._cached_wikicode = mwparserfromhell.parse(text)
self._cached_text = text
self._cached_wikicode, self.last_parse_timed_out = asyncio.run(self._parse_async(text))
self.last_parse_timed_out = False
except TimeoutError:
self._cached_wikicode = None
self._cached_text = text
self.last_parse_timed_out = True
finally:
signal.alarm(0)
signal.signal(signal.SIGALRM, old_handler)
return self._cached_wikicode
def extract_external_links(self, text: str | None) -> list[str] | None: