wikiq: skip parse on subsequent same-or-larger revs after timeout

Per-page adaptive parse skip: once mwparserfromhell times out on a
revision of size N bytes, force-timeout future revisions on the same
page whose text is >= N bytes instead of waiting another 60s on each
one. State resets per page, so well-behaved pages are unaffected.

Motivated by zhwiki bot-vandalism log pages (e.g. ns=4
'Wikipedia:当前的破坏/...' at 4.6GB across 6133 revisions) where every
revision triggers the 60s SIGALRM, blocking the worker for hours.
This commit is contained in:
Nathan TeBlunthuis
2026-05-07 21:43:05 +00:00
parent 59fea1919e
commit 72410d090b
2 changed files with 63 additions and 3 deletions

View File

@@ -151,6 +151,7 @@ def build_table(
if collapse_user:
table.columns.append(tables.RevisionCollapsed())
wikitext_parser = None
if external_links or citations or wikilinks or templates or headings:
wikitext_parser = WikitextParser()
@@ -171,7 +172,7 @@ def build_table(
table.columns.append(tables.RevisionParserTimeout(wikitext_parser))
return table, reverts_column
return table, reverts_column, wikitext_parser
def build_schema(
@@ -770,7 +771,7 @@ class WikiqParser:
# Construct dump file iterator
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
table, reverts_column = build_table(
table, reverts_column, wikitext_parser = build_table(
text=self.text,
collapse_user=self.collapse_user,
external_links=self.external_links,
@@ -953,6 +954,15 @@ class WikiqParser:
persist_state = None
persist_window = None
# Adaptive parse-skip: once we observe a parse timeout on this
# page, record the smallest revision text size that timed out;
# subsequent revisions at or above that size short-circuit the
# parse instead of waiting for another 60s SIGALRM. Resets per
# page so this only suppresses pages that have already proven
# pathological (e.g. zhwiki bot-vandalism log pages).
page_min_timeout_size: Union[int, None] = None
page_skipped_parses = 0
if self.persist != PersistMethod.none:
persist_window = deque(maxlen=PERSISTENCE_RADIUS)
if self.persist == PersistMethod.sequence:
@@ -1010,9 +1020,40 @@ class WikiqParser:
rev_count += 1
# If this page has already produced a parse timeout, force
# the cache to "timed out" for any same-or-larger revision so
# extract_row returns nulls without spending another 60s on
# mwparserfromhell. See page_min_timeout_size init above.
if (
wikitext_parser is not None
and page_min_timeout_size is not None
and rev.text is not None
and len(rev.text) >= page_min_timeout_size
):
wikitext_parser.force_timeout(rev.text)
page_skipped_parses += 1
# Extract base row data
row = table.extract_row(page.mwpage, revs)
# Record the smallest size at which this page has timed out
# so far, so future revisions at >= that size are auto-skipped.
if (
wikitext_parser is not None
and wikitext_parser.last_parse_timed_out
and rev.text is not None
):
rev_size = len(rev.text)
if page_min_timeout_size is None or rev_size < page_min_timeout_size:
page_min_timeout_size = rev_size
print(
f" parse timeout on pageid={page.mwpage.id} "
f"title={page.mwpage.title!r} revid={rev.id} "
f"text_bytes={rev_size:,} — will skip future revs "
f">= {rev_size:,} bytes on this page",
file=sys.stderr, flush=True,
)
# Compute revert flag
if self.revert_radius == 0 or row["deleted"]:
row["revert"] = None
@@ -1110,6 +1151,12 @@ class WikiqParser:
if self.shutdown_requested:
break
page_count += 1
if page_skipped_parses > 0:
print(
f" pageid={page.mwpage.id} done, skipped {page_skipped_parses} "
f"parse(s) after timeout at {page_min_timeout_size:,} bytes",
file=sys.stderr, flush=True,
)
# Flush remaining buffer
flush_buffer()
@@ -1427,7 +1474,7 @@ def main():
regex_revision_pairs = make_regex_pairs(args.regex_match_revision, args.regex_revision_label)
regex_comment_pairs = make_regex_pairs(args.regex_match_comment, args.regex_comment_label)
table, _ = build_table(
table, _, _ = build_table(
text=args.text,
collapse_user=args.collapse_user,
external_links=args.external_links,

View File

@@ -26,6 +26,18 @@ class WikitextParser:
def _timeout_handler(self, signum, frame):
raise TimeoutError("mwparserfromhell parse exceeded timeout")
def force_timeout(self, text: str | None) -> None:
"""Pre-populate the cache as if a parse timeout occurred.
Used by callers that want to skip parsing for revisions predicted to
hang (e.g. on pages where an earlier same-size revision already timed
out). Subsequent extract_*() calls will see ``last_parse_timed_out``
and return None without invoking mwparserfromhell.
"""
self._cached_text = text
self._cached_wikicode = None
self.last_parse_timed_out = True
def _get_wikicode(self, text: str):
"""Parse text and cache result. Returns cached result if text unchanged."""
if text == self._cached_text:
@@ -33,6 +45,7 @@ class WikitextParser:
old_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
signal.alarm(PARSER_TIMEOUT)
try:
self._cached_wikicode = mwparserfromhell.parse(text)
self._cached_text = text