diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py index 4c4e242..9ff63ee 100755 --- a/src/wikiq/__init__.py +++ b/src/wikiq/__init__.py @@ -151,6 +151,7 @@ def build_table( if collapse_user: table.columns.append(tables.RevisionCollapsed()) + wikitext_parser = None if external_links or citations or wikilinks or templates or headings: wikitext_parser = WikitextParser() @@ -171,7 +172,7 @@ def build_table( table.columns.append(tables.RevisionParserTimeout(wikitext_parser)) - return table, reverts_column + return table, reverts_column, wikitext_parser def build_schema( @@ -770,7 +771,7 @@ class WikiqParser: # Construct dump file iterator dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user) - table, reverts_column = build_table( + table, reverts_column, wikitext_parser = build_table( text=self.text, collapse_user=self.collapse_user, external_links=self.external_links, @@ -953,6 +954,15 @@ class WikiqParser: persist_state = None persist_window = None + # Adaptive parse-skip: once we observe a parse timeout on this + # page, record the smallest revision text size that timed out; + # subsequent revisions at or above that size short-circuit the + # parse instead of waiting for another 60s SIGALRM. Resets per + # page so this only suppresses pages that have already proven + # pathological (e.g. zhwiki bot-vandalism log pages). + page_min_timeout_size: Union[int, None] = None + page_skipped_parses = 0 + if self.persist != PersistMethod.none: persist_window = deque(maxlen=PERSISTENCE_RADIUS) if self.persist == PersistMethod.sequence: @@ -1010,9 +1020,40 @@ class WikiqParser: rev_count += 1 + # If this page has already produced a parse timeout, force + # the cache to "timed out" for any same-or-larger revision so + # extract_row returns nulls without spending another 60s on + # mwparserfromhell. See page_min_timeout_size init above. 
+ if ( + wikitext_parser is not None + and page_min_timeout_size is not None + and rev.text is not None + and len(rev.text) >= page_min_timeout_size + ): + wikitext_parser.force_timeout(rev.text) + page_skipped_parses += 1 + # Extract base row data row = table.extract_row(page.mwpage, revs) + # Record the smallest size at which this page has timed out + # so far, so future revisions at >= that size are auto-skipped. + if ( + wikitext_parser is not None + and wikitext_parser.last_parse_timed_out + and rev.text is not None + ): + rev_size = len(rev.text) + if page_min_timeout_size is None or rev_size < page_min_timeout_size: + page_min_timeout_size = rev_size + print( + f" parse timeout on pageid={page.mwpage.id} " + f"title={page.mwpage.title!r} revid={rev.id} " + f"text_bytes={rev_size:,} — will skip future revs " + f">= {rev_size:,} bytes on this page", + file=sys.stderr, flush=True, + ) + # Compute revert flag if self.revert_radius == 0 or row["deleted"]: row["revert"] = None @@ -1110,6 +1151,12 @@ class WikiqParser: if self.shutdown_requested: break page_count += 1 + if page_skipped_parses > 0: + print( + f" pageid={page.mwpage.id} done, skipped {page_skipped_parses} " + f"parse(s) after timeout at {page_min_timeout_size:,} bytes", + file=sys.stderr, flush=True, + ) # Flush remaining buffer flush_buffer() @@ -1427,7 +1474,7 @@ def main(): regex_revision_pairs = make_regex_pairs(args.regex_match_revision, args.regex_revision_label) regex_comment_pairs = make_regex_pairs(args.regex_match_comment, args.regex_comment_label) - table, _ = build_table( + table, _, _ = build_table( text=args.text, collapse_user=args.collapse_user, external_links=args.external_links, diff --git a/src/wikiq/wikitext_parser.py b/src/wikiq/wikitext_parser.py index 9c68cec..82166bb 100644 --- a/src/wikiq/wikitext_parser.py +++ b/src/wikiq/wikitext_parser.py @@ -26,6 +26,18 @@ class WikitextParser: def _timeout_handler(self, signum, frame): raise TimeoutError("mwparserfromhell parse exceeded 
timeout") + def force_timeout(self, text: str | None) -> None: + """Pre-populate the cache as if a parse timeout occurred. + + Used by callers that want to skip parsing for revisions predicted to + hang (e.g. on pages where an earlier revision of the same or smaller + size already timed out). Subsequent extract_*() calls will see ``last_parse_timed_out`` + and return None without invoking mwparserfromhell. + """ + self._cached_text = text + self._cached_wikicode = None + self.last_parse_timed_out = True + def _get_wikicode(self, text: str): """Parse text and cache result. Returns cached result if text unchanged.""" if text == self._cached_text: @@ -33,6 +45,7 @@ class WikitextParser: old_handler = signal.signal(signal.SIGALRM, self._timeout_handler) signal.alarm(PARSER_TIMEOUT) + try: self._cached_wikicode = mwparserfromhell.parse(text) self._cached_text = text