Compare commits
2 Commits
c7eb374ceb
...
jsonl-outp
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
72410d090b | ||
|
|
59fea1919e |
@@ -133,6 +133,7 @@ def build_table(
|
|||||||
tables.RevisionArticleId(),
|
tables.RevisionArticleId(),
|
||||||
tables.RevisionPageTitle(),
|
tables.RevisionPageTitle(),
|
||||||
tables.RevisionNamespace(),
|
tables.RevisionNamespace(),
|
||||||
|
tables.RevisionRedirectTarget(),
|
||||||
tables.RevisionDeleted(),
|
tables.RevisionDeleted(),
|
||||||
tables.RevisionEditorId(),
|
tables.RevisionEditorId(),
|
||||||
tables.RevisionEditSummary(),
|
tables.RevisionEditSummary(),
|
||||||
@@ -150,6 +151,7 @@ def build_table(
|
|||||||
if collapse_user:
|
if collapse_user:
|
||||||
table.columns.append(tables.RevisionCollapsed())
|
table.columns.append(tables.RevisionCollapsed())
|
||||||
|
|
||||||
|
wikitext_parser = None
|
||||||
if external_links or citations or wikilinks or templates or headings:
|
if external_links or citations or wikilinks or templates or headings:
|
||||||
wikitext_parser = WikitextParser()
|
wikitext_parser = WikitextParser()
|
||||||
|
|
||||||
@@ -170,7 +172,7 @@ def build_table(
|
|||||||
|
|
||||||
table.columns.append(tables.RevisionParserTimeout(wikitext_parser))
|
table.columns.append(tables.RevisionParserTimeout(wikitext_parser))
|
||||||
|
|
||||||
return table, reverts_column
|
return table, reverts_column, wikitext_parser
|
||||||
|
|
||||||
|
|
||||||
def build_schema(
|
def build_schema(
|
||||||
@@ -769,7 +771,7 @@ class WikiqParser:
|
|||||||
# Construct dump file iterator
|
# Construct dump file iterator
|
||||||
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
|
dump = WikiqIterator(self.input_file, collapse_user=self.collapse_user)
|
||||||
|
|
||||||
table, reverts_column = build_table(
|
table, reverts_column, wikitext_parser = build_table(
|
||||||
text=self.text,
|
text=self.text,
|
||||||
collapse_user=self.collapse_user,
|
collapse_user=self.collapse_user,
|
||||||
external_links=self.external_links,
|
external_links=self.external_links,
|
||||||
@@ -952,6 +954,15 @@ class WikiqParser:
|
|||||||
persist_state = None
|
persist_state = None
|
||||||
persist_window = None
|
persist_window = None
|
||||||
|
|
||||||
|
# Adaptive parse-skip: once we observe a parse timeout on this
|
||||||
|
# page, record the smallest revision text size that timed out;
|
||||||
|
# subsequent revisions at or above that size short-circuit the
|
||||||
|
# parse instead of waiting for another 60s SIGALRM. Resets per
|
||||||
|
# page so this only suppresses pages that have already proven
|
||||||
|
# pathological (e.g. zhwiki bot-vandalism log pages).
|
||||||
|
page_min_timeout_size: Union[int, None] = None
|
||||||
|
page_skipped_parses = 0
|
||||||
|
|
||||||
if self.persist != PersistMethod.none:
|
if self.persist != PersistMethod.none:
|
||||||
persist_window = deque(maxlen=PERSISTENCE_RADIUS)
|
persist_window = deque(maxlen=PERSISTENCE_RADIUS)
|
||||||
if self.persist == PersistMethod.sequence:
|
if self.persist == PersistMethod.sequence:
|
||||||
@@ -1004,14 +1015,45 @@ class WikiqParser:
|
|||||||
if rev.id == page_resume_revid:
|
if rev.id == page_resume_revid:
|
||||||
found_resume_point = True
|
found_resume_point = True
|
||||||
is_resume_page = False
|
is_resume_page = False
|
||||||
print(f"Resuming output after revid {rev.id}", file=sys.stderr)
|
print(f"Resuming output after revid {rev.id}", file=sys.stderr, flush=True)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
rev_count += 1
|
rev_count += 1
|
||||||
|
|
||||||
|
# If this page has already produced a parse timeout, force
|
||||||
|
# the cache to "timed out" for any same-or-larger revision so
|
||||||
|
# extract_row returns nulls without spending another 60s on
|
||||||
|
# mwparserfromhell. See page_min_timeout_size init above.
|
||||||
|
if (
|
||||||
|
wikitext_parser is not None
|
||||||
|
and page_min_timeout_size is not None
|
||||||
|
and rev.text is not None
|
||||||
|
and len(rev.text) >= page_min_timeout_size
|
||||||
|
):
|
||||||
|
wikitext_parser.force_timeout(rev.text)
|
||||||
|
page_skipped_parses += 1
|
||||||
|
|
||||||
# Extract base row data
|
# Extract base row data
|
||||||
row = table.extract_row(page.mwpage, revs)
|
row = table.extract_row(page.mwpage, revs)
|
||||||
|
|
||||||
|
# Record the smallest size at which this page has timed out
|
||||||
|
# so far, so future revisions at >= that size are auto-skipped.
|
||||||
|
if (
|
||||||
|
wikitext_parser is not None
|
||||||
|
and wikitext_parser.last_parse_timed_out
|
||||||
|
and rev.text is not None
|
||||||
|
):
|
||||||
|
rev_size = len(rev.text)
|
||||||
|
if page_min_timeout_size is None or rev_size < page_min_timeout_size:
|
||||||
|
page_min_timeout_size = rev_size
|
||||||
|
print(
|
||||||
|
f" parse timeout on pageid={page.mwpage.id} "
|
||||||
|
f"title={page.mwpage.title!r} revid={rev.id} "
|
||||||
|
f"text_bytes={rev_size:,} — will skip future revs "
|
||||||
|
f">= {rev_size:,} bytes on this page",
|
||||||
|
file=sys.stderr, flush=True,
|
||||||
|
)
|
||||||
|
|
||||||
# Compute revert flag
|
# Compute revert flag
|
||||||
if self.revert_radius == 0 or row["deleted"]:
|
if self.revert_radius == 0 or row["deleted"]:
|
||||||
row["revert"] = None
|
row["revert"] = None
|
||||||
@@ -1109,6 +1151,12 @@ class WikiqParser:
|
|||||||
if self.shutdown_requested:
|
if self.shutdown_requested:
|
||||||
break
|
break
|
||||||
page_count += 1
|
page_count += 1
|
||||||
|
if page_skipped_parses > 0:
|
||||||
|
print(
|
||||||
|
f" pageid={page.mwpage.id} done, skipped {page_skipped_parses} "
|
||||||
|
f"parse(s) after timeout at {page_min_timeout_size:,} bytes",
|
||||||
|
file=sys.stderr, flush=True,
|
||||||
|
)
|
||||||
|
|
||||||
# Flush remaining buffer
|
# Flush remaining buffer
|
||||||
flush_buffer()
|
flush_buffer()
|
||||||
@@ -1426,7 +1474,7 @@ def main():
|
|||||||
regex_revision_pairs = make_regex_pairs(args.regex_match_revision, args.regex_revision_label)
|
regex_revision_pairs = make_regex_pairs(args.regex_match_revision, args.regex_revision_label)
|
||||||
regex_comment_pairs = make_regex_pairs(args.regex_match_comment, args.regex_comment_label)
|
regex_comment_pairs = make_regex_pairs(args.regex_match_comment, args.regex_comment_label)
|
||||||
|
|
||||||
table, _ = build_table(
|
table, _, _ = build_table(
|
||||||
text=args.text,
|
text=args.text,
|
||||||
collapse_user=args.collapse_user,
|
collapse_user=args.collapse_user,
|
||||||
external_links=args.external_links,
|
external_links=args.external_links,
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
import sys
|
|
||||||
from abc import abstractmethod, ABC
|
from abc import abstractmethod, ABC
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from hashlib import sha1
|
from hashlib import sha1
|
||||||
@@ -144,6 +143,13 @@ class RevisionNamespace(RevisionField[int]):
|
|||||||
return page.namespace
|
return page.namespace
|
||||||
|
|
||||||
|
|
||||||
|
class RevisionRedirectTarget(RevisionField[Union[str, None]]):
|
||||||
|
field = pa.field("redirect_target", pa.string(), nullable=True)
|
||||||
|
|
||||||
|
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[str, None]:
|
||||||
|
return page.redirect
|
||||||
|
|
||||||
|
|
||||||
class RevisionSha1(RevisionField[str]):
|
class RevisionSha1(RevisionField[str]):
|
||||||
field = pa.field("sha1", pa.string())
|
field = pa.field("sha1", pa.string())
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,18 @@ class WikitextParser:
|
|||||||
def _timeout_handler(self, signum, frame):
|
def _timeout_handler(self, signum, frame):
|
||||||
raise TimeoutError("mwparserfromhell parse exceeded timeout")
|
raise TimeoutError("mwparserfromhell parse exceeded timeout")
|
||||||
|
|
||||||
|
def force_timeout(self, text: str | None) -> None:
|
||||||
|
"""Pre-populate the cache as if a parse timeout occurred.
|
||||||
|
|
||||||
|
Used by callers that want to skip parsing for revisions predicted to
|
||||||
|
hang (e.g. on pages where an earlier revision of equal or smaller size timed
|
||||||
|
out). Subsequent extract_*() calls will see ``last_parse_timed_out``
|
||||||
|
and return None without invoking mwparserfromhell.
|
||||||
|
"""
|
||||||
|
self._cached_text = text
|
||||||
|
self._cached_wikicode = None
|
||||||
|
self.last_parse_timed_out = True
|
||||||
|
|
||||||
def _get_wikicode(self, text: str):
|
def _get_wikicode(self, text: str):
|
||||||
"""Parse text and cache result. Returns cached result if text unchanged."""
|
"""Parse text and cache result. Returns cached result if text unchanged."""
|
||||||
if text == self._cached_text:
|
if text == self._cached_text:
|
||||||
@@ -33,6 +45,7 @@ class WikitextParser:
|
|||||||
|
|
||||||
old_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
|
old_handler = signal.signal(signal.SIGALRM, self._timeout_handler)
|
||||||
signal.alarm(PARSER_TIMEOUT)
|
signal.alarm(PARSER_TIMEOUT)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._cached_wikicode = mwparserfromhell.parse(text)
|
self._cached_wikicode = mwparserfromhell.parse(text)
|
||||||
self._cached_text = text
|
self._cached_text = text
|
||||||
|
|||||||
Reference in New Issue
Block a user