add templates and headings to wikiq.

This commit is contained in:
Nathan TeBlunthuis
2025-12-02 17:51:08 -08:00
parent d3517ed5ca
commit 5ce9808b50
5 changed files with 239 additions and 6 deletions

View File

@@ -49,14 +49,16 @@ class PersistMethod:
async def diff_async(differ, last_text, text):
"""Returns (result, timed_out) tuple."""
try:
loop = asyncio.get_running_loop()
return await asyncio.wait_for(
result = await asyncio.wait_for(
asyncio.to_thread(differ.inline_json_diff, last_text, text),
timeout=DIFF_TIMEOUT
)
return result, False
except TimeoutError as e:
return None
return None, True
def calculate_persistence(tokens_added):
@@ -246,6 +248,8 @@ class WikiqParser:
external_links: bool = False,
citations: bool = False,
wikilinks: bool = False,
templates: bool = False,
headings: bool = False,
):
"""
Parameters:
@@ -265,6 +269,8 @@ class WikiqParser:
self.external_links = external_links
self.citations = citations
self.wikilinks = wikilinks
self.templates = templates
self.headings = headings
if namespaces is not None:
self.namespace_filter = set(namespaces)
else:
@@ -405,7 +411,7 @@ class WikiqParser:
table.columns.append(tables.RevisionCollapsed())
# Create shared parser if any wikitext feature is enabled
if self.external_links or self.citations or self.wikilinks:
if self.external_links or self.citations or self.wikilinks or self.templates or self.headings:
wikitext_parser = WikitextParser()
if self.external_links:
@@ -417,6 +423,16 @@ class WikiqParser:
if self.wikilinks:
table.columns.append(tables.RevisionWikilinks(wikitext_parser))
if self.templates:
table.columns.append(tables.RevisionTemplates(wikitext_parser))
if self.headings:
table.columns.append(tables.RevisionHeadings(wikitext_parser))
# Add parser timeout tracking if any wikitext feature is enabled
if self.external_links or self.citations or self.wikilinks or self.templates or self.headings:
table.columns.append(tables.RevisionParserTimeout(wikitext_parser))
# extract list of namespaces
self.namespaces = {
ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
@@ -434,6 +450,7 @@ class WikiqParser:
from wikiq.diff_pyarrow_schema import diff_field
schema = schema.append(diff_field)
schema = schema.append(pa.field("diff_timeout", pa.bool_()))
if self.diff and self.persist == PersistMethod.none:
table.columns.append(tables.RevisionText())
@@ -746,12 +763,14 @@ class WikiqParser:
if self.diff:
last_text = last_rev_text
new_diffs = []
diff_timeouts = []
for i, text in enumerate(row_buffer["text"]):
diff = asyncio.run(diff_async(differ, last_text, text))
if diff is None:
diff, timed_out = asyncio.run(diff_async(differ, last_text, text))
if timed_out:
print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid'][i]}. Falling back to default limits.", file=sys.stderr)
diff = fast_differ.inline_json_diff(last_text, text)
new_diffs.append(diff)
diff_timeouts.append(timed_out)
last_text = text
row_buffer["diff"] = [
[
@@ -761,6 +780,7 @@ class WikiqParser:
]
for diff in new_diffs
]
row_buffer["diff_timeout"] = diff_timeouts
# end persistence logic
if self.diff or self.persist != PersistMethod.none:
@@ -1179,6 +1199,22 @@ def main():
help="Extract internal wikilinks from each revision.",
)
parser.add_argument(
"--templates",
dest="templates",
action="store_true",
default=False,
help="Extract templates with their parameters from each revision.",
)
parser.add_argument(
"--headings",
dest="headings",
action="store_true",
default=False,
help="Extract section headings from each revision.",
)
parser.add_argument(
"-PNS",
"--partition-namespaces",
@@ -1286,6 +1322,8 @@ def main():
external_links=args.external_links,
citations=args.citations,
wikilinks=args.wikilinks,
templates=args.templates,
headings=args.headings,
)
wikiq.process()
@@ -1316,6 +1354,8 @@ def main():
external_links=args.external_links,
citations=args.citations,
wikilinks=args.wikilinks,
templates=args.templates,
headings=args.headings,
)
wikiq.process()