Add template and heading extraction to wikiq.
This commit is contained in:
@@ -49,14 +49,16 @@ class PersistMethod:
|
||||
|
||||
|
||||
async def diff_async(differ, last_text, text):
    """Run ``differ.inline_json_diff(last_text, text)`` in a worker thread,
    waiting at most ``DIFF_TIMEOUT`` for it to finish.

    Parameters:
        differ : object exposing ``inline_json_diff(last_text, text)``.
        last_text : first argument forwarded to ``inline_json_diff``.
        text : second argument forwarded to ``inline_json_diff``.

    Returns:
        A ``(result, timed_out)`` tuple: ``(diff, False)`` when the diff
        completed in time, ``(None, True)`` when the wait timed out.
    """
    try:
        result = await asyncio.wait_for(
            asyncio.to_thread(differ.inline_json_diff, last_text, text),
            timeout=DIFF_TIMEOUT,
        )
    except asyncio.TimeoutError:
        # Catch asyncio.TimeoutError rather than the builtin TimeoutError:
        # the two are only the same class from Python 3.11 onwards, and on
        # 3.9/3.10 wait_for() raises the asyncio one, which the builtin
        # name would not catch.
        return None, True
    return result, False
|
||||
|
||||
|
||||
def calculate_persistence(tokens_added):
|
||||
@@ -246,6 +248,8 @@ class WikiqParser:
|
||||
external_links: bool = False,
|
||||
citations: bool = False,
|
||||
wikilinks: bool = False,
|
||||
templates: bool = False,
|
||||
headings: bool = False,
|
||||
):
|
||||
"""
|
||||
Parameters:
|
||||
@@ -265,6 +269,8 @@ class WikiqParser:
|
||||
self.external_links = external_links
|
||||
self.citations = citations
|
||||
self.wikilinks = wikilinks
|
||||
self.templates = templates
|
||||
self.headings = headings
|
||||
if namespaces is not None:
|
||||
self.namespace_filter = set(namespaces)
|
||||
else:
|
||||
@@ -405,7 +411,7 @@ class WikiqParser:
|
||||
table.columns.append(tables.RevisionCollapsed())
|
||||
|
||||
# Create shared parser if any wikitext feature is enabled
|
||||
if self.external_links or self.citations or self.wikilinks:
|
||||
if self.external_links or self.citations or self.wikilinks or self.templates or self.headings:
|
||||
wikitext_parser = WikitextParser()
|
||||
|
||||
if self.external_links:
|
||||
@@ -417,6 +423,16 @@ class WikiqParser:
|
||||
if self.wikilinks:
|
||||
table.columns.append(tables.RevisionWikilinks(wikitext_parser))
|
||||
|
||||
if self.templates:
|
||||
table.columns.append(tables.RevisionTemplates(wikitext_parser))
|
||||
|
||||
if self.headings:
|
||||
table.columns.append(tables.RevisionHeadings(wikitext_parser))
|
||||
|
||||
# Add parser timeout tracking if any wikitext feature is enabled
|
||||
if self.external_links or self.citations or self.wikilinks or self.templates or self.headings:
|
||||
table.columns.append(tables.RevisionParserTimeout(wikitext_parser))
|
||||
|
||||
# extract list of namespaces
|
||||
self.namespaces = {
|
||||
ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
|
||||
@@ -434,6 +450,7 @@ class WikiqParser:
|
||||
from wikiq.diff_pyarrow_schema import diff_field
|
||||
|
||||
schema = schema.append(diff_field)
|
||||
schema = schema.append(pa.field("diff_timeout", pa.bool_()))
|
||||
|
||||
if self.diff and self.persist == PersistMethod.none:
|
||||
table.columns.append(tables.RevisionText())
|
||||
@@ -746,12 +763,14 @@ class WikiqParser:
|
||||
if self.diff:
|
||||
last_text = last_rev_text
|
||||
new_diffs = []
|
||||
diff_timeouts = []
|
||||
for i, text in enumerate(row_buffer["text"]):
|
||||
diff = asyncio.run(diff_async(differ, last_text, text))
|
||||
if diff is None:
|
||||
diff, timed_out = asyncio.run(diff_async(differ, last_text, text))
|
||||
if timed_out:
|
||||
print(f"WARNING! wikidiff2 timeout for rev: {row_buffer['revid'][i]}. Falling back to default limits.", file=sys.stderr)
|
||||
diff = fast_differ.inline_json_diff(last_text, text)
|
||||
new_diffs.append(diff)
|
||||
diff_timeouts.append(timed_out)
|
||||
last_text = text
|
||||
row_buffer["diff"] = [
|
||||
[
|
||||
@@ -761,6 +780,7 @@ class WikiqParser:
|
||||
]
|
||||
for diff in new_diffs
|
||||
]
|
||||
row_buffer["diff_timeout"] = diff_timeouts
|
||||
|
||||
# end persistence logic
|
||||
if self.diff or self.persist != PersistMethod.none:
|
||||
@@ -1179,6 +1199,22 @@ def main():
|
||||
help="Extract internal wikilinks from each revision.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--templates",
|
||||
dest="templates",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Extract templates with their parameters from each revision.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--headings",
|
||||
dest="headings",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Extract section headings from each revision.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-PNS",
|
||||
"--partition-namespaces",
|
||||
@@ -1286,6 +1322,8 @@ def main():
|
||||
external_links=args.external_links,
|
||||
citations=args.citations,
|
||||
wikilinks=args.wikilinks,
|
||||
templates=args.templates,
|
||||
headings=args.headings,
|
||||
)
|
||||
|
||||
wikiq.process()
|
||||
@@ -1316,6 +1354,8 @@ def main():
|
||||
external_links=args.external_links,
|
||||
citations=args.citations,
|
||||
wikilinks=args.wikilinks,
|
||||
templates=args.templates,
|
||||
headings=args.headings,
|
||||
)
|
||||
|
||||
wikiq.process()
|
||||
|
||||
Reference in New Issue
Block a user