Extract wikilinks: add a `--wikilinks` option that extracts internal wikilinks from each revision.

This commit is contained in:
Nathan TeBlunthuis
2025-12-02 14:09:29 -08:00
parent 329341efb6
commit d3517ed5ca
4 changed files with 94 additions and 2 deletions

View File

@@ -245,6 +245,7 @@ class WikiqParser:
resume_from_revid: int = None,
external_links: bool = False,
citations: bool = False,
wikilinks: bool = False,
):
"""
Parameters:
@@ -263,6 +264,7 @@ class WikiqParser:
self.resume_from_revid = resume_from_revid
self.external_links = external_links
self.citations = citations
self.wikilinks = wikilinks
if namespaces is not None:
self.namespace_filter = set(namespaces)
else:
@@ -402,8 +404,8 @@ class WikiqParser:
if self.collapse_user:
table.columns.append(tables.RevisionCollapsed())
# Create shared parser if either wikitext feature is enabled
if self.external_links or self.citations:
# Create shared parser if any wikitext feature is enabled
if self.external_links or self.citations or self.wikilinks:
wikitext_parser = WikitextParser()
if self.external_links:
@@ -412,6 +414,9 @@ class WikiqParser:
if self.citations:
table.columns.append(tables.RevisionCitations(wikitext_parser))
if self.wikilinks:
table.columns.append(tables.RevisionWikilinks(wikitext_parser))
# extract list of namespaces
self.namespaces = {
ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
@@ -1166,6 +1171,14 @@ def main():
help="Extract citations (ref tags and cite templates) from each revision.",
)
parser.add_argument(
"--wikilinks",
dest="wikilinks",
action="store_true",
default=False,
help="Extract internal wikilinks from each revision.",
)
parser.add_argument(
"-PNS",
"--partition-namespaces",
@@ -1272,6 +1285,7 @@ def main():
resume_from_revid=resume_from_revid,
external_links=args.external_links,
citations=args.citations,
wikilinks=args.wikilinks,
)
wikiq.process()
@@ -1301,6 +1315,7 @@ def main():
resume_from_revid=None,
external_links=args.external_links,
citations=args.citations,
wikilinks=args.wikilinks,
)
wikiq.process()