Extract internal wikilinks from each revision.
This commit is contained in:
@@ -245,6 +245,7 @@ class WikiqParser:
|
||||
resume_from_revid: int = None,
|
||||
external_links: bool = False,
|
||||
citations: bool = False,
|
||||
wikilinks: bool = False,
|
||||
):
|
||||
"""
|
||||
Parameters:
|
||||
@@ -263,6 +264,7 @@ class WikiqParser:
|
||||
self.resume_from_revid = resume_from_revid
|
||||
self.external_links = external_links
|
||||
self.citations = citations
|
||||
self.wikilinks = wikilinks
|
||||
if namespaces is not None:
|
||||
self.namespace_filter = set(namespaces)
|
||||
else:
|
||||
@@ -402,8 +404,8 @@ class WikiqParser:
|
||||
if self.collapse_user:
|
||||
table.columns.append(tables.RevisionCollapsed())
|
||||
|
||||
# Create shared parser if either wikitext feature is enabled
|
||||
if self.external_links or self.citations:
|
||||
# Create shared parser if any wikitext feature is enabled
|
||||
if self.external_links or self.citations or self.wikilinks:
|
||||
wikitext_parser = WikitextParser()
|
||||
|
||||
if self.external_links:
|
||||
@@ -412,6 +414,9 @@ class WikiqParser:
|
||||
if self.citations:
|
||||
table.columns.append(tables.RevisionCitations(wikitext_parser))
|
||||
|
||||
if self.wikilinks:
|
||||
table.columns.append(tables.RevisionWikilinks(wikitext_parser))
|
||||
|
||||
# extract list of namespaces
|
||||
self.namespaces = {
|
||||
ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
|
||||
@@ -1166,6 +1171,14 @@ def main():
|
||||
help="Extract citations (ref tags and cite templates) from each revision.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--wikilinks",
|
||||
dest="wikilinks",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Extract internal wikilinks from each revision.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-PNS",
|
||||
"--partition-namespaces",
|
||||
@@ -1272,6 +1285,7 @@ def main():
|
||||
resume_from_revid=resume_from_revid,
|
||||
external_links=args.external_links,
|
||||
citations=args.citations,
|
||||
wikilinks=args.wikilinks,
|
||||
)
|
||||
|
||||
wikiq.process()
|
||||
@@ -1301,6 +1315,7 @@ def main():
|
||||
resume_from_revid=None,
|
||||
external_links=args.external_links,
|
||||
citations=args.citations,
|
||||
wikilinks=args.wikilinks,
|
||||
)
|
||||
|
||||
wikiq.process()
|
||||
|
||||
@@ -252,3 +252,23 @@ class RevisionCitations(RevisionField[Union[list[str], None]]):
|
||||
if revision.deleted.text:
|
||||
return None
|
||||
return self.wikitext_parser.extract_citations(revision.text)
|
||||
|
||||
|
||||
class RevisionWikilinks(RevisionField[Union[list[dict], None]]):
    """Revision field holding the internal wikilinks found in revision text.

    Link extraction is delegated to the ``WikitextParser`` instance passed to
    the constructor, via its ``extract_wikilinks`` method.
    """

    # Arrow schema for the column: a nullable list of structs, each carrying
    # a "title" string and a nullable "text" (display text) string.
    field = pa.field(
        "wikilinks",
        pa.list_(
            pa.struct(
                [
                    pa.field("title", pa.string()),
                    pa.field("text", pa.string(), nullable=True),
                ]
            )
        ),
        nullable=True,
    )

    def __init__(self, wikitext_parser: "WikitextParser"):
        """Store the parser used to extract wikilinks.

        Args:
            wikitext_parser: Object providing ``extract_wikilinks(text)``.
        """
        super().__init__()
        self.wikitext_parser = wikitext_parser

    def extract(
        self, page: mwtypes.Page, revisions: list[mwxml.Revision]
    ) -> Union[list[dict], None]:
        """Return the wikilinks of the last revision in ``revisions``.

        Args:
            page: Page the revisions belong to (not used by this field).
            revisions: Revisions to consider; only the final element is read.

        Returns:
            None when the last revision's text is flagged as deleted;
            otherwise whatever ``extract_wikilinks`` returns for its text.
        """
        latest = revisions[-1]
        if not latest.deleted.text:
            return self.wikitext_parser.extract_wikilinks(latest.text)
        # Deleted text: there is nothing to parse.
        return None
|
||||
|
||||
@@ -59,3 +59,19 @@ class WikitextParser:
|
||||
return citations
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def extract_wikilinks(self, text: str | None) -> list[dict[str, str | None]] | None:
|
||||
"""Extract all internal wikilinks with title and display text."""
|
||||
if text is None:
|
||||
return None
|
||||
try:
|
||||
wikicode = self._get_wikicode(text)
|
||||
result = []
|
||||
for link in wikicode.filter_wikilinks():
|
||||
title = str(link.title).strip()
|
||||
# text is None if no pipe, otherwise the display text
|
||||
display_text = str(link.text).strip() if link.text else None
|
||||
result.append({"title": title, "text": display_text})
|
||||
return result
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user