extract wikilinks.
commit d3517ed5ca
parent 329341efb6
@@ -245,6 +245,7 @@ class WikiqParser:
         resume_from_revid: int = None,
         external_links: bool = False,
         citations: bool = False,
+        wikilinks: bool = False,
     ):
         """
         Parameters:
@@ -263,6 +264,7 @@ class WikiqParser:
         self.resume_from_revid = resume_from_revid
         self.external_links = external_links
         self.citations = citations
+        self.wikilinks = wikilinks
         if namespaces is not None:
             self.namespace_filter = set(namespaces)
         else:
@@ -402,8 +404,8 @@ class WikiqParser:
         if self.collapse_user:
             table.columns.append(tables.RevisionCollapsed())

-        # Create shared parser if either wikitext feature is enabled
-        if self.external_links or self.citations:
+        # Create shared parser if any wikitext feature is enabled
+        if self.external_links or self.citations or self.wikilinks:
             wikitext_parser = WikitextParser()

         if self.external_links:
@@ -412,6 +414,9 @@ class WikiqParser:
         if self.citations:
             table.columns.append(tables.RevisionCitations(wikitext_parser))

+        if self.wikilinks:
+            table.columns.append(tables.RevisionWikilinks(wikitext_parser))
+
         # extract list of namespaces
         self.namespaces = {
             ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
@@ -1166,6 +1171,14 @@ def main():
         help="Extract citations (ref tags and cite templates) from each revision.",
     )

+    parser.add_argument(
+        "--wikilinks",
+        dest="wikilinks",
+        action="store_true",
+        default=False,
+        help="Extract internal wikilinks from each revision.",
+    )
+
     parser.add_argument(
         "-PNS",
         "--partition-namespaces",
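
The new flag mirrors the existing --external-links and --citations switches: a plain boolean store_true option. A minimal sketch of the resulting argparse behavior, using a stripped-down stand-in parser rather than wikiq's full CLI:

    import argparse

    # Stand-in parser containing only the option added above.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wikilinks",
        dest="wikilinks",
        action="store_true",
        default=False,
        help="Extract internal wikilinks from each revision.",
    )

    print(parser.parse_args([]).wikilinks)               # False -> no wikilinks column
    print(parser.parse_args(["--wikilinks"]).wikilinks)  # True  -> column is emitted
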
@@ -1272,6 +1285,7 @@ def main():
         resume_from_revid=resume_from_revid,
         external_links=args.external_links,
         citations=args.citations,
+        wikilinks=args.wikilinks,
     )

     wikiq.process()
@@ -1301,6 +1315,7 @@ def main():
         resume_from_revid=None,
         external_links=args.external_links,
         citations=args.citations,
+        wikilinks=args.wikilinks,
     )

     wikiq.process()
@@ -252,3 +252,23 @@ class RevisionCitations(RevisionField[Union[list[str], None]]):
         if revision.deleted.text:
             return None
         return self.wikitext_parser.extract_citations(revision.text)
+
+
+class RevisionWikilinks(RevisionField[Union[list[dict], None]]):
+    """Extract all internal wikilinks from revision text."""
+
+    # Struct type with title and optional display text
+    field = pa.field("wikilinks", pa.list_(pa.struct([
+        pa.field("title", pa.string()),
+        pa.field("text", pa.string(), nullable=True),
+    ])), nullable=True)
+
+    def __init__(self, wikitext_parser: "WikitextParser"):
+        super().__init__()
+        self.wikitext_parser = wikitext_parser
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[dict], None]:
+        revision = revisions[-1]
+        if revision.deleted.text:
+            return None
+        return self.wikitext_parser.extract_wikilinks(revision.text)
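
The `field` declaration gives the output column a nullable list<struct<title, text>> Arrow type. A self-contained sketch of what that type holds, using only pyarrow (sample values invented):

    import pyarrow as pa

    # Same shape as RevisionWikilinks.field above.
    wikilinks_type = pa.list_(pa.struct([
        pa.field("title", pa.string()),
        pa.field("text", pa.string(), nullable=True),
    ]))

    arr = pa.array(
        [
            [{"title": "Sailor Moon", "text": "the heroine"},
             {"title": "Luna", "text": None}],  # unpiped link: no display text
            None,                               # deleted revision text -> null cell
        ],
        type=wikilinks_type,
    )
    print(arr.type)  # list<item: struct<title: string, text: string>>
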
@@ -59,3 +59,19 @@ class WikitextParser:
             return citations
         except Exception:
             return None
+
+    def extract_wikilinks(self, text: str | None) -> list[dict[str, str | None]] | None:
+        """Extract all internal wikilinks with title and display text."""
+        if text is None:
+            return None
+        try:
+            wikicode = self._get_wikicode(text)
+            result = []
+            for link in wikicode.filter_wikilinks():
+                title = str(link.title).strip()
+                # text is None if no pipe, otherwise the display text
+                display_text = str(link.text).strip() if link.text else None
+                result.append({"title": title, "text": display_text})
+            return result
+        except Exception:
+            return None
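
`_get_wikicode` is wikiq's internal wrapper around parsing (its definition is not part of this diff); the link handling itself rests on mwparserfromhell, whose Wikilink nodes expose `.title` and a `.text` that is None for unpiped links. A quick illustration with invented wikitext:

    import mwparserfromhell

    wikicode = mwparserfromhell.parse("[[Sailor Moon|the heroine]] fights [[Queen Beryl]].")
    for link in wikicode.filter_wikilinks():
        # link.text is None when the link has no pipe, matching extract_wikilinks.
        print(str(link.title).strip(), "->", str(link.text).strip() if link.text else None)
    # Sailor Moon -> the heroine
    # Queen Beryl -> None
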
@@ -831,3 +831,44 @@ def test_no_wikitext_columns():
     assert "citations" not in test.columns, "citations column should NOT exist without --citations flag"

     print(f"No wikitext columns test passed! {len(test)} rows processed")
+
+
+def test_wikilinks():
+    """Test that --wikilinks extracts internal wikilinks correctly."""
+    import mwparserfromhell
+
+    tester = WikiqTester(SAILORMOON, "wikilinks", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--wikilinks", "--text", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify wikilinks column exists
+    assert "wikilinks" in test.columns, "wikilinks column should exist"
+
+    # Verify column has list/array type
+    assert test["wikilinks"].apply(lambda x: x is None or hasattr(x, '__len__')).all()
+
+    # Verify extraction matches mwparserfromhell for sample rows
+    rows_with_links = test[test["wikilinks"].apply(lambda x: x is not None and len(x) > 0)]
+    if len(rows_with_links) > 0:
+        sample = rows_with_links.head(5)
+        for idx, row in sample.iterrows():
+            text = row["text"]
+            if text:
+                wikicode = mwparserfromhell.parse(text)
+                expected = []
+                for link in wikicode.filter_wikilinks():
+                    title = str(link.title).strip()
+                    display_text = str(link.text).strip() if link.text else None
+                    expected.append({"title": title, "text": display_text})
+
+                actual = list(row["wikilinks"])
+                # Convert to comparable format (pandas may read as dicts or named tuples)
+                actual_dicts = [{"title": item["title"], "text": item["text"]} for item in actual]
+                assert actual_dicts == expected, f"Row {idx}: wikilinks mismatch"
+
+    print(f"Wikilinks test passed! {len(test)} rows processed")
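
Once the parquet output is read back with pandas, as in the test above, each wikilinks cell is either None or an array-like of {'title', 'text'} records. A short sketch of inspecting one row (file path hypothetical):

    import pandas as pd

    df = pd.read_parquet("output/sailormoon.parquet")  # hypothetical path

    linked = df[df["wikilinks"].apply(lambda x: x is not None and len(x) > 0)]
    first = linked.iloc[0]["wikilinks"][0]
    print(first["title"], first["text"])  # invented example: "Sailor Moon", "the heroine"
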