diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py index e5a84f3..9597cf9 100755 --- a/src/wikiq/__init__.py +++ b/src/wikiq/__init__.py @@ -245,6 +245,7 @@ class WikiqParser: resume_from_revid: int = None, external_links: bool = False, citations: bool = False, + wikilinks: bool = False, ): """ Parameters: @@ -263,6 +264,7 @@ class WikiqParser: self.resume_from_revid = resume_from_revid self.external_links = external_links self.citations = citations + self.wikilinks = wikilinks if namespaces is not None: self.namespace_filter = set(namespaces) else: @@ -402,8 +404,8 @@ class WikiqParser: if self.collapse_user: table.columns.append(tables.RevisionCollapsed()) - # Create shared parser if either wikitext feature is enabled - if self.external_links or self.citations: + # Create shared parser if any wikitext feature is enabled + if self.external_links or self.citations or self.wikilinks: wikitext_parser = WikitextParser() if self.external_links: @@ -412,6 +414,9 @@ class WikiqParser: if self.citations: table.columns.append(tables.RevisionCitations(wikitext_parser)) + if self.wikilinks: + table.columns.append(tables.RevisionWikilinks(wikitext_parser)) + # extract list of namespaces self.namespaces = { ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces @@ -1166,6 +1171,14 @@ def main(): help="Extract citations (ref tags and cite templates) from each revision.", ) + parser.add_argument( + "--wikilinks", + dest="wikilinks", + action="store_true", + default=False, + help="Extract internal wikilinks from each revision.", + ) + parser.add_argument( "-PNS", "--partition-namespaces", @@ -1272,6 +1285,7 @@ def main(): resume_from_revid=resume_from_revid, external_links=args.external_links, citations=args.citations, + wikilinks=args.wikilinks, ) wikiq.process() @@ -1301,6 +1315,7 @@ def main(): resume_from_revid=None, external_links=args.external_links, citations=args.citations, + wikilinks=args.wikilinks, ) wikiq.process() diff --git a/src/wikiq/tables.py b/src/wikiq/tables.py index fc3b730..1ea1eaf 100644 --- a/src/wikiq/tables.py +++ b/src/wikiq/tables.py @@ -252,3 +252,23 @@ class RevisionCitations(RevisionField[Union[list[str], None]]): if revision.deleted.text: return None return self.wikitext_parser.extract_citations(revision.text) + + +class RevisionWikilinks(RevisionField[Union[list[dict], None]]): + """Extract all internal wikilinks from revision text.""" + + # Struct type with title and optional display text + field = pa.field("wikilinks", pa.list_(pa.struct([ + pa.field("title", pa.string()), + pa.field("text", pa.string(), nullable=True), + ])), nullable=True) + + def __init__(self, wikitext_parser: "WikitextParser"): + super().__init__() + self.wikitext_parser = wikitext_parser + + def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[dict], None]: + revision = revisions[-1] + if revision.deleted.text: + return None + return self.wikitext_parser.extract_wikilinks(revision.text) diff --git a/src/wikiq/wikitext_parser.py b/src/wikiq/wikitext_parser.py index a09e5da..4a69533 100644 --- a/src/wikiq/wikitext_parser.py +++ b/src/wikiq/wikitext_parser.py @@ -59,3 +59,19 @@ class WikitextParser: return citations except Exception: return None + + def extract_wikilinks(self, text: str | None) -> list[dict[str, str | None]] | None: + """Extract all internal wikilinks with title and display text.""" + if text is None: + return None + try: + wikicode = self._get_wikicode(text) + result = [] + for link in wikicode.filter_wikilinks(): + title = str(link.title).strip() + # text is None if no pipe, otherwise the display text + display_text = str(link.text).strip() if link.text else None + result.append({"title": title, "text": display_text}) + return result + except Exception: + return None diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py index ca42709..d139074 100644 --- a/test/Wikiq_Unit_Test.py +++ b/test/Wikiq_Unit_Test.py @@ -831,3 +831,44 @@ def test_no_wikitext_columns(): assert "citations" not in test.columns, "citations column should NOT exist without --citations flag" print(f"No wikitext columns test passed! {len(test)} rows processed") + + +def test_wikilinks(): + """Test that --wikilinks extracts internal wikilinks correctly.""" + import mwparserfromhell + + tester = WikiqTester(SAILORMOON, "wikilinks", in_compression="7z", out_format="parquet") + + try: + tester.call_wikiq("--wikilinks", "--text", "--fandom-2020") + except subprocess.CalledProcessError as exc: + pytest.fail(exc.stderr.decode("utf8")) + + test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet") + + # Verify wikilinks column exists + assert "wikilinks" in test.columns, "wikilinks column should exist" + + # Verify column has list/array type + assert test["wikilinks"].apply(lambda x: x is None or hasattr(x, '__len__')).all() + + # Verify extraction matches mwparserfromhell for sample rows + rows_with_links = test[test["wikilinks"].apply(lambda x: x is not None and len(x) > 0)] + if len(rows_with_links) > 0: + sample = rows_with_links.head(5) + for idx, row in sample.iterrows(): + text = row["text"] + if text: + wikicode = mwparserfromhell.parse(text) + expected = [] + for link in wikicode.filter_wikilinks(): + title = str(link.title).strip() + display_text = str(link.text).strip() if link.text else None + expected.append({"title": title, "text": display_text}) + + actual = list(row["wikilinks"]) + # Convert to comparable format (pandas may read as dicts or named tuples) + actual_dicts = [{"title": item["title"], "text": item["text"]} for item in actual] + assert actual_dicts == expected, f"Row {idx}: wikilinks mismatch" + + print(f"Wikilinks test passed! {len(test)} rows processed")