Start working on adding columns from mwparserfromhell.

This commit is contained in:
Nathan TeBlunthuis 2025-12-02 12:26:03 -08:00
parent b46f98a875
commit 76626a2785
5 changed files with 223 additions and 1 deletion

View File

@ -8,6 +8,7 @@ dependencies = [
"deltas>=0.7.0",
"mediawiki-utilities>=0.4.18",
"more-itertools>=10.7.0",
"mwparserfromhell>=0.6.0",
"mwpersistence>=0.2.4",
"mwreverts>=0.1.5",
"mwtypes>=0.4.0",

View File

@ -27,6 +27,7 @@ import asyncio
import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher
from wikiq.wikitext_parser import WikitextParser
TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7
@ -242,6 +243,8 @@ class WikiqParser:
batch_size: int = 1024,
partition_namespaces: bool = False,
resume_from_revid: int = None,
external_links: bool = False,
citations: bool = False,
):
"""
Parameters:
@ -258,6 +261,8 @@ class WikiqParser:
self.text = text
self.partition_namespaces = partition_namespaces
self.resume_from_revid = resume_from_revid
self.external_links = external_links
self.citations = citations
if namespaces is not None:
self.namespace_filter = set(namespaces)
else:
@ -397,6 +402,16 @@ class WikiqParser:
if self.collapse_user:
table.columns.append(tables.RevisionCollapsed())
# Create shared parser if either wikitext feature is enabled
if self.external_links or self.citations:
wikitext_parser = WikitextParser()
if self.external_links:
table.columns.append(tables.RevisionExternalLinks(wikitext_parser))
if self.citations:
table.columns.append(tables.RevisionCitations(wikitext_parser))
# extract list of namespaces
self.namespaces = {
ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
@ -1135,6 +1150,22 @@ def main():
help="Output the text of the revision.",
)
parser.add_argument(
"--external-links",
dest="external_links",
action="store_true",
default=False,
help="Extract external links from each revision using mwparserfromhell.",
)
parser.add_argument(
"--citations",
dest="citations",
action="store_true",
default=False,
help="Extract citations (ref tags and cite templates) from each revision.",
)
parser.add_argument(
"-PNS",
"--partition-namespaces",
@ -1239,6 +1270,8 @@ def main():
partition_namespaces=args.partition_namespaces,
batch_size=args.batch_size,
resume_from_revid=resume_from_revid,
external_links=args.external_links,
citations=args.citations,
)
wikiq.process()
@ -1266,6 +1299,8 @@ def main():
text=args.text,
batch_size=args.batch_size,
resume_from_revid=None,
external_links=args.external_links,
citations=args.citations,
)
wikiq.process()

View File

@ -2,7 +2,7 @@ import sys
from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
from typing import Generic, TypeVar, Union
from typing import Generic, TypeVar, Union, TYPE_CHECKING
import mwreverts
import mwtypes
@ -10,6 +10,9 @@ import mwxml
import pyarrow as pa
if TYPE_CHECKING:
from wikiq.wikitext_parser import WikitextParser
T = TypeVar('T')
@ -217,3 +220,35 @@ class RevisionText(RevisionField[str]):
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
    """Return the raw wikitext of the most recent revision in the batch."""
    return revisions[-1].text
class RevisionExternalLinks(RevisionField[Union[list[str], None]]):
    """Column of external link URLs pulled from each revision's wikitext."""

    field = pa.field("external_links", pa.list_(pa.string()), nullable=True)

    def __init__(self, wikitext_parser: "WikitextParser"):
        super().__init__()
        # Shared parser instance so the same revision text is parsed only once
        # even when several wikitext-derived columns are enabled.
        self.wikitext_parser = wikitext_parser

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
        """Return external link URLs for the latest revision, or None if its text is deleted."""
        latest = revisions[-1]
        if latest.deleted.text:
            return None
        return self.wikitext_parser.extract_external_links(latest.text)
class RevisionCitations(RevisionField[Union[list[str], None]]):
    """Column of citations (ref tags and cite templates) from each revision's wikitext."""

    field = pa.field("citations", pa.list_(pa.string()), nullable=True)

    def __init__(self, wikitext_parser: "WikitextParser"):
        super().__init__()
        # Shared parser instance so the same revision text is parsed only once
        # even when several wikitext-derived columns are enabled.
        self.wikitext_parser = wikitext_parser

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
        """Return citation strings for the latest revision, or None if its text is deleted."""
        latest = revisions[-1]
        if latest.deleted.text:
            return None
        return self.wikitext_parser.extract_citations(latest.text)

View File

@ -0,0 +1,61 @@
"""Shared wikitext parser with caching to avoid duplicate parsing."""
from __future__ import annotations
import mwparserfromhell
class WikitextParser:
"""Caches parsed wikicode to avoid re-parsing the same text."""
CITE_TEMPLATES = {
'cite web', 'cite book', 'cite journal', 'cite news',
'cite magazine', 'cite conference', 'cite encyclopedia',
'cite report', 'cite thesis', 'cite press release',
'citation', 'sfn', 'harvnb', 'harv'
}
def __init__(self):
self._cached_text: str | None = None
self._cached_wikicode = None
def _get_wikicode(self, text: str):
"""Parse text and cache result. Returns cached result if text unchanged."""
if text != self._cached_text:
self._cached_text = text
self._cached_wikicode = mwparserfromhell.parse(text)
return self._cached_wikicode
def extract_external_links(self, text: str | None) -> list[str] | None:
"""Extract all external link URLs from wikitext."""
if text is None:
return None
try:
wikicode = self._get_wikicode(text)
return [str(link.url) for link in wikicode.filter_external_links()]
except Exception:
return None
def extract_citations(self, text: str | None) -> list[str] | None:
"""Extract citations from ref tags and cite templates."""
if text is None:
return None
try:
wikicode = self._get_wikicode(text)
citations = []
# Extract ref tag contents
for tag in wikicode.filter_tags():
if tag.tag.lower() == 'ref':
content = str(tag.contents).strip()
if content:
citations.append(f"ref:{content}")
# Extract cite templates
for template in wikicode.filter_templates():
template_name = str(template.name).strip().lower()
if any(template_name.startswith(cite) for cite in self.CITE_TEMPLATES):
citations.append(f"template:{str(template)}")
return citations
except Exception:
return None

View File

@ -630,3 +630,93 @@ def test_resume_with_partition_namespaces():
assert resumed_revids_sorted == full_revids_sorted, f"Resumed revids mismatch: {len(resumed_revids_sorted)} vs {len(full_revids_sorted)}"
print(f"Resume with partition-namespaces test passed! Original: {len(full_revids_sorted)} revisions, Resumed: {len(resumed_revids_sorted)} revisions")
def test_external_links_only():
    """Test that --external-links extracts external links without --citations."""
    runner = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")

    try:
        runner.call_wikiq("--external-links", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    df = pd.read_parquet(runner.output + f"/{SAILORMOON}.parquet")

    # The requested column must be present; the unrequested one must not be.
    assert "external_links" in df.columns, "external_links column should exist"
    assert "citations" not in df.columns, "citations column should NOT exist when only --external-links is used"

    # Parquet list columns come back from pandas as numpy arrays (or None).
    def _is_listlike_or_none(value):
        return value is None or hasattr(value, '__len__')

    assert df["external_links"].map(_is_listlike_or_none).all(), \
        "external_links should be a list/array type or None"

    print(f"External links only test passed! {len(df)} rows processed")
def test_citations_only():
    """Test that --citations extracts citations without --external-links."""
    runner = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")

    try:
        runner.call_wikiq("--citations", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    df = pd.read_parquet(runner.output + f"/{SAILORMOON}.parquet")

    # The requested column must be present; the unrequested one must not be.
    assert "citations" in df.columns, "citations column should exist"
    assert "external_links" not in df.columns, "external_links column should NOT exist when only --citations is used"

    # Parquet list columns come back from pandas as numpy arrays (or None).
    def _is_listlike_or_none(value):
        return value is None or hasattr(value, '__len__')

    assert df["citations"].map(_is_listlike_or_none).all(), \
        "citations should be a list/array type or None"

    print(f"Citations only test passed! {len(df)} rows processed")
def test_external_links_and_citations():
    """Test that both --external-links and --citations work together (shared parser)."""
    runner = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")

    try:
        runner.call_wikiq("--external-links", "--citations", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    df = pd.read_parquet(runner.output + f"/{SAILORMOON}.parquet")

    # Both wikitext-derived columns must be present.
    assert "external_links" in df.columns, "external_links column should exist"
    assert "citations" in df.columns, "citations column should exist"

    # Parquet list columns come back from pandas as numpy arrays (or None).
    def _is_listlike_or_none(value):
        return value is None or hasattr(value, '__len__')

    assert df["external_links"].map(_is_listlike_or_none).all(), \
        "external_links should be a list/array type or None"
    assert df["citations"].map(_is_listlike_or_none).all(), \
        "citations should be a list/array type or None"

    print(f"External links and citations test passed! {len(df)} rows processed")
def test_no_wikitext_columns():
    """Test that neither external_links nor citations columns exist without flags."""
    runner = WikiqTester(SAILORMOON, "no_wikitext_columns", in_compression="7z", out_format="parquet")

    try:
        runner.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    df = pd.read_parquet(runner.output + f"/{SAILORMOON}.parquet")

    # Without either flag, neither wikitext-derived column should be emitted.
    assert "external_links" not in df.columns, "external_links column should NOT exist without --external-links flag"
    assert "citations" not in df.columns, "citations column should NOT exist without --citations flag"

    print(f"No wikitext columns test passed! {len(df)} rows processed")