diff --git a/pyproject.toml b/pyproject.toml
index 11bca06..bc3e709 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,6 +8,7 @@ dependencies = [
     "deltas>=0.7.0",
     "mediawiki-utilities>=0.4.18",
     "more-itertools>=10.7.0",
+    "mwparserfromhell>=0.6.0",
     "mwpersistence>=0.2.4",
     "mwreverts>=0.1.5",
     "mwtypes>=0.4.0",
diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py
index 22ea3e2..e5a84f3 100755
--- a/src/wikiq/__init__.py
+++ b/src/wikiq/__init__.py
@@ -27,6 +27,7 @@ import asyncio
 import wikiq.tables as tables
 from wikiq.tables import RevisionTable
 from wikiq.wiki_diff_matcher import WikiDiffMatcher
+from wikiq.wikitext_parser import WikitextParser

 TO_ENCODE = ("title", "editor")
 PERSISTENCE_RADIUS = 7
@@ -242,6 +243,8 @@ class WikiqParser:
         batch_size: int = 1024,
         partition_namespaces: bool = False,
         resume_from_revid: int = None,
+        external_links: bool = False,
+        citations: bool = False,
     ):
         """
         Parameters:
@@ -258,6 +261,8 @@
         self.text = text
         self.partition_namespaces = partition_namespaces
         self.resume_from_revid = resume_from_revid
+        self.external_links = external_links
+        self.citations = citations
         if namespaces is not None:
             self.namespace_filter = set(namespaces)
         else:
@@ -397,6 +402,16 @@
         if self.collapse_user:
             table.columns.append(tables.RevisionCollapsed())

+        # Create shared parser if either wikitext feature is enabled
+        if self.external_links or self.citations:
+            wikitext_parser = WikitextParser()
+
+        if self.external_links:
+            table.columns.append(tables.RevisionExternalLinks(wikitext_parser))
+
+        if self.citations:
+            table.columns.append(tables.RevisionCitations(wikitext_parser))
+
         # extract list of namespaces
         self.namespaces = {
             ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
@@ -1135,6 +1150,22 @@ def main():
         help="Output the text of the revision.",
     )

+    parser.add_argument(
+        "--external-links",
+        dest="external_links",
+        action="store_true",
+        default=False,
+        help="Extract external links from each revision using mwparserfromhell.",
+    )
+
+    parser.add_argument(
+        "--citations",
+        dest="citations",
+        action="store_true",
+        default=False,
+        help="Extract citations (ref tags and cite templates) from each revision.",
+    )
+
     parser.add_argument(
         "-PNS",
         "--partition-namespaces",
@@ -1239,6 +1270,8 @@
             partition_namespaces=args.partition_namespaces,
             batch_size=args.batch_size,
             resume_from_revid=resume_from_revid,
+            external_links=args.external_links,
+            citations=args.citations,
         )

         wikiq.process()
@@ -1266,6 +1299,8 @@
             text=args.text,
             batch_size=args.batch_size,
             resume_from_revid=None,
+            external_links=args.external_links,
+            citations=args.citations,
         )

         wikiq.process()
diff --git a/src/wikiq/tables.py b/src/wikiq/tables.py
index 81faebc..fc3b730 100644
--- a/src/wikiq/tables.py
+++ b/src/wikiq/tables.py
@@ -2,7 +2,7 @@ import sys
 from abc import abstractmethod, ABC
 from datetime import datetime, timezone
 from hashlib import sha1
-from typing import Generic, TypeVar, Union
+from typing import Generic, TypeVar, Union, TYPE_CHECKING

 import mwreverts
 import mwtypes
@@ -10,6 +10,9 @@ import mwxml

 import pyarrow as pa

+if TYPE_CHECKING:
+    from wikiq.wikitext_parser import WikitextParser
+
 T = TypeVar('T')


@@ -217,3 +220,35 @@ class RevisionText(RevisionField[str]):
     def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
         revision = revisions[-1]
         return revision.text
+
+
+class RevisionExternalLinks(RevisionField[Union[list[str], None]]):
+    """Extract all external links from revision text."""
+
+    field = pa.field("external_links", pa.list_(pa.string()), nullable=True)
+
+    def __init__(self, wikitext_parser: "WikitextParser"):
+        super().__init__()
+        self.wikitext_parser = wikitext_parser
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
+        revision = revisions[-1]
+        if revision.deleted.text:
+            return None
+        return self.wikitext_parser.extract_external_links(revision.text)
+
+
+class RevisionCitations(RevisionField[Union[list[str], None]]):
+    """Extract citations from ref tags and cite templates."""
+
+    field = pa.field("citations", pa.list_(pa.string()), nullable=True)
+
+    def __init__(self, wikitext_parser: "WikitextParser"):
+        super().__init__()
+        self.wikitext_parser = wikitext_parser
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
+        revision = revisions[-1]
+        if revision.deleted.text:
+            return None
+        return self.wikitext_parser.extract_citations(revision.text)
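Both new column classes declare nullable list-of-string Arrow fields. A minimal sketch of what downstream readers should expect, with the field definitions copied from the tables.py hunk above (composing them into a standalone schema is illustrative only; in wikiq they are contributed via table.columns.append):

    import pyarrow as pa

    # Nullable because revisions with deleted text yield None, not an empty list.
    wikitext_fields = [
        pa.field("external_links", pa.list_(pa.string()), nullable=True),
        pa.field("citations", pa.list_(pa.string()), nullable=True),
    ]
    schema = pa.schema(wikitext_fields)  # illustrative standalone schema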
diff --git a/src/wikiq/wikitext_parser.py b/src/wikiq/wikitext_parser.py
new file mode 100644
index 0000000..a09e5da
--- /dev/null
+++ b/src/wikiq/wikitext_parser.py
@@ -0,0 +1,62 @@
+"""Shared wikitext parser with caching to avoid duplicate parsing."""
+from __future__ import annotations
+
+import mwparserfromhell
+
+
+class WikitextParser:
+    """Caches parsed wikicode to avoid re-parsing the same text."""
+
+    CITE_TEMPLATES = {
+        'cite web', 'cite book', 'cite journal', 'cite news',
+        'cite magazine', 'cite conference', 'cite encyclopedia',
+        'cite report', 'cite thesis', 'cite press release',
+        'citation', 'sfn', 'harvnb', 'harv'
+    }
+
+    def __init__(self):
+        self._cached_text: str | None = None
+        self._cached_wikicode = None
+
+    def _get_wikicode(self, text: str):
+        """Parse text and cache the result, returning the cached wikicode if the text is unchanged."""
+        if text != self._cached_text:
+            # Parse first so a failed parse cannot leave stale wikicode cached for the new text.
+            self._cached_wikicode = mwparserfromhell.parse(text)
+            self._cached_text = text
+        return self._cached_wikicode
+
+    def extract_external_links(self, text: str | None) -> list[str] | None:
+        """Extract all external link URLs from wikitext."""
+        if text is None:
+            return None
+        try:
+            wikicode = self._get_wikicode(text)
+            return [str(link.url) for link in wikicode.filter_external_links()]
+        except Exception:
+            return None
+
+    def extract_citations(self, text: str | None) -> list[str] | None:
+        """Extract citations from ref tags and cite templates."""
+        if text is None:
+            return None
+        try:
+            wikicode = self._get_wikicode(text)
+            citations = []
+
+            # Extract ref tag contents
+            for tag in wikicode.filter_tags():
+                if tag.tag.lower() == 'ref':
+                    content = str(tag.contents).strip()
+                    if content:
+                        citations.append(f"ref:{content}")
+
+            # Extract cite templates
+            for template in wikicode.filter_templates():
+                template_name = str(template.name).strip().lower()
+                if any(template_name.startswith(cite) for cite in self.CITE_TEMPLATES):
+                    citations.append(f"template:{str(template)}")
+
+            return citations
+        except Exception:
+            return None
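A quick usage sketch of the parser above (the sample wikitext is invented for illustration; the second call reuses the cached parse, so the text goes through mwparserfromhell only once):

    from wikiq.wikitext_parser import WikitextParser

    parser = WikitextParser()
    text = "See [http://example.com the docs].<ref>{{cite web|url=http://example.com}}</ref>"

    links = parser.extract_external_links(text)  # URL strings such as "http://example.com"
    cites = parser.extract_citations(text)       # entries prefixed "ref:" or "template:"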
diff --git a/test/Wikiq_Unit_Test.py b/test/Wikiq_Unit_Test.py
index b351115..f22f35e 100644
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -630,3 +630,93 @@ def test_resume_with_partition_namespaces():

     assert resumed_revids_sorted == full_revids_sorted, f"Resumed revids mismatch: {len(resumed_revids_sorted)} vs {len(full_revids_sorted)}"
     print(f"Resume with partition-namespaces test passed! Original: {len(full_revids_sorted)} revisions, Resumed: {len(resumed_revids_sorted)} revisions")
+
+
+def test_external_links_only():
+    """Test that --external-links extracts external links without --citations."""
+    tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--external-links", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify external_links column exists
+    assert "external_links" in test.columns, "external_links column should exist"
+
+    # Verify citations column does NOT exist
+    assert "citations" not in test.columns, "citations column should NOT exist when only --external-links is used"
+
+    # Verify column has list/array type (pandas reads parquet lists as numpy arrays)
+    assert test["external_links"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
+        "external_links should be a list/array type or None"
+
+    print(f"External links only test passed! {len(test)} rows processed")
+
+
+def test_citations_only():
+    """Test that --citations extracts citations without --external-links."""
+    tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--citations", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify citations column exists
+    assert "citations" in test.columns, "citations column should exist"
+
+    # Verify external_links column does NOT exist
+    assert "external_links" not in test.columns, "external_links column should NOT exist when only --citations is used"
+
+    # Verify column has list/array type (pandas reads parquet lists as numpy arrays)
+    assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
+        "citations should be a list/array type or None"
+
+    print(f"Citations only test passed! {len(test)} rows processed")
+
+
+def test_external_links_and_citations():
+    """Test that both --external-links and --citations work together (shared parser)."""
+    tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--external-links", "--citations", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify both columns exist
+    assert "external_links" in test.columns, "external_links column should exist"
+    assert "citations" in test.columns, "citations column should exist"
+
+    # Verify both columns have list/array types (pandas reads parquet lists as numpy arrays)
+    assert test["external_links"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
+        "external_links should be a list/array type or None"
+    assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
+        "citations should be a list/array type or None"
+
+    print(f"External links and citations test passed! {len(test)} rows processed")
+
+
+def test_no_wikitext_columns():
+    """Test that neither external_links nor citations columns exist without flags."""
+    tester = WikiqTester(SAILORMOON, "no_wikitext_columns", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify neither column exists
+    assert "external_links" not in test.columns, "external_links column should NOT exist without --external-links flag"
+    assert "citations" not in test.columns, "citations column should NOT exist without --citations flag"
+
+    print(f"No wikitext columns test passed! {len(test)} rows processed")
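The hasattr(x, '__len__') assertions in these tests lean on pandas materializing parquet list columns as numpy arrays, with None for nulls. A self-contained illustration of that round-trip under the same assumption (the file name is hypothetical):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({
        "external_links": pa.array([["http://example.com"], None], type=pa.list_(pa.string())),
    })
    pq.write_table(table, "demo.parquet")

    df = pd.read_parquet("demo.parquet")
    # List cells come back as array-likes (None for nulls), so len() works on non-null rows.
    assert all(x is None or hasattr(x, "__len__") for x in df["external_links"])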
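Downstream, the new columns read back like any other wikiq parquet output; a hedged sketch of consuming them (the output path and column handling are illustrative):

    import pandas as pd

    df = pd.read_parquet("output/sailormoon.parquet")  # hypothetical output path
    # None marks revisions whose text is deleted; everything else is an array of strings.
    total_links = df["external_links"].dropna().map(len).sum()
    cites = df["citations"].dropna().explode().dropna()
    refs_only = cites[cites.str.startswith("ref:")]  # "ref:" vs "template:" prefixes from WikitextParser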