Start working on adding columns from mwparserfromhell.

This commit is contained in:
Nathan TeBlunthuis 2025-12-02 12:26:03 -08:00
parent b46f98a875
commit 76626a2785
5 changed files with 223 additions and 1 deletions

View File

@ -8,6 +8,7 @@ dependencies = [
"deltas>=0.7.0", "deltas>=0.7.0",
"mediawiki-utilities>=0.4.18", "mediawiki-utilities>=0.4.18",
"more-itertools>=10.7.0", "more-itertools>=10.7.0",
"mwparserfromhell>=0.6.0",
"mwpersistence>=0.2.4", "mwpersistence>=0.2.4",
"mwreverts>=0.1.5", "mwreverts>=0.1.5",
"mwtypes>=0.4.0", "mwtypes>=0.4.0",

View File

@ -27,6 +27,7 @@ import asyncio
import wikiq.tables as tables import wikiq.tables as tables
from wikiq.tables import RevisionTable from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher from wikiq.wiki_diff_matcher import WikiDiffMatcher
from wikiq.wikitext_parser import WikitextParser
TO_ENCODE = ("title", "editor") TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7 PERSISTENCE_RADIUS = 7
@ -242,6 +243,8 @@ class WikiqParser:
batch_size: int = 1024, batch_size: int = 1024,
partition_namespaces: bool = False, partition_namespaces: bool = False,
resume_from_revid: int = None, resume_from_revid: int = None,
external_links: bool = False,
citations: bool = False,
): ):
""" """
Parameters: Parameters:
@ -258,6 +261,8 @@ class WikiqParser:
self.text = text self.text = text
self.partition_namespaces = partition_namespaces self.partition_namespaces = partition_namespaces
self.resume_from_revid = resume_from_revid self.resume_from_revid = resume_from_revid
self.external_links = external_links
self.citations = citations
if namespaces is not None: if namespaces is not None:
self.namespace_filter = set(namespaces) self.namespace_filter = set(namespaces)
else: else:
@ -397,6 +402,16 @@ class WikiqParser:
if self.collapse_user: if self.collapse_user:
table.columns.append(tables.RevisionCollapsed()) table.columns.append(tables.RevisionCollapsed())
# Create shared parser if either wikitext feature is enabled
if self.external_links or self.citations:
wikitext_parser = WikitextParser()
if self.external_links:
table.columns.append(tables.RevisionExternalLinks(wikitext_parser))
if self.citations:
table.columns.append(tables.RevisionCitations(wikitext_parser))
# extract list of namespaces # extract list of namespaces
self.namespaces = { self.namespaces = {
ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
@ -1135,6 +1150,22 @@ def main():
help="Output the text of the revision.", help="Output the text of the revision.",
) )
parser.add_argument(
"--external-links",
dest="external_links",
action="store_true",
default=False,
help="Extract external links from each revision using mwparserfromhell.",
)
parser.add_argument(
"--citations",
dest="citations",
action="store_true",
default=False,
help="Extract citations (ref tags and cite templates) from each revision.",
)
parser.add_argument( parser.add_argument(
"-PNS", "-PNS",
"--partition-namespaces", "--partition-namespaces",
@ -1239,6 +1270,8 @@ def main():
partition_namespaces=args.partition_namespaces, partition_namespaces=args.partition_namespaces,
batch_size=args.batch_size, batch_size=args.batch_size,
resume_from_revid=resume_from_revid, resume_from_revid=resume_from_revid,
external_links=args.external_links,
citations=args.citations,
) )
wikiq.process() wikiq.process()
@ -1266,6 +1299,8 @@ def main():
text=args.text, text=args.text,
batch_size=args.batch_size, batch_size=args.batch_size,
resume_from_revid=None, resume_from_revid=None,
external_links=args.external_links,
citations=args.citations,
) )
wikiq.process() wikiq.process()

View File

@ -2,7 +2,7 @@ import sys
from abc import abstractmethod, ABC from abc import abstractmethod, ABC
from datetime import datetime, timezone from datetime import datetime, timezone
from hashlib import sha1 from hashlib import sha1
from typing import Generic, TypeVar, Union from typing import Generic, TypeVar, Union, TYPE_CHECKING
import mwreverts import mwreverts
import mwtypes import mwtypes
@ -10,6 +10,9 @@ import mwxml
import pyarrow as pa import pyarrow as pa
if TYPE_CHECKING:
from wikiq.wikitext_parser import WikitextParser
T = TypeVar('T') T = TypeVar('T')
@ -217,3 +220,35 @@ class RevisionText(RevisionField[str]):
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str: def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
revision = revisions[-1] revision = revisions[-1]
return revision.text return revision.text
class RevisionExternalLinks(RevisionField[Union[list[str], None]]):
    """Column of external link URLs extracted from each revision's wikitext.

    Produces a nullable list-of-string Arrow column. The value is null when
    the revision's text is deleted; otherwise extraction is delegated to the
    shared WikitextParser, which caches parses so the same revision text is
    only parsed once across wikitext-derived columns.
    """

    field = pa.field("external_links", pa.list_(pa.string()), nullable=True)

    def __init__(self, wikitext_parser: "WikitextParser"):
        super().__init__()
        # Shared parser instance (also used by other wikitext columns).
        self.wikitext_parser = wikitext_parser

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
        latest = revisions[-1]
        return (
            None
            if latest.deleted.text
            else self.wikitext_parser.extract_external_links(latest.text)
        )
class RevisionCitations(RevisionField[Union[list[str], None]]):
    """Column of citation strings extracted from each revision's wikitext.

    Citations come from <ref> tag contents and citation-style templates.
    Produces a nullable list-of-string Arrow column; null when the
    revision's text is deleted. Parsing goes through the shared
    WikitextParser so text is parsed at most once per revision.
    """

    field = pa.field("citations", pa.list_(pa.string()), nullable=True)

    def __init__(self, wikitext_parser: "WikitextParser"):
        super().__init__()
        # Shared parser instance (also used by other wikitext columns).
        self.wikitext_parser = wikitext_parser

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
        latest = revisions[-1]
        return (
            None
            if latest.deleted.text
            else self.wikitext_parser.extract_citations(latest.text)
        )

View File

@ -0,0 +1,61 @@
"""Shared wikitext parser with caching to avoid duplicate parsing."""
from __future__ import annotations
import mwparserfromhell
class WikitextParser:
    """Parse wikitext with a one-entry cache.

    Several columns (external links, citations) may extract from the same
    revision text; caching the last parse avoids parsing each revision twice.
    """

    # Template names treated as citations. Matching is case-insensitive and
    # by prefix, so e.g. "cite web" also matches "Cite web|...".
    CITE_TEMPLATES = {
        'cite web', 'cite book', 'cite journal', 'cite news',
        'cite magazine', 'cite conference', 'cite encyclopedia',
        'cite report', 'cite thesis', 'cite press release',
        'citation', 'sfn', 'harvnb', 'harv'
    }

    def __init__(self):
        # One-entry cache: the last text parsed and its parsed wikicode.
        self._cached_text: str | None = None
        self._cached_wikicode = None

    def _get_wikicode(self, text: str):
        """Parse text and cache result. Returns cached result if text unchanged.

        The cache is updated only after a successful parse. (Previously the
        cache key was set before parsing, so a parse failure left the new
        text associated with the PREVIOUS wikicode, and a later call with
        the same text silently returned the wrong parse tree.)
        """
        if text != self._cached_text:
            wikicode = mwparserfromhell.parse(text)  # may raise on bad input
            self._cached_text = text
            self._cached_wikicode = wikicode
        return self._cached_wikicode

    def extract_external_links(self, text: str | None) -> list[str] | None:
        """Extract all external link URLs from wikitext.

        Returns None when text is None or parsing fails; extraction is
        best-effort so malformed revisions do not abort the dump run.
        """
        if text is None:
            return None
        try:
            wikicode = self._get_wikicode(text)
            return [str(link.url) for link in wikicode.filter_external_links()]
        except Exception:
            return None

    def extract_citations(self, text: str | None) -> list[str] | None:
        """Extract citations from ref tags and cite templates.

        Each citation is returned as a "ref:..." or "template:..." string.
        Returns None when text is None or parsing fails.
        """
        if text is None:
            return None
        try:
            wikicode = self._get_wikicode(text)
            citations = []
            # Extract ref tag contents. Skip self-closing tags such as
            # <ref name="x" />: their .contents is None, and str(None)
            # would otherwise produce a bogus "ref:None" entry.
            for tag in wikicode.filter_tags():
                if tag.tag.lower() == 'ref' and tag.contents is not None:
                    content = str(tag.contents).strip()
                    if content:
                        citations.append(f"ref:{content}")
            # Extract cite templates by name prefix.
            # NOTE(review): prefix matching on "citation" also catches
            # non-citation templates like "citation needed" — confirm
            # this is intended.
            for template in wikicode.filter_templates():
                template_name = str(template.name).strip().lower()
                if any(template_name.startswith(cite) for cite in self.CITE_TEMPLATES):
                    citations.append(f"template:{str(template)}")
            return citations
        except Exception:
            return None

View File

@ -630,3 +630,93 @@ def test_resume_with_partition_namespaces():
assert resumed_revids_sorted == full_revids_sorted, f"Resumed revids mismatch: {len(resumed_revids_sorted)} vs {len(full_revids_sorted)}" assert resumed_revids_sorted == full_revids_sorted, f"Resumed revids mismatch: {len(resumed_revids_sorted)} vs {len(full_revids_sorted)}"
print(f"Resume with partition-namespaces test passed! Original: {len(full_revids_sorted)} revisions, Resumed: {len(resumed_revids_sorted)} revisions") print(f"Resume with partition-namespaces test passed! Original: {len(full_revids_sorted)} revisions, Resumed: {len(resumed_revids_sorted)} revisions")
def test_external_links_only():
    """Test that --external-links extracts external links without --citations."""
    tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--external-links", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    df = pd.read_parquet(f"{tester.output}/{SAILORMOON}.parquet")

    # Only the requested wikitext column should appear in the schema.
    assert "external_links" in df.columns, "external_links column should exist"
    assert "citations" not in df.columns, "citations column should NOT exist when only --external-links is used"

    # Every value must be None or list-like (parquet lists load as numpy arrays).
    def list_like_or_none(value):
        return value is None or hasattr(value, '__len__')

    assert df["external_links"].apply(list_like_or_none).all(), \
        "external_links should be a list/array type or None"

    print(f"External links only test passed! {len(df)} rows processed")
def test_citations_only():
    """Test that --citations extracts citations without --external-links."""
    tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--citations", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    df = pd.read_parquet(f"{tester.output}/{SAILORMOON}.parquet")

    # Only the requested wikitext column should appear in the schema.
    assert "citations" in df.columns, "citations column should exist"
    assert "external_links" not in df.columns, "external_links column should NOT exist when only --citations is used"

    # Every value must be None or list-like (parquet lists load as numpy arrays).
    def list_like_or_none(value):
        return value is None or hasattr(value, '__len__')

    assert df["citations"].apply(list_like_or_none).all(), \
        "citations should be a list/array type or None"

    print(f"Citations only test passed! {len(df)} rows processed")
def test_external_links_and_citations():
    """Test that both --external-links and --citations work together (shared parser)."""
    tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--external-links", "--citations", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    df = pd.read_parquet(f"{tester.output}/{SAILORMOON}.parquet")

    # Both wikitext columns should be present when both flags are given.
    assert "external_links" in df.columns, "external_links column should exist"
    assert "citations" in df.columns, "citations column should exist"

    # Every value must be None or list-like (parquet lists load as numpy arrays).
    def list_like_or_none(value):
        return value is None or hasattr(value, '__len__')

    assert df["external_links"].apply(list_like_or_none).all(), \
        "external_links should be a list/array type or None"
    assert df["citations"].apply(list_like_or_none).all(), \
        "citations should be a list/array type or None"

    print(f"External links and citations test passed! {len(df)} rows processed")
def test_no_wikitext_columns():
    """Test that neither external_links nor citations columns exist without flags."""
    tester = WikiqTester(SAILORMOON, "no_wikitext_columns", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    df = pd.read_parquet(f"{tester.output}/{SAILORMOON}.parquet")

    # With no wikitext flags, neither optional column is in the schema.
    assert "external_links" not in df.columns, "external_links column should NOT exist without --external-links flag"
    assert "citations" not in df.columns, "citations column should NOT exist without --citations flag"

    print(f"No wikitext columns test passed! {len(df)} rows processed")