Start working on adding columns from mwparserfromhell.

This commit is contained in:
Nathan TeBlunthuis
2025-12-02 12:26:03 -08:00
parent b46f98a875
commit 76626a2785
5 changed files with 223 additions and 1 deletion

View File

@@ -27,6 +27,7 @@ import asyncio
import wikiq.tables as tables
from wikiq.tables import RevisionTable
from wikiq.wiki_diff_matcher import WikiDiffMatcher
from wikiq.wikitext_parser import WikitextParser
TO_ENCODE = ("title", "editor")
PERSISTENCE_RADIUS = 7
@@ -242,6 +243,8 @@ class WikiqParser:
batch_size: int = 1024,
partition_namespaces: bool = False,
resume_from_revid: int = None,
external_links: bool = False,
citations: bool = False,
):
"""
Parameters:
@@ -258,6 +261,8 @@ class WikiqParser:
self.text = text
self.partition_namespaces = partition_namespaces
self.resume_from_revid = resume_from_revid
self.external_links = external_links
self.citations = citations
if namespaces is not None:
self.namespace_filter = set(namespaces)
else:
@@ -397,6 +402,16 @@ class WikiqParser:
if self.collapse_user:
table.columns.append(tables.RevisionCollapsed())
# Create shared parser if either wikitext feature is enabled
if self.external_links or self.citations:
wikitext_parser = WikitextParser()
if self.external_links:
table.columns.append(tables.RevisionExternalLinks(wikitext_parser))
if self.citations:
table.columns.append(tables.RevisionCitations(wikitext_parser))
# extract list of namespaces
self.namespaces = {
ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
@@ -1135,6 +1150,22 @@ def main():
help="Output the text of the revision.",
)
parser.add_argument(
"--external-links",
dest="external_links",
action="store_true",
default=False,
help="Extract external links from each revision using mwparserfromhell.",
)
parser.add_argument(
"--citations",
dest="citations",
action="store_true",
default=False,
help="Extract citations (ref tags and cite templates) from each revision.",
)
parser.add_argument(
"-PNS",
"--partition-namespaces",
@@ -1239,6 +1270,8 @@ def main():
partition_namespaces=args.partition_namespaces,
batch_size=args.batch_size,
resume_from_revid=resume_from_revid,
external_links=args.external_links,
citations=args.citations,
)
wikiq.process()
@@ -1266,6 +1299,8 @@ def main():
text=args.text,
batch_size=args.batch_size,
resume_from_revid=None,
external_links=args.external_links,
citations=args.citations,
)
wikiq.process()

View File

@@ -2,7 +2,7 @@ import sys
from abc import abstractmethod, ABC
from datetime import datetime, timezone
from hashlib import sha1
from typing import Generic, TypeVar, Union
from typing import Generic, TypeVar, Union, TYPE_CHECKING
import mwreverts
import mwtypes
@@ -10,6 +10,9 @@ import mwxml
import pyarrow as pa
if TYPE_CHECKING:
from wikiq.wikitext_parser import WikitextParser
T = TypeVar('T')
@@ -217,3 +220,35 @@ class RevisionText(RevisionField[str]):
def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
    """Return the raw wikitext of the most recent revision.

    Only the last element of *revisions* is read; *page* is unused here
    but kept to satisfy the shared RevisionField.extract interface.
    """
    revision = revisions[-1]
    return revision.text
class RevisionExternalLinks(RevisionField[Union[list[str], None]]):
    """Column of external-link URLs pulled from each revision's wikitext.

    Delegates the actual parsing to a shared WikitextParser so the text is
    parsed at most once even when several wikitext columns are enabled.
    """

    field = pa.field("external_links", pa.list_(pa.string()), nullable=True)

    def __init__(self, wikitext_parser: "WikitextParser"):
        super().__init__()
        self.wikitext_parser = wikitext_parser

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
        """Return the external links of the latest revision, or None if its text is deleted."""
        latest = revisions[-1]
        if not latest.deleted.text:
            return self.wikitext_parser.extract_external_links(latest.text)
        return None
class RevisionCitations(RevisionField[Union[list[str], None]]):
    """Column of citation strings (ref-tag bodies and cite templates).

    Shares a WikitextParser instance with other wikitext columns so a
    revision's text is only parsed once per revision.
    """

    field = pa.field("citations", pa.list_(pa.string()), nullable=True)

    def __init__(self, wikitext_parser: "WikitextParser"):
        super().__init__()
        self.wikitext_parser = wikitext_parser

    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
        """Return the citations of the latest revision, or None if its text is deleted."""
        latest = revisions[-1]
        if not latest.deleted.text:
            return self.wikitext_parser.extract_citations(latest.text)
        return None

View File

@@ -0,0 +1,61 @@
"""Shared wikitext parser with caching to avoid duplicate parsing."""
from __future__ import annotations
import mwparserfromhell
class WikitextParser:
"""Caches parsed wikicode to avoid re-parsing the same text."""
CITE_TEMPLATES = {
'cite web', 'cite book', 'cite journal', 'cite news',
'cite magazine', 'cite conference', 'cite encyclopedia',
'cite report', 'cite thesis', 'cite press release',
'citation', 'sfn', 'harvnb', 'harv'
}
def __init__(self):
self._cached_text: str | None = None
self._cached_wikicode = None
def _get_wikicode(self, text: str):
"""Parse text and cache result. Returns cached result if text unchanged."""
if text != self._cached_text:
self._cached_text = text
self._cached_wikicode = mwparserfromhell.parse(text)
return self._cached_wikicode
def extract_external_links(self, text: str | None) -> list[str] | None:
"""Extract all external link URLs from wikitext."""
if text is None:
return None
try:
wikicode = self._get_wikicode(text)
return [str(link.url) for link in wikicode.filter_external_links()]
except Exception:
return None
def extract_citations(self, text: str | None) -> list[str] | None:
"""Extract citations from ref tags and cite templates."""
if text is None:
return None
try:
wikicode = self._get_wikicode(text)
citations = []
# Extract ref tag contents
for tag in wikicode.filter_tags():
if tag.tag.lower() == 'ref':
content = str(tag.contents).strip()
if content:
citations.append(f"ref:{content}")
# Extract cite templates
for template in wikicode.filter_templates():
template_name = str(template.name).strip().lower()
if any(template_name.startswith(cite) for cite in self.CITE_TEMPLATES):
citations.append(f"template:{str(template)}")
return citations
except Exception:
return None