Start working on adding columns from mwparserfromhell.

parent b46f98a875
commit 76626a2785
@@ -8,6 +8,7 @@ dependencies = [
     "deltas>=0.7.0",
     "mediawiki-utilities>=0.4.18",
     "more-itertools>=10.7.0",
+    "mwparserfromhell>=0.6.0",
     "mwpersistence>=0.2.4",
     "mwreverts>=0.1.5",
     "mwtypes>=0.4.0",
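Not part of the diff: a quick sanity check that the new dependency provides the calls the later hunks lean on (the sample wikitext and URL are made up):

import mwparserfromhell

code = mwparserfromhell.parse("See [https://example.org the docs].<ref>{{cite web|title=Docs}}</ref>")
# Bracketed external links expose a .url attribute; templates are found even inside <ref> tags.
print([str(link.url) for link in code.filter_external_links()])        # ['https://example.org']
print([str(t.name).strip().lower() for t in code.filter_templates()])  # ['cite web']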
@@ -27,6 +27,7 @@ import asyncio
 import wikiq.tables as tables
 from wikiq.tables import RevisionTable
 from wikiq.wiki_diff_matcher import WikiDiffMatcher
+from wikiq.wikitext_parser import WikitextParser
 
 TO_ENCODE = ("title", "editor")
 PERSISTENCE_RADIUS = 7
@@ -242,6 +243,8 @@ class WikiqParser:
         batch_size: int = 1024,
         partition_namespaces: bool = False,
         resume_from_revid: int = None,
+        external_links: bool = False,
+        citations: bool = False,
     ):
         """
         Parameters:
@@ -258,6 +261,8 @@ class WikiqParser:
         self.text = text
         self.partition_namespaces = partition_namespaces
         self.resume_from_revid = resume_from_revid
+        self.external_links = external_links
+        self.citations = citations
         if namespaces is not None:
             self.namespace_filter = set(namespaces)
         else:
@@ -397,6 +402,16 @@ class WikiqParser:
         if self.collapse_user:
             table.columns.append(tables.RevisionCollapsed())
 
+        # Create shared parser if either wikitext feature is enabled
+        if self.external_links or self.citations:
+            wikitext_parser = WikitextParser()
+
+        if self.external_links:
+            table.columns.append(tables.RevisionExternalLinks(wikitext_parser))
+
+        if self.citations:
+            table.columns.append(tables.RevisionCitations(wikitext_parser))
+
         # extract list of namespaces
         self.namespaces = {
             ns.name: ns.id for ns in dump.mwiterator.site_info.namespaces
@@ -1135,6 +1150,22 @@ def main():
         help="Output the text of the revision.",
     )
 
+    parser.add_argument(
+        "--external-links",
+        dest="external_links",
+        action="store_true",
+        default=False,
+        help="Extract external links from each revision using mwparserfromhell.",
+    )
+
+    parser.add_argument(
+        "--citations",
+        dest="citations",
+        action="store_true",
+        default=False,
+        help="Extract citations (ref tags and cite templates) from each revision.",
+    )
+
     parser.add_argument(
         "-PNS",
         "--partition-namespaces",
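As a side note (a stand-alone sketch with a hypothetical parser object, not the full wikiq CLI), the two flags behave as plain store_true switches, so both new columns stay off unless explicitly requested:

import argparse

p = argparse.ArgumentParser()
p.add_argument("--external-links", dest="external_links", action="store_true", default=False)
p.add_argument("--citations", dest="citations", action="store_true", default=False)

args = p.parse_args([])
print(args.external_links, args.citations)  # False False
args = p.parse_args(["--external-links"])
print(args.external_links, args.citations)  # True False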
@@ -1239,6 +1270,8 @@ def main():
         partition_namespaces=args.partition_namespaces,
         batch_size=args.batch_size,
         resume_from_revid=resume_from_revid,
+        external_links=args.external_links,
+        citations=args.citations,
     )
 
     wikiq.process()
@@ -1266,6 +1299,8 @@ def main():
         text=args.text,
         batch_size=args.batch_size,
         resume_from_revid=None,
+        external_links=args.external_links,
+        citations=args.citations,
     )
 
     wikiq.process()
@@ -2,7 +2,7 @@ import sys
 from abc import abstractmethod, ABC
 from datetime import datetime, timezone
 from hashlib import sha1
-from typing import Generic, TypeVar, Union
+from typing import Generic, TypeVar, Union, TYPE_CHECKING
 
 import mwreverts
 import mwtypes
@@ -10,6 +10,9 @@ import mwxml
 
 import pyarrow as pa
 
+if TYPE_CHECKING:
+    from wikiq.wikitext_parser import WikitextParser
+
 T = TypeVar('T')
 
 
@@ -217,3 +220,35 @@ class RevisionText(RevisionField[str]):
     def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> str:
         revision = revisions[-1]
         return revision.text
+
+
+class RevisionExternalLinks(RevisionField[Union[list[str], None]]):
+    """Extract all external links from revision text."""
+
+    field = pa.field("external_links", pa.list_(pa.string()), nullable=True)
+
+    def __init__(self, wikitext_parser: "WikitextParser"):
+        super().__init__()
+        self.wikitext_parser = wikitext_parser
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
+        revision = revisions[-1]
+        if revision.deleted.text:
+            return None
+        return self.wikitext_parser.extract_external_links(revision.text)
+
+
+class RevisionCitations(RevisionField[Union[list[str], None]]):
+    """Extract citations from ref tags and cite templates."""
+
+    field = pa.field("citations", pa.list_(pa.string()), nullable=True)
+
+    def __init__(self, wikitext_parser: "WikitextParser"):
+        super().__init__()
+        self.wikitext_parser = wikitext_parser
+
+    def extract(self, page: mwtypes.Page, revisions: list[mwxml.Revision]) -> Union[list[str], None]:
+        revision = revisions[-1]
+        if revision.deleted.text:
+            return None
+        return self.wikitext_parser.extract_citations(revision.text)
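Both new columns declare the same Arrow type, a nullable list of strings. A minimal sketch (values are illustrative) of what that schema admits, one list per revision or null for the deleted-text case:

import pyarrow as pa

field = pa.field("external_links", pa.list_(pa.string()), nullable=True)
arr = pa.array([["https://example.org"], None, []], type=pa.list_(pa.string()))
print(field.type)      # list<item: string>
print(arr.null_count)  # 1 -- the None entry stands in for a revision with deleted text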
src/wikiq/wikitext_parser.py (new file, 61 lines)
@@ -0,0 +1,61 @@
+"""Shared wikitext parser with caching to avoid duplicate parsing."""
+from __future__ import annotations
+
+import mwparserfromhell
+
+
+class WikitextParser:
+    """Caches parsed wikicode to avoid re-parsing the same text."""
+
+    CITE_TEMPLATES = {
+        'cite web', 'cite book', 'cite journal', 'cite news',
+        'cite magazine', 'cite conference', 'cite encyclopedia',
+        'cite report', 'cite thesis', 'cite press release',
+        'citation', 'sfn', 'harvnb', 'harv'
+    }
+
+    def __init__(self):
+        self._cached_text: str | None = None
+        self._cached_wikicode = None
+
+    def _get_wikicode(self, text: str):
+        """Parse text and cache result. Returns cached result if text unchanged."""
+        if text != self._cached_text:
+            self._cached_text = text
+            self._cached_wikicode = mwparserfromhell.parse(text)
+        return self._cached_wikicode
+
+    def extract_external_links(self, text: str | None) -> list[str] | None:
+        """Extract all external link URLs from wikitext."""
+        if text is None:
+            return None
+        try:
+            wikicode = self._get_wikicode(text)
+            return [str(link.url) for link in wikicode.filter_external_links()]
+        except Exception:
+            return None
+
+    def extract_citations(self, text: str | None) -> list[str] | None:
+        """Extract citations from ref tags and cite templates."""
+        if text is None:
+            return None
+        try:
+            wikicode = self._get_wikicode(text)
+            citations = []
+
+            # Extract ref tag contents
+            for tag in wikicode.filter_tags():
+                if tag.tag.lower() == 'ref':
+                    content = str(tag.contents).strip()
+                    if content:
+                        citations.append(f"ref:{content}")
+
+            # Extract cite templates
+            for template in wikicode.filter_templates():
+                template_name = str(template.name).strip().lower()
+                if any(template_name.startswith(cite) for cite in self.CITE_TEMPLATES):
+                    citations.append(f"template:{str(template)}")
+
+            return citations
+        except Exception:
+            return None
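A usage sketch of the new module, assuming this branch of wikiq is installed; the sample wikitext is illustrative only. Both extract calls see the same text, so the cache means mwparserfromhell.parse runs once per revision even when both columns are enabled:

from wikiq.wikitext_parser import WikitextParser

parser = WikitextParser()
text = "See [https://example.org the docs].<ref>{{cite web|title=Docs}}</ref>"

print(parser.extract_external_links(text))  # ['https://example.org']
print(parser.extract_citations(text))       # the ref contents plus the cite template, prefixed "ref:" / "template:"
print(parser.extract_citations(None))       # None -- deleted or missing text is passed through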
@@ -630,3 +630,93 @@ def test_resume_with_partition_namespaces():
     assert resumed_revids_sorted == full_revids_sorted, f"Resumed revids mismatch: {len(resumed_revids_sorted)} vs {len(full_revids_sorted)}"
 
     print(f"Resume with partition-namespaces test passed! Original: {len(full_revids_sorted)} revisions, Resumed: {len(resumed_revids_sorted)} revisions")
+
+
+def test_external_links_only():
+    """Test that --external-links extracts external links without --citations."""
+    tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--external-links", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify external_links column exists
+    assert "external_links" in test.columns, "external_links column should exist"
+
+    # Verify citations column does NOT exist
+    assert "citations" not in test.columns, "citations column should NOT exist when only --external-links is used"
+
+    # Verify column has list/array type (pandas reads parquet lists as numpy arrays)
+    assert test["external_links"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
+        "external_links should be a list/array type or None"
+
+    print(f"External links only test passed! {len(test)} rows processed")
+
+
+def test_citations_only():
+    """Test that --citations extracts citations without --external-links."""
+    tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--citations", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify citations column exists
+    assert "citations" in test.columns, "citations column should exist"
+
+    # Verify external_links column does NOT exist
+    assert "external_links" not in test.columns, "external_links column should NOT exist when only --citations is used"
+
+    # Verify column has list/array type (pandas reads parquet lists as numpy arrays)
+    assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
+        "citations should be a list/array type or None"
+
+    print(f"Citations only test passed! {len(test)} rows processed")
+
+
+def test_external_links_and_citations():
+    """Test that both --external-links and --citations work together (shared parser)."""
+    tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--external-links", "--citations", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify both columns exist
+    assert "external_links" in test.columns, "external_links column should exist"
+    assert "citations" in test.columns, "citations column should exist"
+
+    # Verify both columns have list/array types (pandas reads parquet lists as numpy arrays)
+    assert test["external_links"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
+        "external_links should be a list/array type or None"
+    assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
+        "citations should be a list/array type or None"
+
+    print(f"External links and citations test passed! {len(test)} rows processed")
+
+
+def test_no_wikitext_columns():
+    """Test that neither external_links nor citations columns exist without flags."""
+    tester = WikiqTester(SAILORMOON, "no_wikitext_columns", in_compression="7z", out_format="parquet")
+
+    try:
+        tester.call_wikiq("--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+
+    # Verify neither column exists
+    assert "external_links" not in test.columns, "external_links column should NOT exist without --external-links flag"
+    assert "citations" not in test.columns, "citations column should NOT exist without --citations flag"
+
+    print(f"No wikitext columns test passed! {len(test)} rows processed")