Improve tests: verify extracted external links and citations against the revision wikitext.

This commit is contained in:
Nathan TeBlunthuis 2025-12-02 13:52:12 -08:00
parent 76626a2785
commit 329341efb6

View File

@ -633,11 +633,14 @@ def test_resume_with_partition_namespaces():
def test_external_links_only(): def test_external_links_only():
"""Test that --external-links extracts external links without --citations.""" """Test that --external-links extracts external links correctly."""
import mwparserfromhell
tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet") tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")
try: try:
tester.call_wikiq("--external-links", "--fandom-2020") # Also include --text so we can verify extraction against actual wikitext
tester.call_wikiq("--external-links", "--text", "--fandom-2020")
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
@ -653,15 +656,46 @@ def test_external_links_only():
assert test["external_links"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \ assert test["external_links"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
"external_links should be a list/array type or None" "external_links should be a list/array type or None"
print(f"External links only test passed! {len(test)} rows processed") # Verify that extracted URLs look like valid URIs (have a scheme or are protocol-relative)
all_urls = []
for links in test["external_links"]:
if links is not None and len(links) > 0:
all_urls.extend(links)
for url in all_urls:
# External links can be http, https, mailto, ftp, etc. or protocol-relative (//)
has_scheme = ":" in url and url.index(":") < 10 # scheme:... with short scheme
is_protocol_relative = url.startswith("//")
assert has_scheme or is_protocol_relative, \
f"External link should be a valid URI, got: {url}"
# Verify extraction matches mwparserfromhell for a sample of rows with text
rows_with_links = test[test["external_links"].apply(lambda x: x is not None and len(x) > 0)]
if len(rows_with_links) > 0:
# Test up to 5 rows
sample = rows_with_links.head(5)
for idx, row in sample.iterrows():
text = row["text"]
if text:
wikicode = mwparserfromhell.parse(text)
expected_links = [str(link.url) for link in wikicode.filter_external_links()]
actual_links = list(row["external_links"])
assert actual_links == expected_links, \
f"Row {idx}: external_links mismatch. Expected {expected_links}, got {actual_links}"
print(f"External links only test passed! {len(test)} rows, {len(all_urls)} total URLs extracted")
def test_citations_only(): def test_citations_only():
"""Test that --citations extracts citations without --external-links.""" """Test that --citations extracts citations correctly."""
import mwparserfromhell
from wikiq.wikitext_parser import WikitextParser
tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet") tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")
try: try:
tester.call_wikiq("--citations", "--fandom-2020") # Also include --text so we can verify extraction against actual wikitext
tester.call_wikiq("--citations", "--text", "--fandom-2020")
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
@ -677,15 +711,43 @@ def test_citations_only():
assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \ assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
"citations should be a list/array type or None" "citations should be a list/array type or None"
print(f"Citations only test passed! {len(test)} rows processed") # Verify that extracted citations have correct prefixes (ref: or template:)
all_citations = []
for citations in test["citations"]:
if citations is not None and len(citations) > 0:
all_citations.extend(citations)
for citation in all_citations:
assert citation.startswith("ref:") or citation.startswith("template:"), \
f"Citation should start with 'ref:' or 'template:', got: {citation}"
# Verify extraction matches WikitextParser for a sample of rows with text
rows_with_citations = test[test["citations"].apply(lambda x: x is not None and len(x) > 0)]
if len(rows_with_citations) > 0:
parser = WikitextParser()
# Test up to 5 rows
sample = rows_with_citations.head(5)
for idx, row in sample.iterrows():
text = row["text"]
if text:
expected_citations = parser.extract_citations(text)
actual_citations = list(row["citations"])
assert actual_citations == expected_citations, \
f"Row {idx}: citations mismatch. Expected {expected_citations}, got {actual_citations}"
print(f"Citations only test passed! {len(test)} rows, {len(all_citations)} total citations extracted")
def test_external_links_and_citations(): def test_external_links_and_citations():
"""Test that both --external-links and --citations work together (shared parser).""" """Test that both --external-links and --citations work together (shared parser)."""
import mwparserfromhell
from wikiq.wikitext_parser import WikitextParser
tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet") tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")
try: try:
tester.call_wikiq("--external-links", "--citations", "--fandom-2020") # Also include --text so we can verify extraction against actual wikitext
tester.call_wikiq("--external-links", "--citations", "--text", "--fandom-2020")
except subprocess.CalledProcessError as exc: except subprocess.CalledProcessError as exc:
pytest.fail(exc.stderr.decode("utf8")) pytest.fail(exc.stderr.decode("utf8"))
@ -701,7 +763,56 @@ def test_external_links_and_citations():
assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \ assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
"citations should be a list/array type or None" "citations should be a list/array type or None"
print(f"External links and citations test passed! {len(test)} rows processed") # Verify URLs look like valid URIs (have a scheme or are protocol-relative)
all_urls = []
for links in test["external_links"]:
if links is not None and len(links) > 0:
all_urls.extend(links)
for url in all_urls:
# External links can be http, https, mailto, ftp, etc. or protocol-relative (//)
has_scheme = ":" in url and url.index(":") < 10 # scheme:... with short scheme
is_protocol_relative = url.startswith("//")
assert has_scheme or is_protocol_relative, \
f"External link should be a valid URI, got: {url}"
# Verify citations have correct prefixes
all_citations = []
for citations in test["citations"]:
if citations is not None and len(citations) > 0:
all_citations.extend(citations)
for citation in all_citations:
assert citation.startswith("ref:") or citation.startswith("template:"), \
f"Citation should start with 'ref:' or 'template:', got: {citation}"
# Verify extraction matches WikitextParser for a sample of rows with text
# This tests that the shared parser optimization works correctly
parser = WikitextParser()
rows_with_content = test[
(test["external_links"].apply(lambda x: x is not None and len(x) > 0)) |
(test["citations"].apply(lambda x: x is not None and len(x) > 0))
]
if len(rows_with_content) > 0:
# Test up to 5 rows
sample = rows_with_content.head(5)
for idx, row in sample.iterrows():
text = row["text"]
if text:
# Verify external links
wikicode = mwparserfromhell.parse(text)
expected_links = [str(link.url) for link in wikicode.filter_external_links()]
actual_links = list(row["external_links"]) if row["external_links"] is not None else []
assert actual_links == expected_links, \
f"Row {idx}: external_links mismatch. Expected {expected_links}, got {actual_links}"
# Verify citations
expected_citations = parser.extract_citations(text)
actual_citations = list(row["citations"]) if row["citations"] is not None else []
assert actual_citations == expected_citations, \
f"Row {idx}: citations mismatch. Expected {expected_citations}, got {actual_citations}"
print(f"External links and citations test passed! {len(test)} rows, {len(all_urls)} URLs, {len(all_citations)} citations")
def test_no_wikitext_columns(): def test_no_wikitext_columns():