improve tests.
This commit is contained in:
parent 76626a2785
commit 329341efb6
@@ -633,11 +633,14 @@ def test_resume_with_partition_namespaces():
|
||||
|
||||
|
||||
def test_external_links_only():
|
||||
"""Test that --external-links extracts external links without --citations."""
|
||||
"""Test that --external-links extracts external links correctly."""
|
||||
import mwparserfromhell
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "external_links_only", in_compression="7z", out_format="parquet")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--external-links", "--fandom-2020")
|
||||
# Also include --text so we can verify extraction against actual wikitext
|
||||
tester.call_wikiq("--external-links", "--text", "--fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
@@ -653,15 +656,46 @@ def test_external_links_only():
|
||||
assert test["external_links"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
|
||||
"external_links should be a list/array type or None"
|
||||
|
||||
print(f"External links only test passed! {len(test)} rows processed")
|
||||
# Verify that extracted URLs look like valid URIs (have a scheme or are protocol-relative)
|
||||
all_urls = []
|
||||
for links in test["external_links"]:
|
||||
if links is not None and len(links) > 0:
|
||||
all_urls.extend(links)
|
||||
|
||||
for url in all_urls:
|
||||
# External links can be http, https, mailto, ftp, etc. or protocol-relative (//)
|
||||
has_scheme = ":" in url and url.index(":") < 10 # scheme:... with short scheme
|
||||
is_protocol_relative = url.startswith("//")
|
||||
assert has_scheme or is_protocol_relative, \
|
||||
f"External link should be a valid URI, got: {url}"
|
||||
|
||||
# Verify extraction matches mwparserfromhell for a sample of rows with text
|
||||
rows_with_links = test[test["external_links"].apply(lambda x: x is not None and len(x) > 0)]
|
||||
if len(rows_with_links) > 0:
|
||||
# Test up to 5 rows
|
||||
sample = rows_with_links.head(5)
|
||||
for idx, row in sample.iterrows():
|
||||
text = row["text"]
|
||||
if text:
|
||||
wikicode = mwparserfromhell.parse(text)
|
||||
expected_links = [str(link.url) for link in wikicode.filter_external_links()]
|
||||
actual_links = list(row["external_links"])
|
||||
assert actual_links == expected_links, \
|
||||
f"Row {idx}: external_links mismatch. Expected {expected_links}, got {actual_links}"
|
||||
|
||||
print(f"External links only test passed! {len(test)} rows, {len(all_urls)} total URLs extracted")
|
||||
|
||||
|
||||
def test_citations_only():
|
||||
"""Test that --citations extracts citations without --external-links."""
|
||||
"""Test that --citations extracts citations correctly."""
|
||||
import mwparserfromhell
|
||||
from wikiq.wikitext_parser import WikitextParser
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "citations_only", in_compression="7z", out_format="parquet")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--citations", "--fandom-2020")
|
||||
# Also include --text so we can verify extraction against actual wikitext
|
||||
tester.call_wikiq("--citations", "--text", "--fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
@@ -677,15 +711,43 @@ def test_citations_only():
|
||||
assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
|
||||
"citations should be a list/array type or None"
|
||||
|
||||
print(f"Citations only test passed! {len(test)} rows processed")
|
||||
# Verify that extracted citations have correct prefixes (ref: or template:)
|
||||
all_citations = []
|
||||
for citations in test["citations"]:
|
||||
if citations is not None and len(citations) > 0:
|
||||
all_citations.extend(citations)
|
||||
|
||||
for citation in all_citations:
|
||||
assert citation.startswith("ref:") or citation.startswith("template:"), \
|
||||
f"Citation should start with 'ref:' or 'template:', got: {citation}"
|
||||
|
||||
# Verify extraction matches WikitextParser for a sample of rows with text
|
||||
rows_with_citations = test[test["citations"].apply(lambda x: x is not None and len(x) > 0)]
|
||||
if len(rows_with_citations) > 0:
|
||||
parser = WikitextParser()
|
||||
# Test up to 5 rows
|
||||
sample = rows_with_citations.head(5)
|
||||
for idx, row in sample.iterrows():
|
||||
text = row["text"]
|
||||
if text:
|
||||
expected_citations = parser.extract_citations(text)
|
||||
actual_citations = list(row["citations"])
|
||||
assert actual_citations == expected_citations, \
|
||||
f"Row {idx}: citations mismatch. Expected {expected_citations}, got {actual_citations}"
|
||||
|
||||
print(f"Citations only test passed! {len(test)} rows, {len(all_citations)} total citations extracted")
|
||||
|
||||
|
||||
def test_external_links_and_citations():
|
||||
"""Test that both --external-links and --citations work together (shared parser)."""
|
||||
import mwparserfromhell
|
||||
from wikiq.wikitext_parser import WikitextParser
|
||||
|
||||
tester = WikiqTester(SAILORMOON, "external_links_and_citations", in_compression="7z", out_format="parquet")
|
||||
|
||||
try:
|
||||
tester.call_wikiq("--external-links", "--citations", "--fandom-2020")
|
||||
# Also include --text so we can verify extraction against actual wikitext
|
||||
tester.call_wikiq("--external-links", "--citations", "--text", "--fandom-2020")
|
||||
except subprocess.CalledProcessError as exc:
|
||||
pytest.fail(exc.stderr.decode("utf8"))
|
||||
|
||||
@@ -701,7 +763,56 @@ def test_external_links_and_citations():
|
||||
assert test["citations"].apply(lambda x: x is None or hasattr(x, '__len__')).all(), \
|
||||
"citations should be a list/array type or None"
|
||||
|
||||
print(f"External links and citations test passed! {len(test)} rows processed")
|
||||
# Verify URLs look like valid URIs (have a scheme or are protocol-relative)
|
||||
all_urls = []
|
||||
for links in test["external_links"]:
|
||||
if links is not None and len(links) > 0:
|
||||
all_urls.extend(links)
|
||||
|
||||
for url in all_urls:
|
||||
# External links can be http, https, mailto, ftp, etc. or protocol-relative (//)
|
||||
has_scheme = ":" in url and url.index(":") < 10 # scheme:... with short scheme
|
||||
is_protocol_relative = url.startswith("//")
|
||||
assert has_scheme or is_protocol_relative, \
|
||||
f"External link should be a valid URI, got: {url}"
|
||||
|
||||
# Verify citations have correct prefixes
|
||||
all_citations = []
|
||||
for citations in test["citations"]:
|
||||
if citations is not None and len(citations) > 0:
|
||||
all_citations.extend(citations)
|
||||
|
||||
for citation in all_citations:
|
||||
assert citation.startswith("ref:") or citation.startswith("template:"), \
|
||||
f"Citation should start with 'ref:' or 'template:', got: {citation}"
|
||||
|
||||
# Verify extraction matches WikitextParser for a sample of rows with text
|
||||
# This tests that the shared parser optimization works correctly
|
||||
parser = WikitextParser()
|
||||
rows_with_content = test[
|
||||
(test["external_links"].apply(lambda x: x is not None and len(x) > 0)) |
|
||||
(test["citations"].apply(lambda x: x is not None and len(x) > 0))
|
||||
]
|
||||
if len(rows_with_content) > 0:
|
||||
# Test up to 5 rows
|
||||
sample = rows_with_content.head(5)
|
||||
for idx, row in sample.iterrows():
|
||||
text = row["text"]
|
||||
if text:
|
||||
# Verify external links
|
||||
wikicode = mwparserfromhell.parse(text)
|
||||
expected_links = [str(link.url) for link in wikicode.filter_external_links()]
|
||||
actual_links = list(row["external_links"]) if row["external_links"] is not None else []
|
||||
assert actual_links == expected_links, \
|
||||
f"Row {idx}: external_links mismatch. Expected {expected_links}, got {actual_links}"
|
||||
|
||||
# Verify citations
|
||||
expected_citations = parser.extract_citations(text)
|
||||
actual_citations = list(row["citations"]) if row["citations"] is not None else []
|
||||
assert actual_citations == expected_citations, \
|
||||
f"Row {idx}: citations mismatch. Expected {expected_citations}, got {actual_citations}"
|
||||
|
||||
print(f"External links and citations test passed! {len(test)} rows, {len(all_urls)} URLs, {len(all_citations)} citations")
|
||||
|
||||
|
||||
def test_no_wikitext_columns():
|
||||
|
||||
Loading…
Reference in New Issue
Block a user