extract wikilinks.

This commit is contained in:
Nathan TeBlunthuis
2025-12-02 14:09:29 -08:00
parent 329341efb6
commit d3517ed5ca
4 changed files with 94 additions and 2 deletions

View File

@@ -831,3 +831,44 @@ def test_no_wikitext_columns():
assert "citations" not in test.columns, "citations column should NOT exist without --citations flag"
print(f"No wikitext columns test passed! {len(test)} rows processed")
def test_wikilinks():
    """Check the ``wikilinks`` column produced by the ``--wikilinks`` flag.

    Calls wikiq through ``WikiqTester`` on the SAILORMOON 7z input with
    parquet output, then verifies that:

    * a ``wikilinks`` column is present,
    * every value in it is either ``None`` or has a length, and
    * for up to five rows that carry at least one link, the stored
      ``title``/``text`` pairs equal what mwparserfromhell extracts from
      that row's ``text`` column.
    """
    import mwparserfromhell

    def is_none_or_sized(value):
        # Accept a missing value or anything list/array-like.
        return value is None or hasattr(value, "__len__")

    def has_links(value):
        # Explicit len() check: array truthiness can be ambiguous.
        return value is not None and len(value) > 0

    tester = WikiqTester(
        SAILORMOON, "wikilinks", in_compression="7z", out_format="parquet"
    )
    try:
        tester.call_wikiq("--wikilinks", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Surface the subprocess's stderr as the failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    parquet_path = tester.output + f"/{SAILORMOON}.parquet"
    frame = pd.read_parquet(parquet_path)

    # The column must exist and hold list/array-like (or missing) values.
    assert "wikilinks" in frame.columns, "wikilinks column should exist"
    assert frame["wikilinks"].apply(is_none_or_sized).all()

    # Compare against mwparserfromhell on the first few rows with links.
    # If no row has links, head(5) is empty and the loop does not run.
    linked_rows = frame[frame["wikilinks"].apply(has_links)]
    for idx, row in linked_rows.head(5).iterrows():
        text = row["text"]
        if not text:
            continue

        expected = [
            {
                "title": str(link.title).strip(),
                # Links without display text are recorded with text=None.
                "text": str(link.text).strip() if link.text else None,
            }
            for link in mwparserfromhell.parse(text).filter_wikilinks()
        ]

        # Normalise each stored item to a plain dict
        # (pandas may read them as dicts or named tuples).
        actual_dicts = [
            {"title": item["title"], "text": item["text"]}
            for item in row["wikilinks"]
        ]
        assert actual_dicts == expected, f"Row {idx}: wikilinks mismatch"

    print(f"Wikilinks test passed! {len(frame)} rows processed")