extract wikilinks.
This commit is contained in:
@@ -831,3 +831,44 @@ def test_no_wikitext_columns():
|
||||
assert "citations" not in test.columns, "citations column should NOT exist without --citations flag"
|
||||
|
||||
print(f"No wikitext columns test passed! {len(test)} rows processed")
|
||||
|
||||
|
||||
def test_wikilinks():
    """Check the ``wikilinks`` column produced by the ``--wikilinks`` flag.

    Runs wikiq on the SAILORMOON dump with ``--wikilinks --text
    --fandom-2020``, loads the parquet output, and for up to five rows that
    contain links compares the stored ``title``/``text`` pairs against the
    links mwparserfromhell finds in the same row's ``text``.
    """
    import mwparserfromhell

    def is_missing_or_sized(cell):
        # A cell is acceptable when absent or when it exposes a length.
        return cell is None or hasattr(cell, '__len__')

    def has_links(cell):
        return cell is not None and len(cell) > 0

    tester = WikiqTester(SAILORMOON, "wikilinks", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--wikilinks", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Surface wikiq's stderr as the failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    parquet_path = tester.output + f"/{SAILORMOON}.parquet"
    frame = pd.read_parquet(parquet_path)

    # The column must be present ...
    assert "wikilinks" in frame.columns, "wikilinks column should exist"

    # ... and every cell must be either missing or list/array-like.
    assert frame["wikilinks"].apply(is_missing_or_sized).all()

    # Cross-check a small sample of link-bearing rows against mwparserfromhell.
    # Iterating head(5) of an empty selection simply does nothing, so no
    # separate emptiness check is required.
    linked_rows = frame[frame["wikilinks"].apply(has_links)]
    for idx, row in linked_rows.head(5).iterrows():
        text = row["text"]
        if not text:
            continue

        expected = [
            {
                "title": str(link.title).strip(),
                "text": str(link.text).strip() if link.text else None,
            }
            for link in mwparserfromhell.parse(text).filter_wikilinks()
        ]

        # Normalise the stored entries to plain dicts before comparing.
        actual_dicts = [
            {"title": entry["title"], "text": entry["text"]}
            for entry in row["wikilinks"]
        ]
        assert actual_dicts == expected, f"Row {idx}: wikilinks mismatch"

    print(f"Wikilinks test passed! {len(frame)} rows processed")
|
||||
|
||||
Reference in New Issue
Block a user