add templates and headings to wikiq.

This commit is contained in:
Nathan TeBlunthuis
2025-12-02 17:51:08 -08:00
parent d3517ed5ca
commit 5ce9808b50
5 changed files with 239 additions and 6 deletions

View File

@@ -872,3 +872,94 @@ def test_wikilinks():
assert actual_dicts == expected, f"Row {idx}: wikilinks mismatch"
print(f"Wikilinks test passed! {len(test)} rows processed")
def test_templates():
    """Check the ``templates`` column written by wikiq when run with --templates.

    Runs wikiq on the SAILORMOON dump with ``--templates --text --fandom-2020``,
    loads the resulting parquet file, and then:

    * asserts that a ``templates`` column is present,
    * asserts every cell is either ``None`` or something with a length,
    * for up to five rows that have at least one template, re-parses the
      revision text with mwparserfromhell and asserts that the stored
      name/params pairs equal what the parser returns.
    """
    import mwparserfromhell

    tester = WikiqTester(SAILORMOON, "templates", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--templates", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Surface wikiq's stderr as the failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    parquet_path = tester.output + f"/{SAILORMOON}.parquet"
    frame = pd.read_parquet(parquet_path)

    # The column must be present ...
    assert "templates" in frame.columns, "templates column should exist"

    # ... and each cell must be None or a sized container.
    def _is_sized_or_none(cell):
        return cell is None or hasattr(cell, '__len__')

    assert frame["templates"].apply(_is_sized_or_none).all()

    # Select the rows that carry at least one template.
    def _is_non_empty(cell):
        return cell is not None and len(cell) > 0

    populated = frame[frame["templates"].apply(_is_non_empty)]

    # Compare a small sample against a fresh mwparserfromhell parse. When no
    # row has templates, head(5) is empty and the loop body never runs.
    for idx, record in populated.head(5).iterrows():
        revision_text = record["text"]
        if not revision_text:
            continue

        parsed = mwparserfromhell.parse(revision_text)
        expected = [
            {
                "name": str(tpl.name).strip(),
                "params": {
                    str(arg.name).strip(): str(arg.value).strip()
                    for arg in tpl.params
                },
            }
            for tpl in parsed.filter_templates()
        ]

        # Normalise the stored values into plain dicts so they compare
        # directly against ``expected``.
        observed = [
            {
                "name": entry["name"],
                "params": dict(entry["params"]) if entry["params"] else {},
            }
            for entry in list(record["templates"])
        ]

        assert observed == expected, f"Row {idx}: templates mismatch"

    print(f"Templates test passed! {len(frame)} rows processed")
def test_headings():
    """Check the ``headings`` column written by wikiq when run with --headings.

    Runs wikiq on the SAILORMOON dump with ``--headings --text --fandom-2020``,
    loads the resulting parquet file, and then:

    * asserts that a ``headings`` column is present,
    * asserts every cell is either ``None`` or something with a length,
    * for up to five rows that have at least one heading, re-parses the
      revision text with mwparserfromhell and asserts that the stored
      level/text pairs equal what the parser returns.
    """
    import mwparserfromhell

    tester = WikiqTester(SAILORMOON, "headings", in_compression="7z", out_format="parquet")

    try:
        tester.call_wikiq("--headings", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Surface wikiq's stderr as the failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    parquet_path = tester.output + f"/{SAILORMOON}.parquet"
    frame = pd.read_parquet(parquet_path)

    # The column must be present ...
    assert "headings" in frame.columns, "headings column should exist"

    # ... and each cell must be None or a sized container.
    def _is_sized_or_none(cell):
        return cell is None or hasattr(cell, '__len__')

    assert frame["headings"].apply(_is_sized_or_none).all()

    # Select the rows that carry at least one heading.
    def _is_non_empty(cell):
        return cell is not None and len(cell) > 0

    populated = frame[frame["headings"].apply(_is_non_empty)]

    # Compare a small sample against a fresh mwparserfromhell parse. When no
    # row has headings, head(5) is empty and the loop body never runs.
    for idx, record in populated.head(5).iterrows():
        revision_text = record["text"]
        if not revision_text:
            continue

        parsed = mwparserfromhell.parse(revision_text)
        expected = [
            {"level": section.level, "text": str(section.title).strip()}
            for section in parsed.filter_headings()
        ]

        # Keep only the two compared fields from each stored entry.
        observed = [
            {"level": entry["level"], "text": entry["text"]}
            for entry in list(record["headings"])
        ]

        assert observed == expected, f"Row {idx}: headings mismatch"

    print(f"Headings test passed! {len(frame)} rows processed")