Add template and heading extraction (--templates, --headings) to wikiq, with tests.
This commit is contained in:
@@ -872,3 +872,94 @@ def test_wikilinks():
|
||||
assert actual_dicts == expected, f"Row {idx}: wikilinks mismatch"
|
||||
|
||||
print(f"Wikilinks test passed! {len(test)} rows processed")
|
||||
|
||||
|
||||
def test_templates():
    """Check the ``templates`` column produced by ``wikiq --templates``.

    Runs wikiq on the SAILORMOON dump with ``--templates --text
    --fandom-2020``, reads the resulting parquet file, and verifies that:

    * a ``templates`` column is present,
    * every cell is either ``None`` or something exposing ``__len__``,
    * for up to five rows that carry at least one template, the stored
      name/params pairs equal what ``mwparserfromhell`` yields when parsing
      that row's ``text`` directly.
    """
    import mwparserfromhell

    def _is_sized_or_missing(cell):
        # A cell is acceptable when it is absent or has a length.
        return cell is None or hasattr(cell, '__len__')

    def _is_non_empty(cell):
        # Only rows with at least one extracted template are worth comparing.
        return cell is not None and len(cell) > 0

    tester = WikiqTester(
        SAILORMOON,
        "templates",
        in_compression="7z",
        out_format="parquet",
    )

    try:
        tester.call_wikiq("--templates", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    frame = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")

    # The column must exist and hold list-like (or missing) values.
    assert "templates" in frame.columns, "templates column should exist"
    assert frame["templates"].apply(_is_sized_or_missing).all()

    # Cross-check the first few populated rows against mwparserfromhell.
    # Taking head(5) of an empty selection simply yields no iterations.
    populated = frame[frame["templates"].apply(_is_non_empty)]
    for idx, row in populated.head(5).iterrows():
        wikitext = row["text"]
        if not wikitext:
            continue

        parsed = mwparserfromhell.parse(wikitext)
        expected = [
            {
                "name": str(tpl.name).strip(),
                # Later duplicates of a parameter name overwrite earlier ones.
                "params": {
                    str(arg.name).strip(): str(arg.value).strip()
                    for arg in tpl.params
                },
            }
            for tpl in parsed.filter_templates()
        ]

        # Normalise the stored records into plain dicts for comparison.
        actual_list = [
            {
                "name": entry["name"],
                "params": dict(entry["params"]) if entry["params"] else {},
            }
            for entry in list(row["templates"])
        ]

        assert actual_list == expected, f"Row {idx}: templates mismatch"

    print(f"Templates test passed! {len(frame)} rows processed")
|
||||
|
||||
|
||||
def test_headings():
    """Check the ``headings`` column produced by ``wikiq --headings``.

    Runs wikiq on the SAILORMOON dump with ``--headings --text
    --fandom-2020``, reads the resulting parquet file, and verifies that:

    * a ``headings`` column is present,
    * every cell is either ``None`` or something exposing ``__len__``,
    * for up to five rows that carry at least one heading, the stored
      level/text pairs equal what ``mwparserfromhell`` yields when parsing
      that row's ``text`` directly.
    """
    import mwparserfromhell

    def _is_sized_or_missing(cell):
        # A cell is acceptable when it is absent or has a length.
        return cell is None or hasattr(cell, '__len__')

    def _is_non_empty(cell):
        # Only rows with at least one extracted heading are worth comparing.
        return cell is not None and len(cell) > 0

    tester = WikiqTester(
        SAILORMOON,
        "headings",
        in_compression="7z",
        out_format="parquet",
    )

    try:
        tester.call_wikiq("--headings", "--text", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    frame = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")

    # The column must exist and hold list-like (or missing) values.
    assert "headings" in frame.columns, "headings column should exist"
    assert frame["headings"].apply(_is_sized_or_missing).all()

    # Cross-check the first few populated rows against mwparserfromhell.
    # Taking head(5) of an empty selection simply yields no iterations.
    populated = frame[frame["headings"].apply(_is_non_empty)]
    for idx, row in populated.head(5).iterrows():
        wikitext = row["text"]
        if not wikitext:
            continue

        parsed = mwparserfromhell.parse(wikitext)
        expected = [
            {"level": node.level, "text": str(node.title).strip()}
            for node in parsed.filter_headings()
        ]

        # Normalise the stored records into plain dicts for comparison.
        actual_list = [
            {"level": entry["level"], "text": entry["text"]}
            for entry in list(row["headings"])
        ]

        assert actual_list == expected, f"Row {idx}: headings mismatch"

    print(f"Headings test passed! {len(frame)} rows processed")
|
||||
|
||||
Reference in New Issue
Block a user