add templates and headings to wikiq.

2025-12-02 17:51:08 -08:00
parent d3517ed5ca
commit 5ce9808b50
5 changed files with 239 additions and 6 deletions
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -872,3 +872,94 @@ def test_wikilinks():
                assert actual_dicts == expected, f"Row {idx}: wikilinks mismatch"

    print(f"Wikilinks test passed! {len(test)} rows processed")
+
+
+def test_templates():
+    """Check the ``templates`` column emitted by ``--templates``.
+
+    Up to five rows with templates are re-parsed with mwparserfromhell and
+    compared on template name and whitespace-stripped parameters.
+    """
+    import mwparserfromhell
+
+    tester = WikiqTester(
+        SAILORMOON, "templates", in_compression="7z", out_format="parquet"
+    )
+
+    try:
+        tester.call_wikiq("--templates", "--text", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+    assert "templates" in test.columns, "templates column should exist"
+
+    # Every cell must be either null or something sized (list/array).
+    column = test["templates"]
+    assert column.apply(lambda x: x is None or hasattr(x, '__len__')).all()
+
+    # Cross-check the first few non-empty rows against a fresh parse; an empty
+    # selection simply yields no iterations.
+    non_empty = column.apply(lambda x: x is not None and len(x) > 0)
+    for idx, row in test[non_empty].head(5).iterrows():
+        text = row["text"]
+        if not text:
+            continue
+        expected = [
+            {
+                "name": str(tpl.name).strip(),
+                "params": {
+                    str(p.name).strip(): str(p.value).strip() for p in tpl.params
+                },
+            }
+            for tpl in mwparserfromhell.parse(text).filter_templates()
+        ]
+        # Falsy stored params are normalised to an empty dict before comparing.
+        actual_list = [
+            {"name": got["name"], "params": dict(got["params"]) if got["params"] else {}}
+            for got in row["templates"]
+        ]
+        assert actual_list == expected, f"Row {idx}: templates mismatch"
+
+    print(f"Templates test passed! {len(test)} rows processed")
+
+
+def test_headings():
+    """Check the ``headings`` column emitted by ``--headings``.
+
+    Up to five rows are compared with a fresh mwparserfromhell parse.
+    """
+    import mwparserfromhell
+
+    tester = WikiqTester(
+        SAILORMOON, "headings", in_compression="7z", out_format="parquet"
+    )
+
+    try:
+        tester.call_wikiq("--headings", "--text", "--fandom-2020")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    test = pd.read_parquet(tester.output + f"/{SAILORMOON}.parquet")
+    assert "headings" in test.columns, "headings column should exist"
+
+    # Every cell must be either null or something sized (list/array).
+    column = test["headings"]
+    assert column.apply(lambda x: x is None or hasattr(x, '__len__')).all()
+
+    # Cross-check the first few non-empty rows on level and stripped title.
+    non_empty = column.apply(lambda x: x is not None and len(x) > 0)
+    for idx, row in test[non_empty].head(5).iterrows():
+        text = row["text"]
+        if not text:
+            continue
+        expected = [
+            {"level": h.level, "text": str(h.title).strip()}
+            for h in mwparserfromhell.parse(text).filter_headings()
+        ]
+        actual_list = [
+            {"level": got["level"], "text": got["text"]} for got in row["headings"]
+        ]
+        assert actual_list == expected, f"Row {idx}: headings mismatch"
+
+    print(f"Headings test passed! {len(test)} rows processed")