Start working on adding columns from mwparserfromhell.
This commit is contained in:
@@ -630,3 +630,93 @@ def test_resume_with_partition_namespaces():
|
||||
assert resumed_revids_sorted == full_revids_sorted, f"Resumed revids mismatch: {len(resumed_revids_sorted)} vs {len(full_revids_sorted)}"
|
||||
|
||||
print(f"Resume with partition-namespaces test passed! Original: {len(full_revids_sorted)} revisions, Resumed: {len(resumed_revids_sorted)} revisions")
|
||||
|
||||
|
||||
def test_external_links_only():
    """Run wikiq with --external-links alone and check the output columns.

    The resulting parquet file must contain an ``external_links`` column,
    must not contain a ``citations`` column, and every ``external_links``
    value must be either None or an object that exposes ``__len__``.
    """
    harness = WikiqTester(
        SAILORMOON,
        "external_links_only",
        in_compression="7z",
        out_format="parquet",
    )

    try:
        harness.call_wikiq("--external-links", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Report the failed subprocess's stderr as the test failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    parquet_path = harness.output + f"/{SAILORMOON}.parquet"
    test = pd.read_parquet(parquet_path)
    columns = test.columns

    def _is_sized_or_none(value):
        # pandas reads parquet lists as numpy arrays, so check for a length
        # rather than for an exact container type.
        return value is None or hasattr(value, '__len__')

    # The requested column is present ...
    assert "external_links" in columns, "external_links column should exist"

    # ... and the column that was not requested is absent.
    assert "citations" not in columns, "citations column should NOT exist when only --external-links is used"

    all_sized = test["external_links"].apply(_is_sized_or_none).all()
    assert all_sized, "external_links should be a list/array type or None"

    print(f"External links only test passed! {len(test)} rows processed")
|
||||
|
||||
|
||||
def test_citations_only():
    """Run wikiq with --citations alone and check the output columns.

    The resulting parquet file must contain a ``citations`` column, must not
    contain an ``external_links`` column, and every ``citations`` value must
    be either None or an object that exposes ``__len__``.
    """
    harness = WikiqTester(
        SAILORMOON,
        "citations_only",
        in_compression="7z",
        out_format="parquet",
    )

    try:
        harness.call_wikiq("--citations", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Report the failed subprocess's stderr as the test failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    parquet_path = harness.output + f"/{SAILORMOON}.parquet"
    test = pd.read_parquet(parquet_path)
    columns = test.columns

    def _is_sized_or_none(value):
        # pandas reads parquet lists as numpy arrays, so check for a length
        # rather than for an exact container type.
        return value is None or hasattr(value, '__len__')

    # The requested column is present ...
    assert "citations" in columns, "citations column should exist"

    # ... and the column that was not requested is absent.
    assert "external_links" not in columns, "external_links column should NOT exist when only --citations is used"

    all_sized = test["citations"].apply(_is_sized_or_none).all()
    assert all_sized, "citations should be a list/array type or None"

    print(f"Citations only test passed! {len(test)} rows processed")
|
||||
|
||||
|
||||
def test_external_links_and_citations():
    """Run wikiq with both --external-links and --citations together.

    The resulting parquet file must contain both the ``external_links`` and
    ``citations`` columns, and every value in each of them must be either
    None or an object that exposes ``__len__``.
    """
    harness = WikiqTester(
        SAILORMOON,
        "external_links_and_citations",
        in_compression="7z",
        out_format="parquet",
    )

    try:
        harness.call_wikiq("--external-links", "--citations", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Report the failed subprocess's stderr as the test failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    parquet_path = harness.output + f"/{SAILORMOON}.parquet"
    test = pd.read_parquet(parquet_path)

    def _is_sized_or_none(value):
        # pandas reads parquet lists as numpy arrays, so check for a length
        # rather than for an exact container type.
        return value is None or hasattr(value, '__len__')

    # Both requested columns must be present.
    presence_checks = (
        ("external_links", "external_links column should exist"),
        ("citations", "citations column should exist"),
    )
    for column, message in presence_checks:
        assert column in test.columns, message

    # Both columns must hold list-like values (or None) in every row.
    type_checks = (
        ("external_links", "external_links should be a list/array type or None"),
        ("citations", "citations should be a list/array type or None"),
    )
    for column, message in type_checks:
        assert test[column].apply(_is_sized_or_none).all(), message

    print(f"External links and citations test passed! {len(test)} rows processed")
|
||||
|
||||
|
||||
def test_no_wikitext_columns():
    """Run wikiq without --external-links or --citations.

    The resulting parquet file must contain neither an ``external_links``
    column nor a ``citations`` column.
    """
    harness = WikiqTester(
        SAILORMOON,
        "no_wikitext_columns",
        in_compression="7z",
        out_format="parquet",
    )

    try:
        harness.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as exc:
        # Report the failed subprocess's stderr as the test failure message.
        pytest.fail(exc.stderr.decode("utf8"))

    parquet_path = harness.output + f"/{SAILORMOON}.parquet"
    test = pd.read_parquet(parquet_path)

    # Neither optional column may appear when its flag was not passed.
    absence_checks = (
        ("external_links", "external_links column should NOT exist without --external-links flag"),
        ("citations", "citations column should NOT exist without --citations flag"),
    )
    for column, message in absence_checks:
        assert column not in test.columns, message

    print(f"No wikitext columns test passed! {len(test)} rows processed")
|
||||
|
||||
Reference in New Issue
Block a user