# mediawiki_dump_tools/test/test_resume.py
# Integration and unit tests for wikiq's --resume / checkpoint behavior.
import json
import os
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pytest
from pandas.testing import assert_frame_equal
from wikiq.resume import (
cleanup_interrupted_resume,
get_checkpoint_path,
get_resume_point,
merge_parquet_files,
)
from wikiq_test_utils import (
SAILORMOON,
TEST_DIR,
TEST_OUTPUT_DIR,
WIKIQ,
WikiqTester,
)
def test_resume():
    """Test that --resume properly resumes processing from the last written revid."""
    # Produce a complete, uninterrupted run to use as the reference output.
    baseline = WikiqTester(SAILORMOON, "resume_full", in_compression="7z", out_format="parquet")
    try:
        baseline.call_wikiq("--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    full_table = pq.read_table(os.path.join(baseline.output, f"{SAILORMOON}.parquet"))
    middle_idx = len(full_table) // 2
    resume_revid = full_table.column("revid")[middle_idx].as_py()
    print(f"Total revisions: {len(full_table)}, Resume point: {middle_idx}, Resume revid: {resume_revid}")

    # Seed a second output directory with only the first half of the rows,
    # then ask wikiq to pick up from there.
    restarted = WikiqTester(SAILORMOON, "resume_partial", in_compression="7z", out_format="parquet")
    partial_output_path = os.path.join(restarted.output, f"{SAILORMOON}.parquet")
    pq.write_table(full_table.slice(0, middle_idx + 1), partial_output_path)
    try:
        restarted.call_wikiq("--fandom-2020", "--resume")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # The resumed output must match the uninterrupted run row-for-row.
    resumed_df = pq.read_table(partial_output_path).to_pandas().sort_values("revid").reset_index(drop=True)
    full_df = full_table.to_pandas().sort_values("revid").reset_index(drop=True)
    assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
    print(f"Resume test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
def test_resume_with_diff():
    """Test that --resume works correctly with diff computation."""
    # Uninterrupted reference run with diff computation enabled.
    reference = WikiqTester(SAILORMOON, "resume_diff_full", in_compression="7z", out_format="parquet")
    try:
        reference.call_wikiq("--diff", "--fandom-2020")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    full_table = pq.read_table(os.path.join(reference.output, f"{SAILORMOON}.parquet"))
    resume_idx = len(full_table) // 3
    resume_revid = full_table.column("revid")[resume_idx].as_py()
    print(f"Total revisions: {len(full_table)}, Resume point: {resume_idx}, Resume revid: {resume_revid}")

    # Pre-populate the second run's output with the first third of the rows.
    restarted = WikiqTester(SAILORMOON, "resume_diff_partial", in_compression="7z", out_format="parquet")
    partial_output_path = os.path.join(restarted.output, f"{SAILORMOON}.parquet")
    pq.write_table(full_table.slice(0, resume_idx + 1), partial_output_path)
    try:
        restarted.call_wikiq("--diff", "--fandom-2020", "--resume")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Resuming with diffs must reproduce the reference output exactly.
    resumed_df = pq.read_table(partial_output_path).to_pandas().sort_values("revid").reset_index(drop=True)
    full_df = full_table.to_pandas().sort_values("revid").reset_index(drop=True)
    assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
    print(f"Resume with diff test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
def test_resume_with_partition_namespaces():
    """Test that --resume works correctly with --partition-namespaces.

    Interrupts wikiq partway through processing, then resumes and verifies
    the result matches an uninterrupted run.

    NOTE(review): the docstring previously claimed --flush-per-batch is used,
    but neither command passes that flag — confirm whether it is still needed
    for deterministic interruption.
    """
    full_dir = os.path.join(TEST_OUTPUT_DIR, "resume_full")
    partial_dir = os.path.join(TEST_OUTPUT_DIR, "resume_partial")
    input_file = os.path.join(TEST_DIR, "dumps", f"{SAILORMOON}.xml.7z")
    # Start from clean output directories so stale results cannot leak in.
    for output_dir in [full_dir, partial_dir]:
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)
    full_output = os.path.join(full_dir, f"{SAILORMOON}.parquet")
    partial_output = os.path.join(partial_dir, f"{SAILORMOON}.parquet")

    # Reference run: process the whole dump without interruption.
    cmd_full = f"{WIKIQ} {input_file} -o {full_output} --batch-size 10 --partition-namespaces"
    try:
        subprocess.check_output(cmd_full, stderr=subprocess.PIPE, shell=True)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))
    full_dataset = ds.dataset(full_output, format="parquet", partitioning="hive")
    full_df = full_dataset.to_table().to_pandas()
    total_rows = len(full_df)
    print(f"Full run produced {total_rows} rows")

    # Second run: same command, but interrupted partway through.
    batch_size = 10
    cmd_partial = [
        sys.executable, WIKIQ, input_file,
        "-o", partial_output,
        "--batch-size", str(batch_size),
        "--partition-namespaces"
    ]
    print(f"Starting: {' '.join(cmd_partial)}")
    proc = subprocess.Popen(cmd_partial, stderr=subprocess.PIPE)
    interrupt_delay = 5
    time.sleep(interrupt_delay)
    if proc.poll() is not None:
        pytest.fail(f"wikiq completed in {interrupt_delay}s before we could interrupt")
    print(f"Sending SIGUSR1 after {interrupt_delay}s")
    proc.send_signal(signal.SIGUSR1)
    # BUGFIX: use communicate() instead of wait(). With stderr=PIPE and nothing
    # reading the pipe, a chatty child can block on a full stderr buffer and
    # wait() would deadlock; communicate() drains the pipe while waiting
    # (see the warning in the subprocess docs).
    try:
        proc.communicate(timeout=5)
        print("Process exited gracefully after SIGUSR1")
    except subprocess.TimeoutExpired:
        print("Sending SIGTERM after SIGUSR1 timeout")
        proc.send_signal(signal.SIGTERM)
        # Re-calling communicate() after TimeoutExpired is the documented
        # pattern: it keeps draining stderr and waits for exit.
        proc.communicate(timeout=30)

    # The interrupted run must have written some, but not all, of the rows.
    interrupted_dataset = ds.dataset(partial_output, format="parquet", partitioning="hive")
    interrupted_rows = interrupted_dataset.count_rows()
    print(f"Interrupted run wrote {interrupted_rows} rows")
    assert interrupted_rows < total_rows, \
        f"Process wrote all {interrupted_rows} rows before being killed"

    # Resume and verify the final output matches the uninterrupted run.
    cmd_resume = f"{WIKIQ} {input_file} -o {partial_output} --batch-size {batch_size} --partition-namespaces --resume"
    try:
        subprocess.check_output(cmd_resume, stderr=subprocess.PIPE, shell=True)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))
    resumed_dataset = ds.dataset(partial_output, format="parquet", partitioning="hive")
    resumed_df = resumed_dataset.to_table().to_pandas()

    # Compare by revision id set first (gives a precise diagnostic), then by count.
    full_revids = set(full_df['revid'])
    resumed_revids = set(resumed_df['revid'])
    missing_revids = full_revids - resumed_revids
    extra_revids = resumed_revids - full_revids
    assert missing_revids == set() and extra_revids == set(), \
        f"Revision ID mismatch: {len(missing_revids)} missing, {len(extra_revids)} extra. Missing: {sorted(missing_revids)[:10]}"
    assert len(resumed_df) == len(full_df), \
        f"Row count mismatch: {len(resumed_df)} vs {len(full_df)}"
    print(f"Resume test passed! Full: {len(full_df)}, Interrupted: {interrupted_rows}, Resumed: {len(resumed_df)}")
def test_resume_file_not_found():
    """Test that --resume starts fresh when output file doesn't exist."""
    tester = WikiqTester(SAILORMOON, "resume_not_found", in_compression="7z", out_format="parquet")
    expected_output = os.path.join(tester.output, f"{SAILORMOON}.parquet")
    # Guarantee the "output missing" precondition.
    if os.path.exists(expected_output):
        os.remove(expected_output)
    # Should succeed by starting fresh. CONSISTENCY FIX: wrap the call like
    # every other test in this file so a non-zero exit reports wikiq's stderr
    # instead of raising an opaque CalledProcessError.
    try:
        tester.call_wikiq("--resume")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))
    # Verify output was created
    assert os.path.exists(expected_output), "Output file should be created when starting fresh"
    table = pq.read_table(expected_output)
    assert table.num_rows > 0, "Output should have data"
    print("Resume file not found test passed - started fresh!")
def test_resume_simple():
    """Test that --resume works without --fandom-2020 and --partition-namespaces."""
    # Reference run with no extra flags at all.
    reference = WikiqTester(SAILORMOON, "resume_simple_full", in_compression="7z", out_format="parquet")
    try:
        reference.call_wikiq()
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    full_table = pq.read_table(os.path.join(reference.output, f"{SAILORMOON}.parquet"))
    resume_idx = len(full_table) // 3
    resume_revid = full_table.column("revid")[resume_idx].as_py()
    print(f"Total revisions: {len(full_table)}, Resume point: {resume_idx}, Resume revid: {resume_revid}")

    # Pre-populate a fresh output directory with the first third of the rows.
    restarted = WikiqTester(SAILORMOON, "resume_simple_partial", in_compression="7z", out_format="parquet")
    partial_output_path = os.path.join(restarted.output, f"{SAILORMOON}.parquet")
    pq.write_table(full_table.slice(0, resume_idx + 1), partial_output_path)
    try:
        restarted.call_wikiq("--resume")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # The resumed output must be identical to the reference run.
    resumed_df = pq.read_table(partial_output_path).to_pandas().sort_values("revid").reset_index(drop=True)
    full_df = full_table.to_pandas().sort_values("revid").reset_index(drop=True)
    assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
    print(f"Resume simple test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
def test_resume_merge_with_invalid_temp_file():
    """Test that resume handles invalid/empty temp files gracefully.

    This can happen when a namespace has no records after the resume point,
    resulting in a temp file that was created but never written to.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        original_path = os.path.join(tmpdir, "original.parquet")
        temp_path = os.path.join(tmpdir, "temp.parquet")
        merged_path = os.path.join(tmpdir, "merged.parquet")
        # Valid original parquet next to a zero-byte (invalid) temp file.
        pq.write_table(pa.table({"articleid": [1, 2, 3], "revid": [10, 20, 30]}), original_path)
        with open(temp_path, 'w'):
            pass
        result = merge_parquet_files(original_path, temp_path, merged_path)
        assert result == "original_only", f"Expected 'original_only' when temp file is invalid, got {result}"
        # The merge must leave the valid original untouched and create nothing new.
        assert os.path.exists(original_path), "Original file should still exist"
        assert len(pq.read_table(original_path)) == 3, "Original file should be unchanged"
        assert not os.path.exists(merged_path), "Merged file should not be created"
        print("Resume merge with invalid temp file test passed!")
def test_resume_merge_with_corrupted_original():
    """Test that resume recovers from a corrupted original file if temp is valid.

    This can happen if the original file was being written when the process
    was killed, leaving it in a corrupted state.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        original_path = os.path.join(tmpdir, "original.parquet")
        temp_path = os.path.join(tmpdir, "temp.parquet")
        merged_path = os.path.join(tmpdir, "merged.parquet")
        # Non-parquet bytes where the original should be; valid temp alongside.
        with open(original_path, 'w') as f:
            f.write("corrupted data")
        pq.write_table(pa.table({"articleid": [4, 5, 6], "revid": [40, 50, 60]}), temp_path)
        result = merge_parquet_files(original_path, temp_path, merged_path)
        assert result == "temp_only", f"Expected 'temp_only' when original is corrupted, got {result}"
        assert not os.path.exists(merged_path), "Merged file should not be created for temp_only case"
        print("Resume merge with corrupted original test passed!")
def test_resume_merge_both_invalid():
    """Test that resume handles both files being invalid."""
    with tempfile.TemporaryDirectory() as tmpdir:
        original_path = os.path.join(tmpdir, "original.parquet")
        temp_path = os.path.join(tmpdir, "temp.parquet")
        merged_path = os.path.join(tmpdir, "merged.parquet")
        # Neither file contains readable parquet data.
        for path, junk in ((original_path, "corrupted original"), (temp_path, "corrupted temp")):
            with open(path, 'w') as f:
                f.write(junk)
        result = merge_parquet_files(original_path, temp_path, merged_path)
        assert result == "both_invalid", f"Expected 'both_invalid' when both files corrupted, got {result}"
        print("Resume merge with both invalid test passed!")
def test_cleanup_interrupted_resume_both_corrupted():
    """Test that cleanup_interrupted_resume returns 'start_fresh' when both files are corrupted."""
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "output.parquet")
        temp_file = output_file + ".resume_temp"
        checkpoint_path = get_checkpoint_path(output_file, partition_namespaces=False)
        # Corrupt both parquet files and leave a checkpoint behind.
        with open(output_file, 'w') as f:
            f.write("corrupted original")
        with open(temp_file, 'w') as f:
            f.write("corrupted temp")
        with open(checkpoint_path, 'w') as f:
            json.dump({"pageid": 100, "revid": 200}, f)
        result = cleanup_interrupted_resume(output_file, partition_namespaces=False)
        assert result == "start_fresh", f"Expected 'start_fresh', got {result}"
        # Cleanup should remove every trace of the broken attempt.
        for path, msg in (
            (output_file, "Corrupted original should be deleted"),
            (temp_file, "Corrupted temp should be deleted"),
            (checkpoint_path, "Stale checkpoint should be deleted"),
        ):
            assert not os.path.exists(path), msg
        print("Cleanup interrupted resume with both corrupted test passed!")
def test_cleanup_interrupted_resume_original_corrupted_temp_valid():
    """Test that cleanup recovers from temp when original is corrupted."""
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "output.parquet")
        temp_file = output_file + ".resume_temp"
        # Unreadable original, fully valid temp file.
        with open(output_file, 'w') as f:
            f.write("corrupted original")
        pq.write_table(pa.table({"articleid": [10, 20, 30], "revid": [100, 200, 300]}), temp_file)
        result = cleanup_interrupted_resume(output_file, partition_namespaces=False)
        assert result is None, f"Expected None (normal recovery), got {result}"
        # The temp file should have been promoted to the output path.
        assert os.path.exists(output_file), "Output file should exist after recovery"
        assert not os.path.exists(temp_file), "Temp file should be renamed to output"
        assert len(pq.read_table(output_file)) == 3, "Recovered file should have 3 rows"
        # And the recovered file should yield a usable resume point.
        resume_point = get_resume_point(output_file, partition_namespaces=False)
        assert resume_point is not None, "Should find resume point from recovered file"
        assert resume_point == (30, 300, 0), f"Expected (30, 300, 0), got {resume_point}"
        print("Cleanup with original corrupted, temp valid test passed!")
def test_cleanup_original_missing_temp_valid_no_checkpoint():
    """Test recovery when original is missing, temp is valid, and no checkpoint exists."""
    with tempfile.TemporaryDirectory() as tmpdir:
        output_file = os.path.join(tmpdir, "output.parquet")
        temp_file = output_file + ".resume_temp"
        checkpoint_path = get_checkpoint_path(output_file, partition_namespaces=False)
        # Only the temp file exists: no output file, no checkpoint.
        assert not os.path.exists(output_file)
        pq.write_table(pa.table({"articleid": [10, 20, 30], "revid": [100, 200, 300]}), temp_file)
        assert not os.path.exists(checkpoint_path)
        result = cleanup_interrupted_resume(output_file, partition_namespaces=False)
        assert result is None, f"Expected None (normal recovery), got {result}"
        # The temp file should have been promoted to the output path.
        assert os.path.exists(output_file), "Output file should exist after recovery"
        assert not os.path.exists(temp_file), "Temp file should be renamed to output"
        resume_point = get_resume_point(output_file, partition_namespaces=False)
        assert resume_point is not None, "Should find resume point from recovered file"
        assert resume_point == (30, 300, 0), f"Expected (30, 300, 0), got {resume_point}"
        print("Original missing, temp valid, no checkpoint test passed!")
def test_concurrent_jobs_different_input_files():
    """Test that merge only processes temp files for the current input file.

    When multiple wikiq processes write to the same partitioned output directory
    with different input files, each process should only merge its own temp files.
    """
    from wikiq.resume import merge_partitioned_namespaces
    with tempfile.TemporaryDirectory() as tmpdir:
        # Two hive-style namespace partitions under one output root.
        ns0_dir = os.path.join(tmpdir, "namespace=0")
        ns1_dir = os.path.join(tmpdir, "namespace=1")
        os.makedirs(ns0_dir)
        os.makedirs(ns1_dir)
        # Two distinct input files, as two concurrent jobs would produce.
        file1 = "enwiki-20250123-pages-meta-history24-p1p100.parquet"
        file2 = "enwiki-20250123-pages-meta-history24-p101p200.parquet"
        table1_orig = pa.table({"articleid": [1, 2], "revid": [10, 20]})
        table1_temp = pa.table({"articleid": [3, 4], "revid": [30, 40]})
        table2_orig = pa.table({"articleid": [100, 200], "revid": [1000, 2000]})
        table2_temp = pa.table({"articleid": [300, 400], "revid": [3000, 4000]})
        # Lay down original + temp pairs for both files in both partitions.
        for ns_dir in (ns0_dir, ns1_dir):
            pq.write_table(table1_orig, os.path.join(ns_dir, file1))
            pq.write_table(table1_temp, os.path.join(ns_dir, file1 + ".resume_temp"))
            pq.write_table(table2_orig, os.path.join(ns_dir, file2))
            pq.write_table(table2_temp, os.path.join(ns_dir, file2 + ".resume_temp"))
        # Merge only file1's temp files.
        merge_partitioned_namespaces(tmpdir, ".resume_temp", file1)
        # file1's temp files are consumed by the merge...
        assert not os.path.exists(os.path.join(ns0_dir, file1 + ".resume_temp")), \
            "file1 temp should be merged in ns0"
        assert not os.path.exists(os.path.join(ns1_dir, file1 + ".resume_temp")), \
            "file1 temp should be merged in ns1"
        # ...and its originals now contain the combined rows.
        merged1_ns0 = pq.read_table(os.path.join(ns0_dir, file1))
        merged1_ns1 = pq.read_table(os.path.join(ns1_dir, file1))
        assert merged1_ns0.num_rows == 4, f"file1 ns0 should have 4 rows after merge, got {merged1_ns0.num_rows}"
        assert merged1_ns1.num_rows == 4, f"file1 ns1 should have 4 rows after merge, got {merged1_ns1.num_rows}"
        # file2 (the "other job") must be completely untouched: temps remain...
        assert os.path.exists(os.path.join(ns0_dir, file2 + ".resume_temp")), \
            "file2 temp should NOT be touched in ns0"
        assert os.path.exists(os.path.join(ns1_dir, file2 + ".resume_temp")), \
            "file2 temp should NOT be touched in ns1"
        # ...and its originals are unchanged.
        assert pq.read_table(os.path.join(ns0_dir, file2)).num_rows == 2, "file2 ns0 should still have 2 rows"
        assert pq.read_table(os.path.join(ns1_dir, file2)).num_rows == 2, "file2 ns1 should still have 2 rows"
        print("Concurrent jobs with different input files test passed!")
def test_max_revisions_per_file_creates_parts():
    """Test that --max-revisions-per-file creates multiple part files."""
    import re
    tester = WikiqTester(SAILORMOON, "max_revs_parts", in_compression="7z", out_format="parquet")
    max_revs = 50
    try:
        # Use a very small limit to force multiple parts
        tester.call_wikiq("--fandom-2020", "--max-revisions-per-file", str(max_revs))
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))
    output_dir = tester.output

    # Extract the numeric part index so files sort numerically, not lexically.
    def part_index(name):
        m = re.search(r'\.part(\d+)\.parquet$', name)
        return int(m.group(1)) if m else 0

    part_files = sorted(
        (f for f in os.listdir(output_dir) if f.endswith(".parquet") and ".part" in f),
        key=part_index,
    )
    assert len(part_files) > 1, f"Expected multiple part files, got {part_files}"
    total_rows = sum(len(pq.read_table(os.path.join(output_dir, f))) for f in part_files)
    assert total_rows > 0, "Should have some rows across all parts"
    # Every part except the final one should have reached the rotation threshold
    # (rotation happens after the batch that hits the limit is written).
    for f in part_files[:-1]:
        table = pq.read_table(os.path.join(output_dir, f))
        assert len(table) >= max_revs, f"Part file {f} should have at least {max_revs} rows, got {len(table)}"
    print(f"max-revisions-per-file test passed! Created {len(part_files)} parts with {total_rows} total rows")
def test_max_revisions_per_file_with_partitioned():
    """Test that --max-revisions-per-file works with partitioned namespace output."""
    import re
    tester = WikiqTester(SAILORMOON, "max_revs_partitioned", in_compression="7z", out_format="parquet")
    max_revs = 20
    try:
        # Use a small limit to force parts, with partitioned output
        tester.call_wikiq("--fandom-2020", "--partition-namespaces", "--max-revisions-per-file", str(max_revs))
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))
    output_dir = tester.output
    # Find hive-style namespace directories (namespace=<id>)
    ns_dirs = [d for d in os.listdir(output_dir) if d.startswith("namespace=")]
    assert len(ns_dirs) > 0, "Should have namespace directories"

    # Numeric part index for correct (non-lexicographic) ordering.
    def get_part_num(filename):
        match = re.search(r'\.part(\d+)\.parquet$', filename)
        return int(match.group(1)) if match else 0

    # Check that at least one namespace has multiple parts
    found_multi_part = False
    for ns_dir in ns_dirs:
        ns_path = os.path.join(output_dir, ns_dir)
        parquet_files = [f for f in os.listdir(ns_path) if f.endswith(".parquet")]
        part_files = [f for f in parquet_files if ".part" in f]
        if len(part_files) > 1:
            found_multi_part = True
            # Sort by part number and verify each part (except last) has at least limit rows
            sorted_parts = sorted(part_files, key=get_part_num)
            for f in sorted_parts[:-1]:
                # Read only parquet metadata — cheaper than loading the table.
                pf = pq.ParquetFile(os.path.join(ns_path, f))
                num_rows = pf.metadata.num_rows
                assert num_rows >= max_revs, f"Part file {f} in {ns_dir} should have at least {max_revs} rows, got {num_rows}"
    assert found_multi_part, "At least one namespace should have multiple part files"
    # FIX: removed extraneous f-string prefix from a placeholder-free literal (ruff F541).
    print("max-revisions-per-file with partitioned output test passed!")