Add per-namespace resume support for partitioned parquet output.
- Implement per-namespace resume points (a dict mapping namespace -> (pageid, revid)) so that interleaved dump ordering is handled correctly in partitioned output
- Extract resume functionality into a dedicated resume.py module
- Add graceful shutdown handling via a shutdown_requested flag (CLI-level only)
- Create the ParquetWriter lazily to avoid empty files on early exit
- Refactor the writing logic into a _write_batch() helper method
- Simplify control flow by replacing continue statements with a should_write flag
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tracemalloc
|
||||
from io import StringIO
|
||||
from typing import Final, Union
|
||||
@@ -539,97 +540,108 @@ def test_resume_with_diff():
|
||||
print(f"Resume with diff test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
|
||||
|
||||
def test_resume_with_partition_namespaces():
    """Test that --resume works correctly with --partition-namespaces.

    Runs wikiq once to completion to obtain a baseline, then starts a second
    run that is interrupted partway through (SIGUSR1 first, SIGTERM if the
    process is still alive afterwards), resumes that run with --resume, and
    verifies the resumed output holds exactly the same revisions as the
    uninterrupted run.

    NOTE(review): this test was previously described as relying on
    --flush-per-batch, but no invocation below passes that flag; only a small
    --batch-size is used. Confirm whether the flag should be added.
    """
    import signal
    import time
    import pyarrow.dataset as ds

    # Use separate subdirectories for full and partial runs to isolate them
    full_dir = os.path.join(TEST_OUTPUT_DIR, "resume_full")
    partial_dir = os.path.join(TEST_OUTPUT_DIR, "resume_partial")
    input_file = os.path.join(TEST_DIR, "dumps", f"{SAILORMOON}.xml.7z")

    # Clean up any existing output directories from previous runs
    for output_dir in [full_dir, partial_dir]:
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir)

    # Paths within each isolated directory
    full_output = os.path.join(full_dir, f"{SAILORMOON}.parquet")
    partial_output = os.path.join(partial_dir, f"{SAILORMOON}.parquet")

    batch_size = 10

    def wikiq_cmd(output, *extra):
        # Argument list (no shell) so paths need no quoting and signals are
        # delivered to the Python process itself rather than to a shell.
        return [
            sys.executable, WIKIQ, input_file,
            "-o", output,
            "--batch-size", str(batch_size),
            "--partition-namespaces",
            *extra,
        ]

    # Run wikiq fully to get baseline output
    try:
        subprocess.check_output(wikiq_cmd(full_output), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Read full output
    full_dataset = ds.dataset(full_output, format="parquet", partitioning="hive")
    full_df = full_dataset.to_table().to_pandas()
    total_rows = len(full_df)
    print(f"Full run produced {total_rows} rows")

    # Start wikiq for the interrupted run
    cmd_partial = wikiq_cmd(partial_output)
    print(f"Starting: {' '.join(cmd_partial)}")

    # stderr is discarded rather than piped: nothing reads it here, and an
    # undrained pipe could block the child once the OS pipe buffer fills.
    proc = subprocess.Popen(cmd_partial, stderr=subprocess.DEVNULL)
    try:
        # Wait a short time to allow some processing
        interrupt_delay = 5  # seconds - enough for some pages but not all
        time.sleep(interrupt_delay)

        if proc.poll() is not None:
            pytest.fail(f"wikiq completed in {interrupt_delay}s before we could interrupt")

        # Simulate SLURM job termination: send SIGUSR1 first (early warning),
        # then wait for graceful shutdown, then SIGTERM if still running
        print(f"Sending SIGUSR1 after {interrupt_delay}s")
        proc.send_signal(signal.SIGUSR1)

        # Wait for graceful shutdown
        try:
            proc.wait(timeout=5)
            print("Process exited gracefully after SIGUSR1")
        except subprocess.TimeoutExpired:
            # Process didn't exit, send SIGTERM
            print("Sending SIGTERM after SIGUSR1 timeout")
            proc.send_signal(signal.SIGTERM)
            proc.wait(timeout=30)
    finally:
        # Never leave the child running (and still writing to partial_output)
        # if the test fails or the SIGTERM wait times out.
        if proc.poll() is None:
            proc.kill()
            proc.wait()

    # Read interrupted output
    interrupted_dataset = ds.dataset(partial_output, format="parquet", partitioning="hive")
    interrupted_rows = interrupted_dataset.count_rows()
    print(f"Interrupted run wrote {interrupted_rows} rows")

    assert interrupted_rows < total_rows, \
        f"Process wrote all {interrupted_rows} rows before being killed"

    # Resume
    try:
        subprocess.check_output(wikiq_cmd(partial_output, "--resume"), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    # Read resumed output
    resumed_dataset = ds.dataset(partial_output, format="parquet", partitioning="hive")
    resumed_df = resumed_dataset.to_table().to_pandas()

    # Check revid sets match (the important invariant)
    full_revids = set(full_df['revid'])
    resumed_revids = set(resumed_df['revid'])
    missing_revids = full_revids - resumed_revids
    extra_revids = resumed_revids - full_revids
    assert missing_revids == set() and extra_revids == set(), \
        f"Revision ID mismatch: {len(missing_revids)} missing, {len(extra_revids)} extra. Missing: {sorted(missing_revids)[:10]}"
    assert len(resumed_df) == len(full_df), \
        f"Row count mismatch: {len(resumed_df)} vs {len(full_df)}"

    print(f"Resume test passed! Full: {len(full_df)}, Interrupted: {interrupted_rows}, Resumed: {len(resumed_df)}")
|
||||
|
||||
|
||||
def test_external_links_only():
|
||||
@@ -963,3 +975,71 @@ def test_headings():
|
||||
assert actual_list == expected, f"Row {idx}: headings mismatch"
|
||||
|
||||
print(f"Headings test passed! {len(test)} rows processed")
|
||||
|
||||
|
||||
def test_resume_file_not_found():
    """Check that --resume aborts with an error when there is no prior output.

    Any stale parquet file at the expected location is removed first; wikiq is
    then invoked with --resume and must fail with a message on stderr that
    names the missing output file.
    """
    tester = WikiqTester(SAILORMOON, "resume_not_found", in_compression="7z", out_format="parquet")

    # Guarantee there is nothing to resume from.
    missing_path = os.path.join(tester.output, f"{SAILORMOON}.parquet")
    if os.path.exists(missing_path):
        os.remove(missing_path)

    try:
        tester.call_wikiq("--resume")
    except subprocess.CalledProcessError as exc:
        stderr = exc.stderr.decode("utf8")
        assert "Error: --resume specified but output file not found" in stderr, \
            f"Expected error message about missing output file, got: {stderr}"
    else:
        # wikiq exited successfully even though there was nothing to resume.
        pytest.fail("Expected error when --resume is used but output file doesn't exist")

    print("Resume file not found test passed!")
|
||||
|
||||
|
||||
def test_resume_simple():
    """Test that --resume works without --fandom-2020 and --partition-namespaces.

    A complete run provides the baseline table. Roughly its first third is
    written out as a fake partial output, wikiq is resumed from that file, and
    the resumed result must equal the baseline once both are sorted by revid.
    """
    import pyarrow.parquet as pq

    parquet_name = f"{SAILORMOON}.parquet"

    def sorted_frame(table):
        # Normalise row order so the comparison does not depend on write order.
        return table.to_pandas().sort_values("revid").reset_index(drop=True)

    # Baseline: plain run (no fandom-2020, no partition-namespaces).
    tester_full = WikiqTester(SAILORMOON, "resume_simple_full", in_compression="7z", out_format="parquet")
    try:
        tester_full.call_wikiq()
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    full_table = pq.read_table(os.path.join(tester_full.output, parquet_name))

    # Resume point: the revision about one third of the way through.
    resume_idx = len(full_table) // 3
    resume_revid = full_table.column("revid")[resume_idx].as_py()
    print(f"Total revisions: {len(full_table)}, Resume point: {resume_idx}, Resume revid: {resume_revid}")

    # Fake an interrupted run by writing only the rows up to the resume point.
    tester_partial = WikiqTester(SAILORMOON, "resume_simple_partial", in_compression="7z", out_format="parquet")
    partial_output_path = os.path.join(tester_partial.output, parquet_name)
    pq.write_table(full_table.slice(0, resume_idx + 1), partial_output_path)

    # Resume from the partial output.
    try:
        tester_partial.call_wikiq("--resume")
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

    resumed_df = sorted_frame(pq.read_table(partial_output_path))
    full_df = sorted_frame(full_table)

    assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)

    print(f"Resume simple test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
|
||||
|
||||
Reference in New Issue
Block a user