Add per-namespace resume support for partitioned parquet output.

- Implement per-namespace resume points (dict mapping namespace -> (pageid, revid)) to correctly handle interleaved dump ordering in partitioned output
- Extract resume functionality to a dedicated resume.py module
- Add graceful shutdown handling via a shutdown_requested flag (CLI-level only)
- Use lazy ParquetWriter creation to avoid empty files on early exit
- Refactor writing logic into a _write_batch() helper method
- Simplify control flow by replacing continue statements with a should_write flag
2025-12-06 06:56:19 -08:00
parent d69d8b0df2
commit 577ddc87f5
3 changed files with 632 additions and 325 deletions
--- a/test/Wikiq_Unit_Test.py
+++ b/test/Wikiq_Unit_Test.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 import subprocess
+import sys
 import tracemalloc
 from io import StringIO
 from typing import Final, Union
@@ -539,97 +540,108 @@ def test_resume_with_diff():
    print(f"Resume with diff test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")

 def test_resume_with_partition_namespaces():
-    """Test that --resume works correctly with --partition-namespaces."""
-    import pyarrow.parquet as pq
+    """Test that --resume works correctly with --partition-namespaces.

-    # First, create a complete baseline output with partition-namespaces
-    tester_full = WikiqTester(SAILORMOON, "resume_partition_full", in_compression="7z", out_format="parquet")
+    Interrupts wikiq partway through processing, then resumes and verifies
+    the result matches an uninterrupted run. Uses a small --batch-size (10);
+    the interrupt is sent after a fixed delay, so the cut-off point is timing-dependent.
+    """
+    import signal
+    import time
+    import pyarrow.dataset as ds

+    # Use separate subdirectories for full and partial runs to isolate them
+    full_dir = os.path.join(TEST_OUTPUT_DIR, "resume_full")
+    partial_dir = os.path.join(TEST_OUTPUT_DIR, "resume_partial")
+    input_file = os.path.join(TEST_DIR, "dumps", f"{SAILORMOON}.xml.7z")
+
+    # Clean up any existing output directories from previous runs
+    for output_dir in [full_dir, partial_dir]:
+        if os.path.exists(output_dir):
+            shutil.rmtree(output_dir)
+        os.makedirs(output_dir)
+
+    # Paths within each isolated directory
+    full_output = os.path.join(full_dir, f"{SAILORMOON}.parquet")
+    partial_output = os.path.join(partial_dir, f"{SAILORMOON}.parquet")
+
+    # Run wikiq fully to get baseline output
+    cmd_full = f"{WIKIQ} {input_file} -o {full_output} --batch-size 10 --partition-namespaces"
    try:
-        tester_full.call_wikiq("--partition-namespaces", "--fandom-2020")
+        subprocess.check_output(cmd_full, stderr=subprocess.PIPE, shell=True)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

-    # Read the full output from the partitioned directory
-    full_output_dir = tester_full.output
-    namespace_dirs = [d for d in os.listdir(full_output_dir) if d.startswith('namespace=')]
+    # Read full output
+    full_dataset = ds.dataset(full_output, format="parquet", partitioning="hive")
+    full_df = full_dataset.to_table().to_pandas()
+    total_rows = len(full_df)
+    print(f"Full run produced {total_rows} rows")

-    if not namespace_dirs:
-        pytest.fail("No namespace directories found in output")
+    # Start wikiq for the interrupted run (use list args so SIGTERM goes to Python)
+    batch_size = 10
+    cmd_partial = [
+        sys.executable, WIKIQ, input_file,
+        "-o", partial_output,
+        "--batch-size", str(batch_size),
+        "--partition-namespaces"
+    ]
+    print(f"Starting: {' '.join(cmd_partial)}")

-    # Collect all revisions from all namespaces
-    full_revids = []
-    for ns_dir in sorted(namespace_dirs):
-        parquet_files = [f for f in os.listdir(os.path.join(full_output_dir, ns_dir)) if f.endswith('.parquet')]
-        if parquet_files:
-            ns_parquet_path = os.path.join(full_output_dir, ns_dir, parquet_files[0])
-            pf = pq.ParquetFile(ns_parquet_path)
-            table = pf.read(columns=['revid'])
-            revids = table.column('revid').to_pylist()
-            full_revids.extend(revids)
+    proc = subprocess.Popen(cmd_partial, stderr=subprocess.PIPE)

-    full_revids_sorted = sorted(set(full_revids))
-    total_revisions = len(full_revids_sorted)
+    # Wait a short time to allow some processing
+    interrupt_delay = 5  # seconds - enough for some pages but not all
+    time.sleep(interrupt_delay)

-    # Get a revid about 1/3 through to use as the resume point
-    resume_idx = total_revisions // 3
-    resume_revid = full_revids_sorted[resume_idx]
+    if proc.poll() is not None:
+        pytest.fail(f"wikiq completed in {interrupt_delay}s before we could interrupt")

-    print(f"Total revisions: {total_revisions}, Resume point: {resume_idx}, Resume revid: {resume_revid}")
+    # Simulate SLURM job termination: send SIGUSR1 first (early warning),
+    # then wait for graceful shutdown, then SIGTERM if still running
+    print(f"Sending SIGUSR1 after {interrupt_delay}s")
+    proc.send_signal(signal.SIGUSR1)

-    # Create a partial output by manually creating the partitioned structure
-    tester_partial = WikiqTester(SAILORMOON, "resume_partition_partial", in_compression="7z", out_format="parquet")
-    partial_output_dir = tester_partial.output
-
-    # Copy the full partitioned output to the partial directory
-    for ns_dir in namespace_dirs:
-        src_ns_path = os.path.join(full_output_dir, ns_dir)
-        dst_ns_path = os.path.join(partial_output_dir, ns_dir)
-        shutil.copytree(src_ns_path, dst_ns_path)
-
-    # Now filter each namespace file to only include revisions up to resume_idx
-    revised_data_count = 0
-    for ns_dir in namespace_dirs:
-        parquet_files = [f for f in os.listdir(os.path.join(partial_output_dir, ns_dir)) if f.endswith('.parquet')]
-        if parquet_files:
-            ns_parquet_path = os.path.join(partial_output_dir, ns_dir, parquet_files[0])
-            pf = pq.ParquetFile(ns_parquet_path)
-            table = pf.read()
-
-            # Filter to only rows up to the resume point
-            revids = table.column('revid').to_pylist()
-            mask = pa.array([revid <= resume_revid for revid in revids], type=pa.bool_())
-            partial_table = table.filter(mask)
-            revised_data_count += len(partial_table)
-
-            # Write back the filtered data
-            pq.write_table(partial_table, ns_parquet_path)
-
-    print(f"Created partial output with {revised_data_count} revisions (up to revid {resume_revid})")
-
-    # Now resume from the partial output
+    # Wait for graceful shutdown
    try:
-        tester_partial.call_wikiq("--partition-namespaces", "--fandom-2020", "--resume")
+        proc.wait(timeout=5)
+        print("Process exited gracefully after SIGUSR1")
+    except subprocess.TimeoutExpired:
+        # Process didn't exit, send SIGTERM
+        print("Sending SIGTERM after SIGUSR1 timeout")
+        proc.send_signal(signal.SIGTERM)
+        proc.wait(timeout=30)
+
+    # Read interrupted output
+    interrupted_dataset = ds.dataset(partial_output, format="parquet", partitioning="hive")
+    interrupted_rows = interrupted_dataset.count_rows()
+    print(f"Interrupted run wrote {interrupted_rows} rows")
+
+    assert interrupted_rows < total_rows, \
+        f"Process wrote all {interrupted_rows} rows before being killed"
+
+    # Resume
+    cmd_resume = f"{WIKIQ} {input_file} -o {partial_output} --batch-size {batch_size} --partition-namespaces --resume"
+    try:
+        subprocess.check_output(cmd_resume, stderr=subprocess.PIPE, shell=True)
    except subprocess.CalledProcessError as exc:
        pytest.fail(exc.stderr.decode("utf8"))

-    # Read the resumed output and collect revids
-    resumed_revids = []
-    for ns_dir in namespace_dirs:
-        parquet_files = [f for f in os.listdir(os.path.join(partial_output_dir, ns_dir)) if f.endswith('.parquet')]
-        if parquet_files:
-            ns_parquet_path = os.path.join(partial_output_dir, ns_dir, parquet_files[0])
-            pf = pq.ParquetFile(ns_parquet_path)
-            table = pf.read(columns=['revid'])
-            revids = table.column('revid').to_pylist()
-            resumed_revids.extend(revids)
+    # Read resumed output
+    resumed_dataset = ds.dataset(partial_output, format="parquet", partitioning="hive")
+    resumed_df = resumed_dataset.to_table().to_pandas()

-    resumed_revids_sorted = sorted(set(resumed_revids))
+    # Check revid sets match (the important invariant)
+    full_revids = set(full_df['revid'])
+    resumed_revids = set(resumed_df['revid'])
+    missing_revids = full_revids - resumed_revids
+    extra_revids = resumed_revids - full_revids
+    assert missing_revids == set() and extra_revids == set(), \
+        f"Revision ID mismatch: {len(missing_revids)} missing, {len(extra_revids)} extra. Missing: {sorted(missing_revids)[:10]}"
+    assert len(resumed_df) == len(full_df), \
+        f"Row count mismatch: {len(resumed_df)} vs {len(full_df)}"

-    # Compare the revids
-    assert resumed_revids_sorted == full_revids_sorted, f"Resumed revids mismatch: {len(resumed_revids_sorted)} vs {len(full_revids_sorted)}"
-
-    print(f"Resume with partition-namespaces test passed! Original: {len(full_revids_sorted)} revisions, Resumed: {len(resumed_revids_sorted)} revisions")
+    print(f"Resume test passed! Full: {len(full_df)}, Interrupted: {interrupted_rows}, Resumed: {len(resumed_df)}")


 def test_external_links_only():
@@ -963,3 +975,71 @@ def test_headings():
                assert actual_list == expected, f"Row {idx}: headings mismatch"

    print(f"Headings test passed! {len(test)} rows processed")
+
+
+def test_resume_file_not_found():
+    """Test that --resume exits with error when output file doesn't exist."""
+    tester = WikiqTester(SAILORMOON, "resume_not_found", in_compression="7z", out_format="parquet")
+
+    # Ensure the output file does not exist
+    expected_output = os.path.join(tester.output, f"{SAILORMOON}.parquet")
+    if os.path.exists(expected_output):
+        os.remove(expected_output)
+
+    try:
+        tester.call_wikiq("--resume")
+        pytest.fail("Expected error when --resume is used but output file doesn't exist")
+    except subprocess.CalledProcessError as exc:
+        stderr = exc.stderr.decode("utf8")
+        assert "Error: --resume specified but output file not found" in stderr, \
+            f"Expected error message about missing output file, got: {stderr}"
+
+    print("Resume file not found test passed!")
+
+
+def test_resume_simple():
+    """Test that --resume works without --fandom-2020 and --partition-namespaces."""
+    import pyarrow.parquet as pq
+
+    # First, create a complete baseline output (no fandom-2020, no partition-namespaces)
+    tester_full = WikiqTester(SAILORMOON, "resume_simple_full", in_compression="7z", out_format="parquet")
+
+    try:
+        tester_full.call_wikiq()
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    # Read the full output
+    full_output_path = os.path.join(tester_full.output, f"{SAILORMOON}.parquet")
+    full_table = pq.read_table(full_output_path)
+
+    # Get a revid about 1/3 through to use as the resume point
+    resume_idx = len(full_table) // 3
+    resume_revid = full_table.column("revid")[resume_idx].as_py()
+
+    print(f"Total revisions: {len(full_table)}, Resume point: {resume_idx}, Resume revid: {resume_revid}")
+
+    # Create a partial output by slicing the table
+    tester_partial = WikiqTester(SAILORMOON, "resume_simple_partial", in_compression="7z", out_format="parquet")
+    partial_output_path = os.path.join(tester_partial.output, f"{SAILORMOON}.parquet")
+
+    partial_table = full_table.slice(0, resume_idx + 1)
+    pq.write_table(partial_table, partial_output_path)
+
+    # Now resume from the partial output
+    try:
+        tester_partial.call_wikiq("--resume")
+    except subprocess.CalledProcessError as exc:
+        pytest.fail(exc.stderr.decode("utf8"))
+
+    # Read the resumed output
+    resumed_table = pq.read_table(partial_output_path)
+
+    # Convert to dataframes for comparison, sorting by revid
+    resumed_df = resumed_table.to_pandas().sort_values("revid").reset_index(drop=True)
+    full_df = full_table.to_pandas().sort_values("revid").reset_index(drop=True)
+
+    # Compare the dataframes
+    assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
+
+    print(f"Resume simple test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")