Add per-namespace resume support for partitioned parquet output.

- Implement per-namespace resume points (dict mapping namespace -> (pageid, revid))
  to correctly handle interleaved dump ordering in partitioned output
- Extract resume functionality to dedicated resume.py module
- Add graceful shutdown handling via shutdown_requested flag (CLI-level only)
- Use lazy ParquetWriter creation to avoid empty files on early exit
- Refactor writing logic to _write_batch() helper method
- Simplify control flow by replacing continue statements with should_write flag
This commit is contained in:
Nathan TeBlunthuis
2025-12-06 06:56:19 -08:00
parent d69d8b0df2
commit 577ddc87f5
3 changed files with 632 additions and 325 deletions

296
src/wikiq/resume.py Normal file
View File

@@ -0,0 +1,296 @@
"""
Checkpoint and resume functionality for wikiq parquet output.
This module handles:
- Finding resume points in existing parquet output
- Merging resumed data with existing output (streaming, memory-efficient)
"""
import os
import sys
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow.compute as pc
def get_resume_point(output_file, partition_namespaces=False):
"""
Find the resume point(s) from existing parquet output.
Args:
output_file: Path to the output file. For single files, this is the parquet file path.
For partitioned namespaces, this is the path like dir/dump.parquet where
namespace=* subdirectories are in the parent dir.
partition_namespaces: Whether the output uses namespace partitioning.
Returns:
For single files: A tuple (pageid, revid) for the row with the highest pageid,
or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid) for each partition,
or None if no partitions exist.
"""
try:
if partition_namespaces:
return _get_resume_point_partitioned(output_file)
else:
return _get_resume_point_single_file(output_file)
except Exception as e:
print(f"Error reading resume point from {output_file}: {e}", file=sys.stderr)
return None
def _get_resume_point_partitioned(output_file):
    """Find per-namespace resume points from partitioned output.

    Args:
        output_file: Path like dir/dump.parquet; namespace=* subdirectories are
            expected in its parent directory.

    Returns:
        A dict mapping namespace -> (max_pageid, max_revid) for each partition,
        or None when no partition data exists. Per-namespace points allow resume
        to correctly handle cases where different namespaces have different
        progress due to interleaved dump ordering.
    """
    partition_dir = os.path.dirname(output_file)
    # isdir() is False for missing paths, so one check covers both conditions.
    if not os.path.isdir(partition_dir):
        return None
    namespace_dirs = [d for d in os.listdir(partition_dir) if d.startswith('namespace=')]
    if not namespace_dirs:
        return None
    resume_points = {}
    for ns_dir in namespace_dirs:
        # Directory names follow the Hive convention "namespace=<int>".
        ns = int(ns_dir.split('=')[1])
        ns_path = os.path.join(partition_dir, ns_dir)
        parquet_files = [f for f in os.listdir(ns_path) if f.endswith('.parquet')]
        for pq_file in parquet_files:
            pq_path = os.path.join(ns_path, pq_file)
            try:
                table = pq.ParquetFile(pq_path).read(columns=['articleid', 'revid'])
                if table.num_rows == 0:
                    continue
                max_pageid = pc.max(table['articleid']).as_py()
                # Highest revid among the rows carrying the max pageid.
                mask = pc.equal(table['articleid'], max_pageid)
                max_revid = pc.max(pc.filter(table['revid'], mask)).as_py()
                # Keep the lexicographically largest (pageid, revid). A plain
                # pageid comparison would drop a higher revid when the same max
                # pageid appears in several files (revisions split across files).
                candidate = (max_pageid, max_revid)
                if ns not in resume_points or candidate > resume_points[ns]:
                    resume_points[ns] = candidate
            except Exception as e:
                # Skip unreadable files but keep scanning the partition.
                print(f"Warning: Could not read {pq_path}: {e}", file=sys.stderr)
                continue
    return resume_points if resume_points else None
def _get_resume_point_single_file(output_file):
    """Return (max_pageid, max_revid) from a single parquet file, or None.

    None is returned when the path is missing, is a directory, or the file
    holds no rows.
    """
    if not os.path.exists(output_file) or os.path.isdir(output_file):
        return None
    table = pq.ParquetFile(output_file).read(columns=['articleid', 'revid'])
    if table.num_rows == 0:
        return None
    # Locate the highest pageid, then the highest revid among its rows.
    last_pageid = pc.max(table['articleid']).as_py()
    last_page_revids = pc.filter(
        table['revid'], pc.equal(table['articleid'], last_pageid)
    )
    return (last_pageid, pc.max(last_page_revids).as_py())
def merge_parquet_files(original_path, temp_path, merged_path):
    """
    Merge two parquet files by streaming row groups from original and temp into merged.

    Row groups are copied one at a time, so memory use stays bounded by the
    largest row group. The writer is created lazily from the first non-empty
    source's schema, so no merged file is produced when both inputs are empty.

    Args:
        original_path: Path to the pre-existing parquet file (copied first).
        temp_path: Path to the parquet file with resumed data (appended second).
        merged_path: Destination path for the merged parquet file.

    Returns:
        True if a merged file was written, False if both sources held no row groups.
    """
    sources = (pq.ParquetFile(original_path), pq.ParquetFile(temp_path))
    merged_writer = None
    try:
        # Original first, then temp, preserving on-disk row order.
        for source in sources:
            for i in range(source.num_row_groups):
                row_group = source.read_row_group(i)
                if merged_writer is None:
                    # Lazy creation: take the schema from the first row group.
                    merged_writer = pq.ParquetWriter(
                        merged_path,
                        row_group.schema,
                        flavor="spark"
                    )
                merged_writer.write_table(row_group)
    finally:
        # Close even when a read/write raises, so no open handle or
        # unfinalized merged file is leaked.
        if merged_writer is not None:
            merged_writer.close()
    return merged_writer is not None
def merge_partitioned_namespaces(partition_dir, temp_suffix):
    """
    Fold resume temp files back into their originals across namespace partitions.

    During a resumed run, new rows for each namespace are written next to the
    original files with the temp suffix appended to the filename.
    E.g. original: namespace=0/file.parquet, temp: namespace=0/file.parquet.resume_temp

    Args:
        partition_dir: The partition directory containing namespace=* subdirs
        temp_suffix: The suffix appended to temp files (e.g., '.resume_temp')
    """
    for entry in os.listdir(partition_dir):
        if not entry.startswith('namespace='):
            continue
        ns_path = os.path.join(partition_dir, entry)
        for name in os.listdir(ns_path):
            if not name.endswith(temp_suffix):
                continue
            temp_path = os.path.join(ns_path, name)
            # Strip the suffix to recover the original file's name.
            original_path = os.path.join(ns_path, name[:-len(temp_suffix)])
            if not os.path.exists(original_path):
                # Nothing to merge with: the temp file becomes the original.
                os.rename(temp_path, original_path)
                continue
            merged_path = original_path + ".merged"
            if merge_parquet_files(original_path, temp_path, merged_path):
                # Swap the merged file in for the original.
                os.remove(original_path)
                os.rename(merged_path, original_path)
            else:
                # Both inputs held no row groups; discard the original too.
                os.remove(original_path)
            os.remove(temp_path)
def finalize_resume_merge(
    original_output_file,
    temp_output_file,
    partition_namespaces,
    original_partition_dir
):
    """
    Finalize a resumed run by merging temp output into the original output.

    Args:
        original_output_file: Path to the original output file
        temp_output_file: Path to the temp output file written during resume
        partition_namespaces: Whether using partitioned namespace output
        original_partition_dir: The partition directory (for partitioned output)

    Raises:
        Exception: If the merge fails; the temp output is left on disk so the
            new data can be recovered.
    """
    import shutil

    print("Merging resumed data with existing output...", file=sys.stderr)
    try:
        if partition_namespaces and original_partition_dir is not None:
            # Partitioned mode: temp files sit beside the originals inside each
            # namespace=* directory, tagged with the '.resume_temp' suffix.
            merge_partitioned_namespaces(original_partition_dir, ".resume_temp")
            # Drop the placeholder temp directory created at setup time.
            if os.path.isdir(temp_output_file):
                shutil.rmtree(temp_output_file)
        else:
            # Single-file mode: stream both files into <original>.merged, then
            # swap the merged result in for the original.
            merged_output_file = original_output_file + ".merged"
            merge_parquet_files(original_output_file, temp_output_file, merged_output_file)
            os.remove(original_output_file)
            os.rename(merged_output_file, original_output_file)
            if os.path.exists(temp_output_file):
                os.remove(temp_output_file)
        print("Merge complete.", file=sys.stderr)
    except Exception as err:
        print(f"Error merging resume data: {err}", file=sys.stderr)
        print(f"New data saved in: {temp_output_file}", file=sys.stderr)
        raise
def setup_resume_temp_output(output_file, partition_namespaces):
    """
    Set up temp output for resume mode.

    Args:
        output_file: The original output file path
        partition_namespaces: Whether using partitioned namespace output

    Returns:
        Tuple of (original_output_file, temp_output_file, original_partition_dir)
        or (None, None, None) if no existing output to resume from.
    """
    import shutil
    original_output_file = None
    temp_output_file = None
    original_partition_dir = None
    # For partitioned namespaces, check if the partition directory exists
    if partition_namespaces:
        partition_dir = os.path.dirname(output_file)
        # "Existing output" here means at least one namespace=* partition subdir.
        output_exists = os.path.isdir(partition_dir) and any(
            d.startswith('namespace=') for d in os.listdir(partition_dir)
        )
        if output_exists:
            original_partition_dir = partition_dir
    else:
        output_exists = isinstance(output_file, str) and os.path.exists(output_file)
    # NOTE(review): nesting reconstructed from a whitespace-stripped diff —
    # original_output_file appears to be set for both output modes here, with
    # partitioned callers relying on original_partition_dir instead; confirm
    # against the CLI caller.
    if output_exists:
        original_output_file = output_file
        temp_output_file = output_file + ".resume_temp"
        # Remove temp file/dir if it exists from a previous failed run
        if os.path.exists(temp_output_file):
            if os.path.isdir(temp_output_file):
                shutil.rmtree(temp_output_file)
            else:
                os.remove(temp_output_file)
        # For partitioned namespaces, create an empty temp directory
        # (actual temp files go in namespace=* dirs with .resume_temp suffix)
        if partition_namespaces:
            os.makedirs(temp_output_file, exist_ok=True)
    return original_output_file, temp_output_file, original_partition_dir