Add per-namespace resume support for partitioned parquet output.

- Implement per-namespace resume points (dict mapping namespace -> (pageid, revid))
  to correctly handle interleaved dump ordering in partitioned output
- Extract resume functionality to dedicated resume.py module
- Add graceful shutdown handling via shutdown_requested flag (CLI-level only)
- Use lazy ParquetWriter creation to avoid empty files on early exit
- Refactor writing logic to _write_batch() helper method
- Simplify control flow by replacing continue statements with should_write flag
This commit is contained in:
Nathan TeBlunthuis
2025-12-06 06:56:19 -08:00
parent d69d8b0df2
commit 577ddc87f5
3 changed files with 632 additions and 325 deletions

296
src/wikiq/resume.py Normal file
View File

@@ -0,0 +1,296 @@
"""
Checkpoint and resume functionality for wikiq parquet output.
This module handles:
- Finding resume points in existing parquet output
- Merging resumed data with existing output (streaming, memory-efficient)
"""
import os
import sys
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow.compute as pc
def get_resume_point(output_file, partition_namespaces=False):
"""
Find the resume point(s) from existing parquet output.
Args:
output_file: Path to the output file. For single files, this is the parquet file path.
For partitioned namespaces, this is the path like dir/dump.parquet where
namespace=* subdirectories are in the parent dir.
partition_namespaces: Whether the output uses namespace partitioning.
Returns:
For single files: A tuple (pageid, revid) for the row with the highest pageid,
or None if not found.
For partitioned: A dict mapping namespace -> (pageid, revid) for each partition,
or None if no partitions exist.
"""
try:
if partition_namespaces:
return _get_resume_point_partitioned(output_file)
else:
return _get_resume_point_single_file(output_file)
except Exception as e:
print(f"Error reading resume point from {output_file}: {e}", file=sys.stderr)
return None
def _get_resume_point_partitioned(output_file):
    """Find per-namespace resume points from partitioned output.

    Args:
        output_file: Path like dir/dump.parquet; namespace=* subdirectories are
            expected in its parent directory.

    Returns:
        A dict mapping namespace -> (max_pageid, max_revid) for each partition,
        or None when no partition data exists. Per-namespace points allow resume
        to correctly handle cases where different namespaces have different
        progress due to interleaved dump ordering.
    """
    partition_dir = os.path.dirname(output_file)
    # isdir() is False for missing paths, so one check covers both conditions.
    if not os.path.isdir(partition_dir):
        return None
    namespace_dirs = [d for d in os.listdir(partition_dir) if d.startswith('namespace=')]
    if not namespace_dirs:
        return None
    resume_points = {}
    for ns_dir in namespace_dirs:
        # Directory names follow the Hive convention "namespace=<int>".
        ns = int(ns_dir.split('=')[1])
        ns_path = os.path.join(partition_dir, ns_dir)
        parquet_files = [f for f in os.listdir(ns_path) if f.endswith('.parquet')]
        for pq_file in parquet_files:
            pq_path = os.path.join(ns_path, pq_file)
            try:
                table = pq.ParquetFile(pq_path).read(columns=['articleid', 'revid'])
                if table.num_rows == 0:
                    continue
                max_pageid = pc.max(table['articleid']).as_py()
                # Highest revid among the rows carrying the max pageid.
                mask = pc.equal(table['articleid'], max_pageid)
                max_revid = pc.max(pc.filter(table['revid'], mask)).as_py()
                # Keep the lexicographically largest (pageid, revid). A plain
                # pageid comparison would drop a higher revid when the same max
                # pageid appears in several files (revisions split across files).
                candidate = (max_pageid, max_revid)
                if ns not in resume_points or candidate > resume_points[ns]:
                    resume_points[ns] = candidate
            except Exception as e:
                # Skip unreadable files but keep scanning the partition.
                print(f"Warning: Could not read {pq_path}: {e}", file=sys.stderr)
                continue
    return resume_points if resume_points else None
def _get_resume_point_single_file(output_file):
    """Return (max_pageid, max_revid) from a single parquet file, or None.

    None is returned when the path is missing, is a directory, or the file
    holds no rows.
    """
    if not os.path.exists(output_file) or os.path.isdir(output_file):
        return None
    table = pq.ParquetFile(output_file).read(columns=['articleid', 'revid'])
    if table.num_rows == 0:
        return None
    # Locate the highest pageid, then the highest revid among its rows.
    last_pageid = pc.max(table['articleid']).as_py()
    last_page_revids = pc.filter(
        table['revid'], pc.equal(table['articleid'], last_pageid)
    )
    return (last_pageid, pc.max(last_page_revids).as_py())
def merge_parquet_files(original_path, temp_path, merged_path):
    """
    Merge two parquet files by streaming row groups from original and temp into merged.

    Row groups are copied one at a time, so memory use stays bounded by the
    largest row group. The writer is created lazily from the first non-empty
    source's schema, so no merged file is produced when both inputs are empty.

    Args:
        original_path: Path to the pre-existing parquet file (copied first).
        temp_path: Path to the parquet file with resumed data (appended second).
        merged_path: Destination path for the merged parquet file.

    Returns:
        True if a merged file was written, False if both sources held no row groups.
    """
    sources = (pq.ParquetFile(original_path), pq.ParquetFile(temp_path))
    merged_writer = None
    try:
        # Original first, then temp, preserving on-disk row order.
        for source in sources:
            for i in range(source.num_row_groups):
                row_group = source.read_row_group(i)
                if merged_writer is None:
                    # Lazy creation: take the schema from the first row group.
                    merged_writer = pq.ParquetWriter(
                        merged_path,
                        row_group.schema,
                        flavor="spark"
                    )
                merged_writer.write_table(row_group)
    finally:
        # Close even when a read/write raises, so no open handle or
        # unfinalized merged file is leaked.
        if merged_writer is not None:
            merged_writer.close()
    return merged_writer is not None
def merge_partitioned_namespaces(partition_dir, temp_suffix):
    """
    Fold resume temp files back into their originals across namespace partitions.

    During a resumed run, new rows for each namespace are written next to the
    original files with the temp suffix appended to the filename.
    E.g. original: namespace=0/file.parquet, temp: namespace=0/file.parquet.resume_temp

    Args:
        partition_dir: The partition directory containing namespace=* subdirs
        temp_suffix: The suffix appended to temp files (e.g., '.resume_temp')
    """
    for entry in os.listdir(partition_dir):
        if not entry.startswith('namespace='):
            continue
        ns_path = os.path.join(partition_dir, entry)
        for name in os.listdir(ns_path):
            if not name.endswith(temp_suffix):
                continue
            temp_path = os.path.join(ns_path, name)
            # Strip the suffix to recover the original file's name.
            original_path = os.path.join(ns_path, name[:-len(temp_suffix)])
            if not os.path.exists(original_path):
                # Nothing to merge with: the temp file becomes the original.
                os.rename(temp_path, original_path)
                continue
            merged_path = original_path + ".merged"
            if merge_parquet_files(original_path, temp_path, merged_path):
                # Swap the merged file in for the original.
                os.remove(original_path)
                os.rename(merged_path, original_path)
            else:
                # Both inputs held no row groups; discard the original too.
                os.remove(original_path)
            os.remove(temp_path)
def finalize_resume_merge(
    original_output_file,
    temp_output_file,
    partition_namespaces,
    original_partition_dir
):
    """
    Finalize a resumed run by merging temp output into the original output.

    Args:
        original_output_file: Path to the original output file
        temp_output_file: Path to the temp output file written during resume
        partition_namespaces: Whether using partitioned namespace output
        original_partition_dir: The partition directory (for partitioned output)

    Raises:
        Exception: If the merge fails; the temp output is left on disk so the
            new data can be recovered.
    """
    import shutil

    print("Merging resumed data with existing output...", file=sys.stderr)
    try:
        if partition_namespaces and original_partition_dir is not None:
            # Partitioned mode: temp files sit beside the originals inside each
            # namespace=* directory, tagged with the '.resume_temp' suffix.
            merge_partitioned_namespaces(original_partition_dir, ".resume_temp")
            # Drop the placeholder temp directory created at setup time.
            if os.path.isdir(temp_output_file):
                shutil.rmtree(temp_output_file)
        else:
            # Single-file mode: stream both files into <original>.merged, then
            # swap the merged result in for the original.
            merged_output_file = original_output_file + ".merged"
            merge_parquet_files(original_output_file, temp_output_file, merged_output_file)
            os.remove(original_output_file)
            os.rename(merged_output_file, original_output_file)
            if os.path.exists(temp_output_file):
                os.remove(temp_output_file)
        print("Merge complete.", file=sys.stderr)
    except Exception as err:
        print(f"Error merging resume data: {err}", file=sys.stderr)
        print(f"New data saved in: {temp_output_file}", file=sys.stderr)
        raise
def setup_resume_temp_output(output_file, partition_namespaces):
    """
    Set up temp output for resume mode.

    Args:
        output_file: The original output file path
        partition_namespaces: Whether using partitioned namespace output

    Returns:
        Tuple of (original_output_file, temp_output_file, original_partition_dir)
        or (None, None, None) if no existing output to resume from.
    """
    import shutil
    original_output_file = None
    temp_output_file = None
    original_partition_dir = None
    # For partitioned namespaces, check if the partition directory exists
    if partition_namespaces:
        partition_dir = os.path.dirname(output_file)
        # "Existing output" here means at least one namespace=* partition subdir.
        output_exists = os.path.isdir(partition_dir) and any(
            d.startswith('namespace=') for d in os.listdir(partition_dir)
        )
        if output_exists:
            original_partition_dir = partition_dir
    else:
        output_exists = isinstance(output_file, str) and os.path.exists(output_file)
    # NOTE(review): nesting reconstructed from a whitespace-stripped diff —
    # original_output_file appears to be set for both output modes here, with
    # partitioned callers relying on original_partition_dir instead; confirm
    # against the CLI caller.
    if output_exists:
        original_output_file = output_file
        temp_output_file = output_file + ".resume_temp"
        # Remove temp file/dir if it exists from a previous failed run
        if os.path.exists(temp_output_file):
            if os.path.isdir(temp_output_file):
                shutil.rmtree(temp_output_file)
            else:
                os.remove(temp_output_file)
        # For partitioned namespaces, create an empty temp directory
        # (actual temp files go in namespace=* dirs with .resume_temp suffix)
        if partition_namespaces:
            os.makedirs(temp_output_file, exist_ok=True)
    return original_output_file, temp_output_file, original_partition_dir