Add per-namespace resume support for partitioned parquet output.
- Implement per-namespace resume points (dict mapping namespace -> (pageid, revid)) to correctly handle interleaved dump ordering in partitioned output - Extract resume functionality to dedicated resume.py module - Add graceful shutdown handling via shutdown_requested flag (CLI-level only) - Use lazy ParquetWriter creation to avoid empty files on early exit - Refactor writing logic to _write_batch() helper method - Simplify control flow by replacing continue statements with should_write flag
This commit is contained in:
296
src/wikiq/resume.py
Normal file
296
src/wikiq/resume.py
Normal file
@@ -0,0 +1,296 @@
|
||||
"""
|
||||
Checkpoint and resume functionality for wikiq parquet output.
|
||||
|
||||
This module handles:
|
||||
- Finding resume points in existing parquet output
|
||||
- Merging resumed data with existing output (streaming, memory-efficient)
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pyarrow.dataset as ds
|
||||
import pyarrow.parquet as pq
|
||||
import pyarrow.compute as pc
|
||||
|
||||
|
||||
def get_resume_point(output_file, partition_namespaces=False):
    """
    Locate where a previous run left off in existing parquet output.

    Args:
        output_file: Path to the output file. For single files, this is the
            parquet file path. For partitioned namespaces, this is a path like
            dir/dump.parquet where namespace=* subdirectories are in the
            parent dir.
        partition_namespaces: Whether the output uses namespace partitioning.

    Returns:
        For single files: a tuple (pageid, revid) for the row with the highest
        pageid, or None if not found.
        For partitioned: a dict mapping namespace -> (pageid, revid) for each
        partition, or None if no partitions exist.
    """
    # Pick the strategy up front; both share the same error handling.
    finder = (
        _get_resume_point_partitioned
        if partition_namespaces
        else _get_resume_point_single_file
    )
    try:
        return finder(output_file)
    except Exception as e:
        # Resume is best-effort: report the problem and fall back to a fresh run.
        print(f"Error reading resume point from {output_file}: {e}", file=sys.stderr)
        return None
def _get_resume_point_partitioned(output_file):
    """Find per-namespace resume points from partitioned output.

    Returns a dict mapping namespace -> (max_pageid, max_revid) for each partition,
    or None when no partition data exists. Tracking a resume point per namespace
    allows resume to correctly handle cases where different namespaces have made
    different progress due to interleaved dump ordering.
    """
    partition_dir = os.path.dirname(output_file)
    # isdir() is already False for nonexistent paths, so one check suffices.
    if not os.path.isdir(partition_dir):
        return None

    namespace_dirs = [d for d in os.listdir(partition_dir) if d.startswith('namespace=')]
    if not namespace_dirs:
        return None

    resume_points = {}
    for ns_dir in namespace_dirs:
        ns_path = os.path.join(partition_dir, ns_dir)
        # A stray regular *file* named 'namespace=...' would make the
        # os.listdir() below raise and abort the entire scan (the caller's
        # catch-all would then discard all resume points); skip non-dirs.
        if not os.path.isdir(ns_path):
            continue
        ns = int(ns_dir.split('=')[1])

        # Find parquet files in this namespace directory
        parquet_files = [f for f in os.listdir(ns_path) if f.endswith('.parquet')]
        if not parquet_files:
            continue

        # Read all parquet files in this namespace
        for pq_file in parquet_files:
            pq_path = os.path.join(ns_path, pq_file)
            try:
                pf = pq.ParquetFile(pq_path)
                # Only the two columns needed for the resume point are loaded.
                table = pf.read(columns=['articleid', 'revid'])
                if table.num_rows == 0:
                    continue

                max_pageid = pc.max(table['articleid']).as_py()
                # Among rows with the max pageid, take the largest revid.
                mask = pc.equal(table['articleid'], max_pageid)
                max_revid = pc.max(pc.filter(table['revid'], mask)).as_py()

                # Keep the highest pageid for this namespace
                if ns not in resume_points or max_pageid > resume_points[ns][0]:
                    resume_points[ns] = (max_pageid, max_revid)
            except Exception as e:
                # One unreadable file should not prevent resuming the others.
                print(f"Warning: Could not read {pq_path}: {e}", file=sys.stderr)
                continue

    return resume_points if resume_points else None
def _get_resume_point_single_file(output_file):
    """Return (max_pageid, max_revid) from a single parquet file, or None.

    None is returned when the path is missing or is a directory rather than
    a parquet file, or when the file contains no rows.
    """
    # Only an existing non-directory path can be read as a parquet file.
    if not os.path.exists(output_file) or os.path.isdir(output_file):
        return None

    # Load just the two columns needed to compute the resume point.
    table = pq.ParquetFile(output_file).read(columns=['articleid', 'revid'])
    if table.num_rows == 0:
        return None

    top_pageid = pc.max(table['articleid']).as_py()
    # Among the row(s) carrying the top pageid, pick the largest revid.
    at_top = pc.equal(table['articleid'], top_pageid)
    top_revid = pc.max(pc.filter(table['revid'], at_top)).as_py()
    return (top_pageid, top_revid)
def merge_parquet_files(original_path, temp_path, merged_path):
    """
    Merge two parquet files by streaming row groups from original and temp into merged.

    Row groups are copied one at a time (all of the original's first, then all of
    the temp's), so only a single row group is held in memory at once. The writer
    is created lazily on the first non-empty source, so no output file is produced
    when both inputs are empty.

    Args:
        original_path: Parquet file whose row groups come first in the merge.
        temp_path: Parquet file whose row groups are appended after the original's.
        merged_path: Destination path for the merged parquet file.

    Returns:
        True if a merged file was created, False if both sources had no row groups.
    """
    writer = None
    try:
        # One unified loop replaces the previously duplicated copy logic.
        for source in (pq.ParquetFile(original_path), pq.ParquetFile(temp_path)):
            for i in range(source.num_row_groups):
                row_group = source.read_row_group(i)
                if writer is None:
                    # Lazy creation: schema comes from the first row group seen.
                    writer = pq.ParquetWriter(
                        merged_path,
                        row_group.schema,
                        flavor="spark"
                    )
                writer.write_table(row_group)
    finally:
        # Close even if a write fails, so the file handle is never leaked.
        if writer is not None:
            writer.close()
    return writer is not None
def merge_partitioned_namespaces(partition_dir, temp_suffix):
    """
    Merge partitioned namespace directories after resume.

    For partitioned namespaces, temp files are written alongside the original files
    in each namespace directory with the temp suffix appended to the filename.
    E.g., original: namespace=0/file.parquet, temp: namespace=0/file.parquet.resume_temp

    Args:
        partition_dir: The partition directory containing namespace=* subdirs
        temp_suffix: The suffix appended to temp files (e.g., '.resume_temp')
    """
    suffix_len = len(temp_suffix)
    for entry in os.listdir(partition_dir):
        if not entry.startswith('namespace='):
            continue
        ns_path = os.path.join(partition_dir, entry)

        # Every file carrying the temp suffix belongs to a resumed write.
        for name in os.listdir(ns_path):
            if not name.endswith(temp_suffix):
                continue
            temp_path = os.path.join(ns_path, name)
            # The original file name is the temp name with the suffix stripped.
            original_path = os.path.join(ns_path, name[:-suffix_len])

            if not os.path.exists(original_path):
                # Nothing to merge with: the temp file simply becomes the original.
                os.rename(temp_path, original_path)
                continue

            merged_path = original_path + ".merged"
            if merge_parquet_files(original_path, temp_path, merged_path):
                # Swap the merged result into the original's place.
                os.remove(original_path)
                os.rename(merged_path, original_path)
            else:
                # Both inputs were empty; nothing worth keeping.
                os.remove(original_path)
            os.remove(temp_path)
def finalize_resume_merge(
    original_output_file,
    temp_output_file,
    partition_namespaces,
    original_partition_dir
):
    """
    Finalize the resume by merging temp output with original output.

    Args:
        original_output_file: Path to the original output file
        temp_output_file: Path to the temp output file written during resume
        partition_namespaces: Whether using partitioned namespace output
        original_partition_dir: The partition directory (for partitioned output)

    Raises:
        Exception: If merge fails (temp file is preserved for recovery)
    """
    import shutil

    print("Merging resumed data with existing output...", file=sys.stderr)
    partitioned = partition_namespaces and original_partition_dir is not None
    try:
        if partitioned:
            # Partitioned mode: per-namespace temp files sit next to their
            # originals with a '.resume_temp' suffix; merge them in place.
            merge_partitioned_namespaces(original_partition_dir, ".resume_temp")
            # Drop the empty placeholder temp directory created at setup time.
            if os.path.isdir(temp_output_file):
                shutil.rmtree(temp_output_file)
        else:
            # Single-file mode: stream both files into a merged file, then
            # swap it into the original's place.
            merged_output_file = original_output_file + ".merged"
            merge_parquet_files(original_output_file, temp_output_file, merged_output_file)

            os.remove(original_output_file)
            os.rename(merged_output_file, original_output_file)

            # Clean up the temp file
            if os.path.exists(temp_output_file):
                os.remove(temp_output_file)

        print("Merge complete.", file=sys.stderr)
    except Exception as e:
        # Leave the temp output on disk so the new data can be recovered.
        print(f"Error merging resume data: {e}", file=sys.stderr)
        print(f"New data saved in: {temp_output_file}", file=sys.stderr)
        raise
def setup_resume_temp_output(output_file, partition_namespaces):
    """
    Set up temp output for resume mode.

    Args:
        output_file: The original output file path
        partition_namespaces: Whether using partitioned namespace output

    Returns:
        Tuple of (original_output_file, temp_output_file, original_partition_dir)
        or (None, None, None) if no existing output to resume from.
    """
    import shutil

    # Decide whether there is prior output worth resuming from.
    partition_dir = None
    if partition_namespaces:
        candidate_dir = os.path.dirname(output_file)
        resumable = os.path.isdir(candidate_dir) and any(
            entry.startswith('namespace=') for entry in os.listdir(candidate_dir)
        )
        if resumable:
            partition_dir = candidate_dir
    else:
        resumable = isinstance(output_file, str) and os.path.exists(output_file)

    if not resumable:
        return None, None, None

    temp_output_file = output_file + ".resume_temp"

    # A stale temp file/dir left by a previous failed run must be cleared first.
    if os.path.isdir(temp_output_file):
        shutil.rmtree(temp_output_file)
    elif os.path.exists(temp_output_file):
        os.remove(temp_output_file)

    # Partitioned output gets an empty placeholder temp directory; the real
    # temp files land inside namespace=* dirs with the .resume_temp suffix.
    if partition_namespaces:
        os.makedirs(temp_output_file, exist_ok=True)

    return output_file, temp_output_file, partition_dir
Reference in New Issue
Block a user