add logic for resuming after a resume.
This commit is contained in:
@@ -34,6 +34,7 @@ from wikiq.resume import (
|
||||
setup_resume_temp_output,
|
||||
finalize_resume_merge,
|
||||
get_checkpoint_path,
|
||||
cleanup_interrupted_resume,
|
||||
)
|
||||
|
||||
TO_ENCODE = ("title", "editor")
|
||||
@@ -1227,6 +1228,8 @@ def main():
|
||||
resume_point = None
|
||||
if args.resume:
|
||||
if output_parquet and not args.stdout:
|
||||
# First, merge any leftover temp files from a previous interrupted run
|
||||
cleanup_interrupted_resume(output_file, args.partition_namespaces)
|
||||
resume_point = get_resume_point(output_file, args.partition_namespaces)
|
||||
if resume_point is not None:
|
||||
if args.partition_namespaces:
|
||||
|
||||
@@ -14,6 +14,54 @@ import sys
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
|
||||
def cleanup_interrupted_resume(output_file, partition_namespaces):
|
||||
"""
|
||||
Merge any leftover .resume_temp files from a previous interrupted run.
|
||||
|
||||
This should be called BEFORE get_resume_point() so the resume point
|
||||
is calculated from the merged data.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
if partition_namespaces:
|
||||
partition_dir = os.path.dirname(output_file)
|
||||
output_filename = os.path.basename(output_file)
|
||||
temp_suffix = ".resume_temp"
|
||||
|
||||
if not os.path.isdir(partition_dir):
|
||||
return
|
||||
|
||||
has_old_temp_files = False
|
||||
for ns_dir in os.listdir(partition_dir):
|
||||
if ns_dir.startswith('namespace='):
|
||||
temp_path = os.path.join(partition_dir, ns_dir, output_filename + temp_suffix)
|
||||
if os.path.exists(temp_path):
|
||||
has_old_temp_files = True
|
||||
break
|
||||
|
||||
if has_old_temp_files:
|
||||
print("Found leftover temp files from previous interrupted partitioned run, merging first...", file=sys.stderr)
|
||||
merge_partitioned_namespaces(partition_dir, temp_suffix)
|
||||
print("Previous temp files merged successfully.", file=sys.stderr)
|
||||
else:
|
||||
temp_output_file = output_file + ".resume_temp"
|
||||
if os.path.exists(temp_output_file) and not os.path.isdir(temp_output_file):
|
||||
print("Found leftover temp file from previous interrupted run, merging first...", file=sys.stderr)
|
||||
merged_path = output_file + ".merged"
|
||||
merged = merge_parquet_files(output_file, temp_output_file, merged_path)
|
||||
if merged is None:
|
||||
# Temp file was invalid, just remove it
|
||||
os.remove(temp_output_file)
|
||||
elif merged:
|
||||
os.remove(output_file)
|
||||
os.rename(merged_path, output_file)
|
||||
os.remove(temp_output_file)
|
||||
print("Previous temp file merged successfully.", file=sys.stderr)
|
||||
else:
|
||||
# Both empty - unusual
|
||||
os.remove(temp_output_file)
|
||||
|
||||
|
||||
def get_checkpoint_path(output_file, partition_namespaces=False):
|
||||
"""Get the path to the checkpoint file for a given output file.
|
||||
|
||||
@@ -183,10 +231,19 @@ def merge_parquet_files(original_path, temp_path, merged_path):
|
||||
Merge two parquet files by streaming row groups from original and temp into merged.
|
||||
|
||||
This is memory-efficient: only one row group is loaded at a time.
|
||||
Returns True if merged file was created, False if both sources were empty.
|
||||
Returns:
|
||||
True if merged file was created
|
||||
False if both sources were empty
|
||||
None if temp file is invalid (caller should keep original unchanged)
|
||||
"""
|
||||
original_pq = pq.ParquetFile(original_path)
|
||||
temp_pq = pq.ParquetFile(temp_path)
|
||||
|
||||
try:
|
||||
temp_pq = pq.ParquetFile(temp_path)
|
||||
except Exception:
|
||||
# Temp file is invalid (empty or corrupted) - no new data was written
|
||||
print(f"Note: No new data in temp file (namespace had no records after resume point)", file=sys.stderr)
|
||||
return None
|
||||
|
||||
merged_writer = None
|
||||
|
||||
@@ -253,7 +310,10 @@ def merge_partitioned_namespaces(partition_dir, temp_suffix):
|
||||
merged_path = original_path + ".merged"
|
||||
merged = merge_parquet_files(original_path, temp_path, merged_path)
|
||||
|
||||
if merged:
|
||||
if merged is None:
|
||||
# Temp file was invalid (no new data), keep original unchanged
|
||||
os.remove(temp_path)
|
||||
elif merged:
|
||||
# Replace the original file with the merged file
|
||||
os.remove(original_path)
|
||||
os.rename(merged_path, original_path)
|
||||
@@ -263,8 +323,13 @@ def merge_partitioned_namespaces(partition_dir, temp_suffix):
|
||||
os.remove(original_path)
|
||||
os.remove(temp_path)
|
||||
else:
|
||||
# No original file, rename temp to original
|
||||
os.rename(temp_path, original_path)
|
||||
# No original file, rename temp to original only if valid
|
||||
try:
|
||||
pq.ParquetFile(temp_path)
|
||||
os.rename(temp_path, original_path)
|
||||
except Exception:
|
||||
# Temp file invalid, just remove it
|
||||
os.remove(temp_path)
|
||||
|
||||
|
||||
def finalize_resume_merge(
|
||||
@@ -299,15 +364,23 @@ def finalize_resume_merge(
|
||||
else:
|
||||
# Merge single parquet files
|
||||
merged_output_file = original_output_file + ".merged"
|
||||
merge_parquet_files(original_output_file, temp_output_file, merged_output_file)
|
||||
merged = merge_parquet_files(original_output_file, temp_output_file, merged_output_file)
|
||||
|
||||
# Replace the original file with the merged file
|
||||
os.remove(original_output_file)
|
||||
os.rename(merged_output_file, original_output_file)
|
||||
|
||||
# Clean up the temp file
|
||||
if os.path.exists(temp_output_file):
|
||||
os.remove(temp_output_file)
|
||||
if merged is None:
|
||||
# Temp file was invalid (no new data), keep original unchanged
|
||||
if os.path.exists(temp_output_file):
|
||||
os.remove(temp_output_file)
|
||||
elif merged:
|
||||
# Replace the original file with the merged file
|
||||
os.remove(original_output_file)
|
||||
os.rename(merged_output_file, original_output_file)
|
||||
if os.path.exists(temp_output_file):
|
||||
os.remove(temp_output_file)
|
||||
else:
|
||||
# Both files were empty - unusual, but clean up
|
||||
os.remove(original_output_file)
|
||||
if os.path.exists(temp_output_file):
|
||||
os.remove(temp_output_file)
|
||||
|
||||
print("Merge complete.", file=sys.stderr)
|
||||
except Exception as e:
|
||||
@@ -354,15 +427,15 @@ def setup_resume_temp_output(output_file, partition_namespaces):
|
||||
original_output_file = output_file
|
||||
temp_output_file = output_file + ".resume_temp"
|
||||
|
||||
# Remove temp file/dir if it exists from a previous failed run
|
||||
# Note: cleanup_interrupted_resume() should have been called before this
|
||||
# to merge any leftover temp files from a previous interrupted run.
|
||||
# Here we just clean up any remaining temp directory markers.
|
||||
if os.path.exists(temp_output_file):
|
||||
if os.path.isdir(temp_output_file):
|
||||
shutil.rmtree(temp_output_file)
|
||||
else:
|
||||
os.remove(temp_output_file)
|
||||
|
||||
# For partitioned namespaces, create an empty temp directory
|
||||
# (actual temp files go in namespace=* dirs with .resume_temp suffix)
|
||||
if partition_namespaces:
|
||||
os.makedirs(temp_output_file, exist_ok=True)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user