Handle the case where we have a valid resume file but a corrupted original.

This commit is contained in:
Nathan TeBlunthuis
2025-12-10 20:33:04 -08:00
parent f4a9491ff2
commit c3d31b4ab5
3 changed files with 262 additions and 43 deletions

View File

@@ -1226,27 +1226,33 @@ def main():
# Handle resume functionality before opening input file
resume_point = None
start_fresh = False
if args.resume:
if output_parquet and not args.stdout:
# First, merge any leftover temp files from a previous interrupted run
cleanup_interrupted_resume(output_file, args.partition_namespaces)
resume_point = get_resume_point(output_file, args.partition_namespaces)
if resume_point is not None:
if args.partition_namespaces:
ns_list = sorted(resume_point.keys())
print(f"Resuming with per-namespace resume points for {len(ns_list)} namespaces", file=sys.stderr)
for ns in ns_list:
pageid, revid = resume_point[ns]
print(f" namespace={ns}: pageid={pageid}, revid={revid}", file=sys.stderr)
else:
pageid, revid = resume_point
print(f"Resuming from last written point: pageid={pageid}, revid={revid}", file=sys.stderr)
cleanup_result = cleanup_interrupted_resume(output_file, args.partition_namespaces)
if cleanup_result == "start_fresh":
# All data was corrupted, start from beginning
start_fresh = True
print("Starting fresh due to data corruption.", file=sys.stderr)
else:
if args.partition_namespaces:
partition_dir = os.path.dirname(output_file)
sys.exit(f"Error: --resume specified but partitioned output not found in: {partition_dir}")
resume_point = get_resume_point(output_file, args.partition_namespaces)
if resume_point is not None:
if args.partition_namespaces:
ns_list = sorted(resume_point.keys())
print(f"Resuming with per-namespace resume points for {len(ns_list)} namespaces", file=sys.stderr)
for ns in ns_list:
pageid, revid = resume_point[ns]
print(f" namespace={ns}: pageid={pageid}, revid={revid}", file=sys.stderr)
else:
pageid, revid = resume_point
print(f"Resuming from last written point: pageid={pageid}, revid={revid}", file=sys.stderr)
else:
sys.exit(f"Error: --resume specified but output file not found: {output_file}")
if args.partition_namespaces:
partition_dir = os.path.dirname(output_file)
sys.exit(f"Error: --resume specified but partitioned output not found in: {partition_dir}")
else:
sys.exit(f"Error: --resume specified but output file not found: {output_file}")
else:
sys.exit("Error: --resume only works with parquet output (not stdout or TSV)")