Handle the case where we have a valid resume file but a corrupted original.
This commit is contained in:
@@ -1226,27 +1226,33 @@ def main():
|
||||
|
||||
# Handle resume functionality before opening input file
|
||||
resume_point = None
|
||||
start_fresh = False
|
||||
if args.resume:
|
||||
if output_parquet and not args.stdout:
|
||||
# First, merge any leftover temp files from a previous interrupted run
|
||||
cleanup_interrupted_resume(output_file, args.partition_namespaces)
|
||||
resume_point = get_resume_point(output_file, args.partition_namespaces)
|
||||
if resume_point is not None:
|
||||
if args.partition_namespaces:
|
||||
ns_list = sorted(resume_point.keys())
|
||||
print(f"Resuming with per-namespace resume points for {len(ns_list)} namespaces", file=sys.stderr)
|
||||
for ns in ns_list:
|
||||
pageid, revid = resume_point[ns]
|
||||
print(f" namespace={ns}: pageid={pageid}, revid={revid}", file=sys.stderr)
|
||||
else:
|
||||
pageid, revid = resume_point
|
||||
print(f"Resuming from last written point: pageid={pageid}, revid={revid}", file=sys.stderr)
|
||||
cleanup_result = cleanup_interrupted_resume(output_file, args.partition_namespaces)
|
||||
if cleanup_result == "start_fresh":
|
||||
# All data was corrupted, start from beginning
|
||||
start_fresh = True
|
||||
print("Starting fresh due to data corruption.", file=sys.stderr)
|
||||
else:
|
||||
if args.partition_namespaces:
|
||||
partition_dir = os.path.dirname(output_file)
|
||||
sys.exit(f"Error: --resume specified but partitioned output not found in: {partition_dir}")
|
||||
resume_point = get_resume_point(output_file, args.partition_namespaces)
|
||||
if resume_point is not None:
|
||||
if args.partition_namespaces:
|
||||
ns_list = sorted(resume_point.keys())
|
||||
print(f"Resuming with per-namespace resume points for {len(ns_list)} namespaces", file=sys.stderr)
|
||||
for ns in ns_list:
|
||||
pageid, revid = resume_point[ns]
|
||||
print(f" namespace={ns}: pageid={pageid}, revid={revid}", file=sys.stderr)
|
||||
else:
|
||||
pageid, revid = resume_point
|
||||
print(f"Resuming from last written point: pageid={pageid}, revid={revid}", file=sys.stderr)
|
||||
else:
|
||||
sys.exit(f"Error: --resume specified but output file not found: {output_file}")
|
||||
if args.partition_namespaces:
|
||||
partition_dir = os.path.dirname(output_file)
|
||||
sys.exit(f"Error: --resume specified but partitioned output not found in: {partition_dir}")
|
||||
else:
|
||||
sys.exit(f"Error: --resume specified but output file not found: {output_file}")
|
||||
else:
|
||||
sys.exit("Error: --resume only works with parquet output (not stdout or TSV)")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user