diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py index 153f2fa..98ef4dc 100755 --- a/src/wikiq/__init__.py +++ b/src/wikiq/__init__.py @@ -1248,11 +1248,32 @@ def main(): pageid, revid = resume_point print(f"Resuming from last written point: pageid={pageid}, revid={revid}", file=sys.stderr) else: + # resume_point is None - check if file exists but is corrupt if args.partition_namespaces: partition_dir = os.path.dirname(output_file) - sys.exit(f"Error: --resume specified but partitioned output not found in: {partition_dir}") + output_filename = os.path.basename(output_file) + corrupt_files = [] + if os.path.isdir(partition_dir): + for d in os.listdir(partition_dir): + if d.startswith('namespace='): + filepath = os.path.join(partition_dir, d, output_filename) + if os.path.exists(filepath): + corrupt_files.append(filepath) + if corrupt_files: + print("Output files exist but are corrupt, deleting and starting fresh.", file=sys.stderr) + for filepath in corrupt_files: + os.remove(filepath) + start_fresh = True + else: + sys.exit(f"Error: --resume specified but partitioned output not found in: {partition_dir}") else: - sys.exit(f"Error: --resume specified but output file not found: {output_file}") + if os.path.exists(output_file): + # File exists but is corrupt - start fresh + print(f"Output file {output_file} exists but is corrupt, starting fresh.", file=sys.stderr) + os.remove(output_file) + start_fresh = True + else: + sys.exit(f"Error: --resume specified but output file not found: {output_file}") else: sys.exit("Error: --resume only works with parquet output (not stdout or TSV)")