diff --git a/src/wikiq/resume.py b/src/wikiq/resume.py index 2c93a86..5af9e3c 100644 --- a/src/wikiq/resume.py +++ b/src/wikiq/resume.py @@ -292,10 +292,13 @@ def merge_parquet_files(original_path, temp_path, merged_path): print(f"Warning: Original file {original_path} is corrupted or invalid: {e}", file=sys.stderr) try: - temp_pq = pq.ParquetFile(temp_path) - temp_valid = True + if not os.path.exists(temp_path): + print(f"Note: Temp file {temp_path} does not exist (namespace had no records after resume point)", file=sys.stderr) + else: + temp_pq = pq.ParquetFile(temp_path) + temp_valid = True except Exception: - print(f"Note: No new data in temp file {temp_path} (namespace had no records after resume point)", file=sys.stderr) + print(f"Note: No new data in temp file {temp_path} (file exists but is invalid)", file=sys.stderr) if not original_valid and not temp_valid: print(f"Both original and temp files are invalid, will start fresh", file=sys.stderr) @@ -380,33 +383,40 @@ def merge_partitioned_namespaces(partition_dir, temp_suffix): if merged == "original_only": # Temp file was invalid (no new data), keep original unchanged - os.remove(temp_path) + if os.path.exists(temp_path): + os.remove(temp_path) elif merged == "temp_only": # Original was corrupted, use temp as new base os.remove(original_path) os.rename(temp_path, original_path) elif merged == "both_invalid": # Both files corrupted, remove both - os.remove(original_path) - os.remove(temp_path) + if os.path.exists(original_path): + os.remove(original_path) + if os.path.exists(temp_path): + os.remove(temp_path) had_corruption = True elif merged == "merged": # Replace the original file with the merged file os.remove(original_path) os.rename(merged_path, original_path) - os.remove(temp_path) + if os.path.exists(temp_path): + os.remove(temp_path) else: # Both files were empty (False), just remove them - os.remove(original_path) - os.remove(temp_path) + if os.path.exists(original_path): + os.remove(original_path) + if os.path.exists(temp_path): + os.remove(temp_path) else: # No original file, rename temp to original only if valid try: pq.ParquetFile(temp_path) os.rename(temp_path, original_path) except Exception: - # Temp file invalid, just remove it - os.remove(temp_path) + # Temp file invalid or missing, just remove it if it exists + if os.path.exists(temp_path): + os.remove(temp_path) had_corruption = True return had_corruption