Handle the case where we have a valid resume file but a corrupted original.

This commit is contained in:
Nathan TeBlunthuis 2025-12-10 20:33:04 -08:00
parent f4a9491ff2
commit c3d31b4ab5
3 changed files with 262 additions and 43 deletions

View File

@ -1226,10 +1226,16 @@ def main():
# Handle resume functionality before opening input file
resume_point = None
start_fresh = False
if args.resume:
if output_parquet and not args.stdout:
# First, merge any leftover temp files from a previous interrupted run
cleanup_interrupted_resume(output_file, args.partition_namespaces)
cleanup_result = cleanup_interrupted_resume(output_file, args.partition_namespaces)
if cleanup_result == "start_fresh":
# All data was corrupted, start from beginning
start_fresh = True
print("Starting fresh due to data corruption.", file=sys.stderr)
else:
resume_point = get_resume_point(output_file, args.partition_namespaces)
if resume_point is not None:
if args.partition_namespaces:

View File

@ -20,6 +20,10 @@ def cleanup_interrupted_resume(output_file, partition_namespaces):
This should be called BEFORE get_resume_point() so the resume point
is calculated from the merged data.
Returns:
None - no temp files found or normal merge completed
"start_fresh" - both original and temp were corrupted and deleted
"""
import shutil
@ -40,19 +44,53 @@ def cleanup_interrupted_resume(output_file, partition_namespaces):
break
if has_old_temp_files:
print("Found leftover temp files from previous interrupted partitioned run, merging first...", file=sys.stderr)
merge_partitioned_namespaces(partition_dir, temp_suffix)
print(f"Found leftover temp files in {partition_dir} from previous interrupted partitioned run, merging first...", file=sys.stderr)
had_corruption = merge_partitioned_namespaces(partition_dir, temp_suffix)
# Check if any valid data remains after merge
has_valid_data = False
for ns_dir in os.listdir(partition_dir):
if ns_dir.startswith('namespace='):
ns_path = os.path.join(partition_dir, ns_dir)
parquet_files = [f for f in os.listdir(ns_path) if f.endswith('.parquet') and not f.endswith('.resume_temp')]
if parquet_files:
has_valid_data = True
break
if had_corruption and not has_valid_data:
# All data was corrupted, remove checkpoint and start fresh
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
if os.path.exists(checkpoint_path):
os.remove(checkpoint_path)
print("All partitioned files were corrupted, will start fresh.", file=sys.stderr)
return "start_fresh"
print("Previous temp files merged successfully.", file=sys.stderr)
else:
temp_output_file = output_file + ".resume_temp"
if os.path.exists(temp_output_file) and not os.path.isdir(temp_output_file):
print("Found leftover temp file from previous interrupted run, merging first...", file=sys.stderr)
print(f"Found leftover temp file {temp_output_file} from previous interrupted run, merging first...", file=sys.stderr)
merged_path = output_file + ".merged"
merged = merge_parquet_files(output_file, temp_output_file, merged_path)
if merged is None:
if merged == "original_only":
# Temp file was invalid, just remove it
os.remove(temp_output_file)
elif merged:
elif merged == "temp_only":
# Original was corrupted, use temp as new base
os.remove(output_file)
os.rename(temp_output_file, output_file)
print("Recovered from temp file (original was corrupted).", file=sys.stderr)
elif merged == "both_invalid":
# Both files corrupted, remove both and start fresh
os.remove(output_file)
os.remove(temp_output_file)
# Also remove stale checkpoint file
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
if os.path.exists(checkpoint_path):
os.remove(checkpoint_path)
print("Both files were corrupted, will start fresh.", file=sys.stderr)
return "start_fresh"
elif merged == "merged":
os.remove(output_file)
os.rename(merged_path, output_file)
os.remove(temp_output_file)
@ -138,13 +176,14 @@ def get_resume_point(output_file, partition_namespaces=False):
or None if no partitions exist.
"""
# First try checkpoint file (fast)
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
checkpoint_result = read_checkpoint(output_file, partition_namespaces)
if checkpoint_result is not None:
print(f"Resume point found in checkpoint file", file=sys.stderr)
print(f"Resume point found in checkpoint file {checkpoint_path}", file=sys.stderr)
return checkpoint_result
# Fall back to scanning parquet (slow, for backwards compatibility)
print(f"No checkpoint file found, scanning parquet output...", file=sys.stderr)
print(f"No checkpoint file found at {checkpoint_path}, scanning parquet output...", file=sys.stderr)
try:
if partition_namespaces:
return _get_resume_point_partitioned(output_file)
@ -232,24 +271,39 @@ def merge_parquet_files(original_path, temp_path, merged_path):
This is memory-efficient: only one row group is loaded at a time.
Returns:
True if merged file was created
False if both sources were empty
None if temp file is invalid (caller should keep original unchanged)
"merged" - merged file was created from both sources
"original_only" - temp was invalid, keep original unchanged
"temp_only" - original was corrupted but temp is valid, use temp
"both_invalid" - both files invalid, delete both and start fresh
False - both files were valid but empty
"""
original_valid = False
temp_valid = False
original_pq = None
temp_pq = None
try:
original_pq = pq.ParquetFile(original_path)
except Exception:
# Original file is invalid (empty or corrupted)
print(f"Note: No data in original file {original_path} (namespace had no records or file was not properly written)", file=sys.stderr)
return None
original_valid = True
except Exception as e:
print(f"Warning: Original file {original_path} is corrupted or invalid: {e}", file=sys.stderr)
try:
temp_pq = pq.ParquetFile(temp_path)
temp_valid = True
except Exception:
# Temp file is invalid (empty or corrupted) - no new data was written
print(f"Note: No new data in temp file {temp_path} (namespace had no records after resume point)", file=sys.stderr)
return None
if not original_valid and not temp_valid:
print(f"Both original and temp files are invalid, will start fresh", file=sys.stderr)
return "both_invalid"
if not original_valid and temp_valid:
print(f"Original file corrupted but temp file is valid, recovering from temp", file=sys.stderr)
return "temp_only"
if original_valid and not temp_valid:
return "original_only"
merged_writer = None
@ -278,7 +332,7 @@ def merge_parquet_files(original_path, temp_path, merged_path):
# Close the writer
if merged_writer is not None:
merged_writer.close()
return True
return "merged"
return False
@ -293,8 +347,13 @@ def merge_partitioned_namespaces(partition_dir, temp_suffix):
Args:
partition_dir: The partition directory containing namespace=* subdirs
temp_suffix: The suffix appended to temp files (e.g., '.resume_temp')
Returns:
True if at least one namespace has valid data after merge
False if all namespaces ended up with corrupted/deleted data
"""
namespace_dirs = [d for d in os.listdir(partition_dir) if d.startswith('namespace=')]
had_corruption = False
for ns_dir in namespace_dirs:
ns_path = os.path.join(partition_dir, ns_dir)
@ -316,16 +375,25 @@ def merge_partitioned_namespaces(partition_dir, temp_suffix):
merged_path = original_path + ".merged"
merged = merge_parquet_files(original_path, temp_path, merged_path)
if merged is None:
if merged == "original_only":
# Temp file was invalid (no new data), keep original unchanged
os.remove(temp_path)
elif merged:
elif merged == "temp_only":
# Original was corrupted, use temp as new base
os.remove(original_path)
os.rename(temp_path, original_path)
elif merged == "both_invalid":
# Both files corrupted, remove both
os.remove(original_path)
os.remove(temp_path)
had_corruption = True
elif merged == "merged":
# Replace the original file with the merged file
os.remove(original_path)
os.rename(merged_path, original_path)
os.remove(temp_path)
else:
# Both files were empty, just remove them
# Both files were empty (False), just remove them
os.remove(original_path)
os.remove(temp_path)
else:
@ -336,6 +404,9 @@ def merge_partitioned_namespaces(partition_dir, temp_suffix):
except Exception:
# Temp file invalid, just remove it
os.remove(temp_path)
had_corruption = True
return had_corruption
def finalize_resume_merge(
@ -372,25 +443,34 @@ def finalize_resume_merge(
merged_output_file = original_output_file + ".merged"
merged = merge_parquet_files(original_output_file, temp_output_file, merged_output_file)
if merged is None:
if merged == "original_only":
# Temp file was invalid (no new data), keep original unchanged
if os.path.exists(temp_output_file):
os.remove(temp_output_file)
elif merged:
elif merged == "temp_only":
# Original was corrupted, use temp as new base
os.remove(original_output_file)
os.rename(temp_output_file, original_output_file)
elif merged == "both_invalid":
# Both files corrupted, remove both
os.remove(original_output_file)
if os.path.exists(temp_output_file):
os.remove(temp_output_file)
elif merged == "merged":
# Replace the original file with the merged file
os.remove(original_output_file)
os.rename(merged_output_file, original_output_file)
if os.path.exists(temp_output_file):
os.remove(temp_output_file)
else:
# Both files were empty - unusual, but clean up
# Both files were empty (False) - unusual, but clean up
os.remove(original_output_file)
if os.path.exists(temp_output_file):
os.remove(temp_output_file)
print("Merge complete.", file=sys.stderr)
except Exception as e:
print(f"Error merging resume data: {e}", file=sys.stderr)
print(f"Error merging resume data for {original_output_file}: {e}", file=sys.stderr)
print(f"New data saved in: {temp_output_file}", file=sys.stderr)
raise

View File

@ -1070,9 +1070,9 @@ def test_resume_merge_with_invalid_temp_file():
with open(temp_path, 'w') as f:
f.write("")
# merge_parquet_files should return None for invalid temp file
# merge_parquet_files should return "original_only" for invalid temp file
result = merge_parquet_files(original_path, temp_path, merged_path)
assert result is None, "Expected None when temp file is invalid"
assert result == "original_only", f"Expected 'original_only' when temp file is invalid, got {result}"
# Original file should still exist and be unchanged
assert os.path.exists(original_path), "Original file should still exist"
@ -1083,3 +1083,136 @@ def test_resume_merge_with_invalid_temp_file():
assert not os.path.exists(merged_path), "Merged file should not be created"
print("Resume merge with invalid temp file test passed!")
def test_resume_merge_with_corrupted_original():
    """Verify merge_parquet_files reports 'temp_only' when the original parquet
    file is corrupted but the temp file is readable.

    This situation arises when the original file was being written at the
    moment the process was killed, leaving it in a corrupted state.
    """
    import tempfile

    import pyarrow as pa
    import pyarrow.parquet as pq

    from wikiq.resume import merge_parquet_files

    with tempfile.TemporaryDirectory() as workdir:
        orig_file = os.path.join(workdir, "original.parquet")
        temp_file = os.path.join(workdir, "temp.parquet")
        merged_file = os.path.join(workdir, "merged.parquet")

        # Junk bytes: the original will fail parquet validation.
        with open(orig_file, 'w') as fh:
            fh.write("corrupted data")

        # The temp file, by contrast, holds real rows.
        pq.write_table(
            pa.table({"articleid": [4, 5, 6], "revid": [40, 50, 60]}),
            temp_file,
        )

        # Corrupted original + valid temp must yield the "temp_only" verdict.
        outcome = merge_parquet_files(orig_file, temp_file, merged_file)
        assert outcome == "temp_only", f"Expected 'temp_only' when original is corrupted, got {outcome}"

        # No merged output is produced; the caller promotes the temp file itself.
        assert not os.path.exists(merged_file), "Merged file should not be created for temp_only case"

    print("Resume merge with corrupted original test passed!")
def test_resume_merge_both_invalid():
    """Verify merge_parquet_files returns 'both_invalid' when neither input parses."""
    import tempfile

    from wikiq.resume import merge_parquet_files

    with tempfile.TemporaryDirectory() as workdir:
        orig_file = os.path.join(workdir, "original.parquet")
        temp_file = os.path.join(workdir, "temp.parquet")
        merged_file = os.path.join(workdir, "merged.parquet")

        # Neither file contains valid parquet data.
        with open(orig_file, 'w') as fh:
            fh.write("corrupted original")
        with open(temp_file, 'w') as fh:
            fh.write("corrupted temp")

        # Two corrupted inputs must produce the "both_invalid" verdict.
        verdict = merge_parquet_files(orig_file, temp_file, merged_file)
        assert verdict == "both_invalid", f"Expected 'both_invalid' when both files corrupted, got {verdict}"

    print("Resume merge with both invalid test passed!")
def test_cleanup_interrupted_resume_both_corrupted():
    """Verify cleanup_interrupted_resume signals 'start_fresh' — and removes the
    output, temp, and checkpoint files — when both parquet files are corrupted.
    """
    import json
    import tempfile

    from wikiq.resume import cleanup_interrupted_resume, get_checkpoint_path

    with tempfile.TemporaryDirectory() as workdir:
        out_path = os.path.join(workdir, "output.parquet")
        tmp_path = out_path + ".resume_temp"
        ckpt_path = get_checkpoint_path(out_path, partition_namespaces=False)

        # Both parquet files are garbage.
        with open(out_path, 'w') as fh:
            fh.write("corrupted original")
        with open(tmp_path, 'w') as fh:
            fh.write("corrupted temp")

        # A checkpoint also exists; cleanup must discard it as stale.
        with open(ckpt_path, 'w') as fh:
            json.dump({"pageid": 100, "revid": 200}, fh)

        # With no recoverable data anywhere, cleanup must say to start over.
        outcome = cleanup_interrupted_resume(out_path, partition_namespaces=False)
        assert outcome == "start_fresh", f"Expected 'start_fresh', got {outcome}"

        # Every artifact of the corrupted run should be gone.
        assert not os.path.exists(out_path), "Corrupted original should be deleted"
        assert not os.path.exists(tmp_path), "Corrupted temp should be deleted"
        assert not os.path.exists(ckpt_path), "Stale checkpoint should be deleted"

    print("Cleanup interrupted resume with both corrupted test passed!")
def test_cleanup_interrupted_resume_original_corrupted_temp_valid():
    """Verify cleanup promotes a valid temp file when the original is corrupted,
    and that a resume point can then be read from the recovered file.
    """
    import tempfile

    import pyarrow as pa
    import pyarrow.parquet as pq

    from wikiq.resume import cleanup_interrupted_resume, get_resume_point

    with tempfile.TemporaryDirectory() as workdir:
        out_path = os.path.join(workdir, "output.parquet")
        tmp_path = out_path + ".resume_temp"

        # Original is garbage; the temp file carries the real rows.
        with open(out_path, 'w') as fh:
            fh.write("corrupted original")
        pq.write_table(
            pa.table({"articleid": [10, 20, 30], "revid": [100, 200, 300]}),
            tmp_path,
        )

        # Recovery path: temp replaces the original, so no "start_fresh" signal.
        outcome = cleanup_interrupted_resume(out_path, partition_namespaces=False)
        assert outcome is None, f"Expected None (normal recovery), got {outcome}"

        assert os.path.exists(out_path), "Output file should exist after recovery"
        assert not os.path.exists(tmp_path), "Temp file should be renamed to output"

        # The promoted file must contain exactly the temp file's rows...
        recovered = pq.read_table(out_path)
        assert len(recovered) == 3, "Recovered file should have 3 rows"

        # ...and yield a usable resume point (last articleid/revid pair).
        point = get_resume_point(out_path, partition_namespaces=False)
        assert point is not None, "Should find resume point from recovered file"
        assert point == (30, 300), f"Expected (30, 300), got {point}"

    print("Cleanup with original corrupted, temp valid test passed!")