Handle the case where the resume file is valid but the original file is corrupted.
This commit is contained in:
@@ -1070,9 +1070,9 @@ def test_resume_merge_with_invalid_temp_file():
|
||||
with open(temp_path, 'w') as f:
|
||||
f.write("")
|
||||
|
||||
# merge_parquet_files should return None for invalid temp file
|
||||
# merge_parquet_files should return "original_only" for invalid temp file
|
||||
result = merge_parquet_files(original_path, temp_path, merged_path)
|
||||
assert result is None, "Expected None when temp file is invalid"
|
||||
assert result == "original_only", f"Expected 'original_only' when temp file is invalid, got {result}"
|
||||
|
||||
# Original file should still exist and be unchanged
|
||||
assert os.path.exists(original_path), "Original file should still exist"
|
||||
@@ -1083,3 +1083,136 @@ def test_resume_merge_with_invalid_temp_file():
|
||||
assert not os.path.exists(merged_path), "Merged file should not be created"
|
||||
|
||||
print("Resume merge with invalid temp file test passed!")
|
||||
|
||||
|
||||
def test_resume_merge_with_corrupted_original():
    """Check the merge outcome when only the temp file is readable parquet.

    The original output may be left corrupted if the process was killed
    while writing it. Given a valid temp file next to such an original,
    ``merge_parquet_files`` is expected to report ``"temp_only"`` and to
    leave the merged path uncreated.
    """
    import tempfile

    import pyarrow as pa
    import pyarrow.parquet as pq
    from wikiq.resume import merge_parquet_files

    with tempfile.TemporaryDirectory() as workdir:
        original_path, temp_path, merged_path = (
            os.path.join(workdir, filename)
            for filename in ("original.parquet", "temp.parquet", "merged.parquet")
        )

        # The original is plain text, so it cannot be parsed as parquet.
        with open(original_path, "w") as broken:
            broken.write("corrupted data")

        # The temp file is a genuine parquet file with three rows.
        pq.write_table(
            pa.table({"articleid": [4, 5, 6], "revid": [40, 50, 60]}),
            temp_path,
        )

        result = merge_parquet_files(original_path, temp_path, merged_path)
        assert result == "temp_only", f"Expected 'temp_only' when original is corrupted, got {result}"

        # No merged output is expected here; per the original test's note,
        # the caller is the one that renames the temp file into place.
        merged_exists = os.path.exists(merged_path)
        assert not merged_exists, "Merged file should not be created for temp_only case"

    print("Resume merge with corrupted original test passed!")
|
||||
|
||||
|
||||
def test_resume_merge_both_invalid():
    """Check that merging two unreadable files reports ``"both_invalid"``."""
    import tempfile

    from wikiq.resume import merge_parquet_files

    with tempfile.TemporaryDirectory() as workdir:
        original_path = os.path.join(workdir, "original.parquet")
        temp_path = os.path.join(workdir, "temp.parquet")
        merged_path = os.path.join(workdir, "merged.parquet")

        # Neither file holds parquet data: both are short text payloads.
        fixtures = (
            (original_path, "corrupted original"),
            (temp_path, "corrupted temp"),
        )
        for target, payload in fixtures:
            with open(target, "w") as handle:
                handle.write(payload)

        result = merge_parquet_files(original_path, temp_path, merged_path)
        assert result == "both_invalid", f"Expected 'both_invalid' when both files corrupted, got {result}"

    print("Resume merge with both invalid test passed!")
|
||||
|
||||
|
||||
def test_cleanup_interrupted_resume_both_corrupted():
    """Check that cleanup asks for a fresh start when nothing is recoverable.

    With a corrupted output file, a corrupted ``.resume_temp`` file and a
    checkpoint present, ``cleanup_interrupted_resume`` is expected to return
    ``"start_fresh"`` and to remove all three files.
    """
    import json
    import tempfile

    from wikiq.resume import cleanup_interrupted_resume, get_checkpoint_path

    with tempfile.TemporaryDirectory() as workdir:
        output_file = os.path.join(workdir, "output.parquet")
        temp_file = output_file + ".resume_temp"
        checkpoint_path = get_checkpoint_path(output_file, partition_namespaces=False)

        # Both data files contain plain text rather than parquet.
        for target, payload in (
            (output_file, "corrupted original"),
            (temp_file, "corrupted temp"),
        ):
            with open(target, "w") as handle:
                handle.write(payload)

        # A checkpoint that the cleanup is expected to delete.
        with open(checkpoint_path, "w") as checkpoint:
            json.dump({"pageid": 100, "revid": 200}, checkpoint)

        result = cleanup_interrupted_resume(output_file, partition_namespaces=False)
        assert result == "start_fresh", f"Expected 'start_fresh', got {result}"

        # Nothing from the interrupted run should be left behind.
        expected_gone = (
            (output_file, "Corrupted original should be deleted"),
            (temp_file, "Corrupted temp should be deleted"),
            (checkpoint_path, "Stale checkpoint should be deleted"),
        )
        for leftover, message in expected_gone:
            assert not os.path.exists(leftover), message

    print("Cleanup interrupted resume with both corrupted test passed!")
|
||||
|
||||
|
||||
def test_cleanup_interrupted_resume_original_corrupted_temp_valid():
    """Check that cleanup restores the output from a valid temp file.

    The output file is corrupted while the ``.resume_temp`` file is valid
    parquet. ``cleanup_interrupted_resume`` is expected to return ``None``,
    leave the temp file's rows at the output path, and let
    ``get_resume_point`` report the last (articleid, revid) pair.
    """
    import tempfile

    import pyarrow as pa
    import pyarrow.parquet as pq
    from wikiq.resume import cleanup_interrupted_resume, get_resume_point

    with tempfile.TemporaryDirectory() as workdir:
        output_file = os.path.join(workdir, "output.parquet")
        temp_file = output_file + ".resume_temp"

        # The output holds plain text, so it is not readable as parquet.
        with open(output_file, "w") as broken:
            broken.write("corrupted original")

        # The temp file is valid parquet with three rows.
        pq.write_table(
            pa.table({"articleid": [10, 20, 30], "revid": [100, 200, 300]}),
            temp_file,
        )

        # A normal recovery is signalled by None rather than "start_fresh".
        result = cleanup_interrupted_resume(output_file, partition_namespaces=False)
        assert result is None, f"Expected None (normal recovery), got {result}"

        # The temp file should have taken the output file's place.
        assert os.path.exists(output_file), "Output file should exist after recovery"
        assert not os.path.exists(temp_file), "Temp file should be renamed to output"

        # The recovered output must carry the rows written to the temp file.
        row_count = len(pq.read_table(output_file))
        assert row_count == 3, "Recovered file should have 3 rows"

        # The resume point is read back from the recovered output.
        resume_point = get_resume_point(output_file, partition_namespaces=False)
        assert resume_point is not None, "Should find resume point from recovered file"
        assert resume_point == (30, 300), f"Expected (30, 300), got {resume_point}"

    print("Cleanup with original corrupted, temp valid test passed!")
|
||||
|
||||
Reference in New Issue
Block a user