Add logic for resuming after a previous resume.

This commit is contained in:
Nathan TeBlunthuis
2025-12-10 19:26:54 -08:00
parent d1fc094c96
commit f427291fd8
3 changed files with 132 additions and 16 deletions

View File

@@ -1043,3 +1043,43 @@ def test_resume_simple():
assert_frame_equal(resumed_df, full_df, check_like=True, check_dtype=False)
print(f"Resume simple test passed! Original: {len(full_df)} rows, Resumed: {len(resumed_df)} rows")
def test_resume_merge_with_invalid_temp_file():
    """Test that resume handles invalid/empty temp files gracefully.

    This can happen when a namespace has no records after the resume point,
    resulting in a temp file that was created but never written to.

    The test writes a three-row parquet file, pairs it with a zero-byte
    "temp" file, and checks that ``merge_parquet_files`` returns ``None``,
    leaves the original file in place with its three rows, and does not
    create the merged output file.
    """
    import tempfile

    import pyarrow as pa
    import pyarrow.parquet as pq

    from wikiq.resume import merge_parquet_files

    with tempfile.TemporaryDirectory() as tmpdir:
        original_path = os.path.join(tmpdir, "original.parquet")
        temp_path = os.path.join(tmpdir, "temp.parquet")
        merged_path = os.path.join(tmpdir, "merged.parquet")

        # Create a valid original file
        table = pa.table({"articleid": [1, 2, 3], "revid": [10, 20, 30]})
        pq.write_table(table, original_path)

        # Create an invalid temp file (zero bytes, so not valid parquet)
        with open(temp_path, "w"):
            pass

        # merge_parquet_files should return None for invalid temp file
        result = merge_parquet_files(original_path, temp_path, merged_path)
        assert result is None, "Expected None when temp file is invalid"

        # Original file should still exist and be unchanged
        assert os.path.exists(original_path), "Original file should still exist"
        original_table = pq.read_table(original_path)
        assert len(original_table) == 3, "Original file should be unchanged"

        # Merged file should not have been created
        assert not os.path.exists(merged_path), "Merged file should not be created"

    print("Resume merge with invalid temp file test passed!")