Fix bug by truncating corrupted JSONL lines.

This commit is contained in:
Nathan TeBlunthuis
2025-12-23 19:52:37 -08:00
parent 5ebdb26d82
commit 93f6ed0ff5
2 changed files with 37 additions and 16 deletions

View File

@@ -458,7 +458,10 @@ def test_resume_corrupted_jsonl_last_line():
"""Test that JSONL resume correctly handles corrupted/incomplete last line.
When the previous run was interrupted mid-write leaving an incomplete JSON
line, the resume should detect and remove the corrupted line before appending.
line, the resume should:
1. Find the resume point from the last valid line (no checkpoint file needed)
2. Truncate the corrupted trailing data
3. Append new data, resulting in valid JSONL
"""
tester_full = WikiqTester(SAILORMOON, "resume_corrupt_full", in_compression="7z", out_format="jsonl")
@@ -481,19 +484,20 @@ def test_resume_corrupted_jsonl_last_line():
# Write incomplete JSON (simulates crash mid-write)
f.write('{"revid": 999, "articleid": 123, "incomplet')
# Write checkpoint pointing to a valid revision (last complete row)
checkpoint_path = get_checkpoint_path(corrupt_output_path)
with open(checkpoint_path, 'w') as f:
json.dump({"pageid": full_rows[resume_idx - 1]["articleid"],
"revid": full_rows[resume_idx - 1]["revid"]}, f)
# Record file size before resume
size_before = os.path.getsize(corrupt_output_path)
# Resume should detect and remove the corrupted line, then append new data
# NO checkpoint file - JSONL resume works from last valid line in the file
checkpoint_path = get_checkpoint_path(corrupt_output_path)
assert not os.path.exists(checkpoint_path), "Test setup error: checkpoint should not exist"
# Resume should detect corrupted line, truncate it, then append new data
try:
tester_corrupt.call_wikiq("--fandom-2020", "--resume")
except subprocess.CalledProcessError as exc:
pytest.fail(f"Resume failed unexpectedly: {exc.stderr.decode('utf8')}")
# Verify the file is valid JSONL and readable
# Verify the file is valid JSONL and readable (no corrupted lines)
resumed_rows = read_jsonl(corrupt_output_path)
# Full data equivalence check