diff --git a/src/wikiq/resume.py b/src/wikiq/resume.py index 79cc212..b88d4e3 100644 --- a/src/wikiq/resume.py +++ b/src/wikiq/resume.py @@ -175,15 +175,32 @@ def get_jsonl_resume_point(output_file, input_file=None): return None try: - with open(output_file) as f: - # Stream through file, keeping only last 2 lines in memory - for line in reversed(deque(f, maxlen=2)): + # Track positions of last two valid lines for potential truncation + valid_lines = deque(maxlen=2) # (end_position, record) + with open(output_file, 'rb') as f: + while True: + line = f.readline() + if not line: + break try: - record = json.loads(line) - return (record['articleid'], record['revid']) - except (json.JSONDecodeError, KeyError): - continue - return None + record = json.loads(line.decode('utf-8')) + valid_lines.append((f.tell(), record)) + except (json.JSONDecodeError, KeyError, UnicodeDecodeError): + pass + + if not valid_lines: + return None + + last_valid_pos, last_valid_record = valid_lines[-1] + + # Truncate if file extends past last valid line (corrupted trailing data) + file_size = os.path.getsize(output_file) + if last_valid_pos < file_size: + print(f"Truncating corrupted data from {output_file} ({file_size - last_valid_pos} bytes)", file=sys.stderr) + with open(output_file, 'r+b') as f: + f.truncate(last_valid_pos) + + return (last_valid_record['articleid'], last_valid_record['revid']) except IOError as e: print(f"Warning: Could not read {output_file}: {e}", file=sys.stderr) return None diff --git a/test/test_resume.py b/test/test_resume.py index e5d5994..5dd8242 100644 --- a/test/test_resume.py +++ b/test/test_resume.py @@ -458,7 +458,10 @@ def test_resume_corrupted_jsonl_last_line(): """Test that JSONL resume correctly handles corrupted/incomplete last line. When the previous run was interrupted mid-write leaving an incomplete JSON - line, the resume should detect and remove the corrupted line before appending. + line, the resume should: + 1. Find the resume point from the last valid line (no checkpoint file needed) + 2. Truncate the corrupted trailing data + 3. Append new data, resulting in valid JSONL """ tester_full = WikiqTester(SAILORMOON, "resume_corrupt_full", in_compression="7z", out_format="jsonl") @@ -481,19 +484,20 @@ def test_resume_corrupted_jsonl_last_line(): # Write incomplete JSON (simulates crash mid-write) f.write('{"revid": 999, "articleid": 123, "incomplet') - # Write checkpoint pointing to a valid revision (last complete row) - checkpoint_path = get_checkpoint_path(corrupt_output_path) - with open(checkpoint_path, 'w') as f: - json.dump({"pageid": full_rows[resume_idx - 1]["articleid"], - "revid": full_rows[resume_idx - 1]["revid"]}, f) + # Record file size before resume + size_before = os.path.getsize(corrupt_output_path) - # Resume should detect and remove the corrupted line, then append new data + # NO checkpoint file - JSONL resume works from last valid line in the file + checkpoint_path = get_checkpoint_path(corrupt_output_path) + assert not os.path.exists(checkpoint_path), "Test setup error: checkpoint should not exist" + + # Resume should detect corrupted line, truncate it, then append new data try: tester_corrupt.call_wikiq("--fandom-2020", "--resume") except subprocess.CalledProcessError as exc: pytest.fail(f"Resume failed unexpectedly: {exc.stderr.decode('utf8')}") - # Verify the file is valid JSONL and readable + # Verify the file is valid JSONL and readable (no corrupted lines) resumed_rows = read_jsonl(corrupt_output_path) # Full data equivalence check