Fix bug by truncating corrupted trailing JSONL lines.

This commit is contained in:
Nathan TeBlunthuis
2025-12-23 19:52:37 -08:00
parent 5ebdb26d82
commit 93f6ed0ff5
2 changed files with 37 additions and 16 deletions

View File

@@ -175,15 +175,32 @@ def get_jsonl_resume_point(output_file, input_file=None):
return None
try:
with open(output_file) as f:
# Stream through file, keeping only last 2 lines in memory
for line in reversed(deque(f, maxlen=2)):
# Track positions of last two valid lines for potential truncation
valid_lines = deque(maxlen=2) # (end_position, record)
with open(output_file, 'rb') as f:
while True:
line = f.readline()
if not line:
break
try:
record = json.loads(line)
return (record['articleid'], record['revid'])
except (json.JSONDecodeError, KeyError):
continue
return None
record = json.loads(line.decode('utf-8'))
valid_lines.append((f.tell(), record))
except (json.JSONDecodeError, KeyError, UnicodeDecodeError):
pass
if not valid_lines:
return None
last_valid_pos, last_valid_record = valid_lines[-1]
# Truncate if file extends past last valid line (corrupted trailing data)
file_size = os.path.getsize(output_file)
if last_valid_pos < file_size:
print(f"Truncating corrupted data from {output_file} ({file_size - last_valid_pos} bytes)", file=sys.stderr)
with open(output_file, 'r+b') as f:
f.truncate(last_valid_pos)
return (last_valid_record['articleid'], last_valid_record['revid'])
except IOError as e:
print(f"Warning: Could not read {output_file}: {e}", file=sys.stderr)
return None

View File

@@ -458,7 +458,10 @@ def test_resume_corrupted_jsonl_last_line():
"""Test that JSONL resume correctly handles corrupted/incomplete last line.
When the previous run was interrupted mid-write leaving an incomplete JSON
line, the resume should detect and remove the corrupted line before appending.
line, the resume should:
1. Find the resume point from the last valid line (no checkpoint file needed)
2. Truncate the corrupted trailing data
3. Append new data, resulting in valid JSONL
"""
tester_full = WikiqTester(SAILORMOON, "resume_corrupt_full", in_compression="7z", out_format="jsonl")
@@ -481,19 +484,20 @@ def test_resume_corrupted_jsonl_last_line():
# Write incomplete JSON (simulates crash mid-write)
f.write('{"revid": 999, "articleid": 123, "incomplet')
# Write checkpoint pointing to a valid revision (last complete row)
checkpoint_path = get_checkpoint_path(corrupt_output_path)
with open(checkpoint_path, 'w') as f:
json.dump({"pageid": full_rows[resume_idx - 1]["articleid"],
"revid": full_rows[resume_idx - 1]["revid"]}, f)
# Record file size before resume
size_before = os.path.getsize(corrupt_output_path)
# Resume should detect and remove the corrupted line, then append new data
# NO checkpoint file - JSONL resume works from last valid line in the file
checkpoint_path = get_checkpoint_path(corrupt_output_path)
assert not os.path.exists(checkpoint_path), "Test setup error: checkpoint should not exist"
# Resume should detect corrupted line, truncate it, then append new data
try:
tester_corrupt.call_wikiq("--fandom-2020", "--resume")
except subprocess.CalledProcessError as exc:
pytest.fail(f"Resume failed unexpectedly: {exc.stderr.decode('utf8')}")
# Verify the file is valid JSONL and readable
# Verify the file is valid JSONL and readable (no corrupted lines)
resumed_rows = read_jsonl(corrupt_output_path)
# Full data equivalence check