Make resume with JSONL output fault-tolerant.

This commit is contained in:
Nathan TeBlunthuis
2025-12-23 09:09:51 -08:00
parent 9e6b0fb64c
commit 5ebdb26d82
2 changed files with 55 additions and 10 deletions

View File

@@ -10,6 +10,7 @@ This module handles:
import json
import os
import sys
from collections import deque
import pyarrow.parquet as pq
@@ -153,31 +154,71 @@ def cleanup_interrupted_resume(output_file, partition_namespaces):
os.remove(temp_output_file)
def get_resume_point(output_file, partition_namespaces=False):
def get_jsonl_resume_point(output_file, input_file=None):
    """Get the resume point from the last complete line of a JSONL file.

    For ``.jsonl.d`` directory output, the JSONL file inside the directory is
    derived from ``input_file`` using ``get_output_filename``.

    Only the last two lines of the file are inspected, newest first. The
    final line may be a partial record left behind by an interrupted write,
    in which case the line before it is used instead.

    Args:
        output_file: Path to a ``.jsonl`` file or a ``.jsonl.d`` directory.
        input_file: Path to the input file; required to locate the JSONL file
            when ``output_file`` is a ``.jsonl.d`` directory.

    Returns:
        A tuple ``(articleid, revid)`` taken from the last parseable record,
        or ``None`` if the output does not exist, cannot be read, or neither
        of the last two lines holds a usable record.
    """
    # Handle .jsonl.d directory output
    if output_file.endswith('.jsonl.d'):
        if input_file is None or not os.path.isdir(output_file):
            return None
        # Import here to avoid circular import
        from wikiq import get_output_filename
        jsonl_filename = os.path.basename(get_output_filename(input_file, 'jsonl'))
        output_file = os.path.join(output_file, jsonl_filename)

    if not os.path.exists(output_file):
        return None

    try:
        # Read raw bytes: a write interrupted in the middle of a multi-byte
        # character would make text-mode iteration raise UnicodeDecodeError
        # before any line could be examined. Decoding is deferred to
        # json.loads so a damaged line only invalidates itself.
        with open(output_file, 'rb') as f:
            # Stream through file, keeping only last 2 lines in memory
            tail = deque(f, maxlen=2)
    except OSError as e:
        print(f"Warning: Could not read {output_file}: {e}", file=sys.stderr)
        return None

    for line in reversed(tail):
        try:
            record = json.loads(line)
            return (record['articleid'], record['revid'])
        except (ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError
            # (truncated or corrupt line); KeyError means a required field is
            # missing; TypeError means the line is valid JSON but not an object.
            continue
    return None
def get_resume_point(output_file, partition_namespaces=False, input_file=None):
"""
Find the resume point(s) from existing output.
First checks for a checkpoint file (fast), then falls back to scanning
the parquet output (slow, for backwards compatibility).
For JSONL: reads last line of file (no checkpoint needed).
For Parquet: checks checkpoint file, falls back to scanning parquet.
Args:
output_file: Path to the output file.
partition_namespaces: Whether the output uses namespace partitioning.
input_file: Path to input file (needed for .jsonl.d directory output).
Returns:
For single files: A tuple (pageid, revid) or (pageid, revid, part), or None.
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
"""
# For JSONL, read resume point directly from last line (no checkpoint needed)
if output_file.endswith('.jsonl') or output_file.endswith('.jsonl.d'):
result = get_jsonl_resume_point(output_file, input_file)
if result:
print(f"Resume point found from JSONL: pageid={result[0]}, revid={result[1]}", file=sys.stderr)
return result
# For Parquet, use checkpoint file (fast)
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
checkpoint_result = read_checkpoint(checkpoint_path, partition_namespaces)
if checkpoint_result is not None:
print(f"Resume point found in checkpoint file {checkpoint_path}", file=sys.stderr)
return checkpoint_result
# For JSONL, only checkpoint-based resume is supported
if output_file.endswith('.jsonl'):
return None
# Fall back to scanning parquet (slow, for backwards compatibility)
print(f"No checkpoint file found at {checkpoint_path}, scanning parquet output...", file=sys.stderr)
try: