Make resume with JSONL output fault-tolerant.

This commit is contained in:
Nathan TeBlunthuis
2025-12-23 09:09:51 -08:00
parent 9e6b0fb64c
commit 5ebdb26d82
2 changed files with 55 additions and 10 deletions

View File

@@ -624,8 +624,11 @@ class WikiqParser:
return path.parent / f"{path.stem}.part{part_num}{path.suffix}" return path.parent / f"{path.stem}.part{part_num}{path.suffix}"
def _open_checkpoint(self, output_file): def _open_checkpoint(self, output_file):
"""Enable checkpointing for the given output file.""" """Enable checkpointing for Parquet output only.
if (not self.output_jsonl and not self.output_parquet) or output_file == sys.stdout.buffer:
JSONL doesn't need checkpoint files - resume point is derived from last line.
"""
if not self.output_parquet or output_file == sys.stdout.buffer:
return return
self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces) self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces)
Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True) Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
@@ -1481,7 +1484,8 @@ def main():
else: else:
resume_point = get_resume_point(output_file, partition_namespaces) resume_point = get_resume_point(output_file, partition_namespaces)
else: else:
resume_point = read_checkpoint(get_checkpoint_path(output_file)) # JSONL: get resume point from last line of file (no checkpoint)
resume_point = get_resume_point(output_file, input_file=filename)
if resume_point is not None: if resume_point is not None:
if isinstance(resume_point, dict): if isinstance(resume_point, dict):
print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr) print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr)

View File

@@ -10,6 +10,7 @@ This module handles:
import json import json
import os import os
import sys import sys
from collections import deque
import pyarrow.parquet as pq import pyarrow.parquet as pq
@@ -153,31 +154,71 @@ def cleanup_interrupted_resume(output_file, partition_namespaces):
os.remove(temp_output_file) os.remove(temp_output_file)
def get_jsonl_resume_point(output_file, input_file=None):
    """Return the resume point recorded by the last complete line of a JSONL file.

    The last two lines of the file are examined, newest first. The first one
    that parses as a JSON object containing both ``articleid`` and ``revid``
    wins; this lets a final line truncated by an interrupted write be skipped
    in favour of the complete line before it.

    For ``.jsonl.d`` directories, the file path inside the directory is
    derived from ``input_file`` using ``get_output_filename``.

    Args:
        output_file: Path to a ``.jsonl`` file or a ``.jsonl.d`` directory.
        input_file: Path to the input file; required for ``.jsonl.d`` output.

    Returns:
        A tuple ``(articleid, revid)``, or None when the output does not
        exist, cannot be read, or neither of its last two lines is a usable
        record.
    """
    # Handle .jsonl.d directory output: the real file lives inside the directory.
    if output_file.endswith('.jsonl.d'):
        if input_file is None or not os.path.isdir(output_file):
            return None
        # Import here to avoid circular import
        from wikiq import get_output_filename
        jsonl_filename = os.path.basename(get_output_filename(input_file, 'jsonl'))
        output_file = os.path.join(output_file, jsonl_filename)

    if not os.path.exists(output_file):
        return None

    try:
        # Decode as UTF-8 regardless of locale (assumes the writer emits
        # UTF-8/ASCII JSON - TODO confirm) and replace undecodable bytes so
        # that a write interrupted in the middle of a multi-byte character
        # does not raise UnicodeDecodeError; the damaged line then simply
        # fails JSON parsing below and is skipped.
        with open(output_file, encoding='utf-8', errors='replace') as f:
            # Stream through file, keeping only last 2 lines in memory
            tail = deque(f, maxlen=2)
    except OSError as e:
        print(f"Warning: Could not read {output_file}: {e}", file=sys.stderr)
        return None

    for line in reversed(tail):
        try:
            record = json.loads(line)
            return (record['articleid'], record['revid'])
        except (json.JSONDecodeError, KeyError, TypeError):
            # JSONDecodeError: truncated or blank line.
            # KeyError: an object missing one of the required fields.
            # TypeError: valid JSON that is not an object (list, number, null).
            continue
    return None
def get_resume_point(output_file, partition_namespaces=False, input_file=None):
""" """
Find the resume point(s) from existing output. Find the resume point(s) from existing output.
First checks for a checkpoint file (fast), then falls back to scanning For JSONL: reads last line of file (no checkpoint needed).
the parquet output (slow, for backwards compatibility). For Parquet: checks checkpoint file, falls back to scanning parquet.
Args: Args:
output_file: Path to the output file. output_file: Path to the output file.
partition_namespaces: Whether the output uses namespace partitioning. partition_namespaces: Whether the output uses namespace partitioning.
input_file: Path to input file (needed for .jsonl.d directory output).
Returns: Returns:
For single files: A tuple (pageid, revid) or (pageid, revid, part), or None. For single files: A tuple (pageid, revid) or (pageid, revid, part), or None.
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None. For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
""" """
# For JSONL, read resume point directly from last line (no checkpoint needed)
if output_file.endswith('.jsonl') or output_file.endswith('.jsonl.d'):
result = get_jsonl_resume_point(output_file, input_file)
if result:
print(f"Resume point found from JSONL: pageid={result[0]}, revid={result[1]}", file=sys.stderr)
return result
# For Parquet, use checkpoint file (fast)
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces) checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
checkpoint_result = read_checkpoint(checkpoint_path, partition_namespaces) checkpoint_result = read_checkpoint(checkpoint_path, partition_namespaces)
if checkpoint_result is not None: if checkpoint_result is not None:
print(f"Resume point found in checkpoint file {checkpoint_path}", file=sys.stderr) print(f"Resume point found in checkpoint file {checkpoint_path}", file=sys.stderr)
return checkpoint_result return checkpoint_result
# For JSONL, only checkpoint-based resume is supported
if output_file.endswith('.jsonl'):
return None
# Fall back to scanning parquet (slow, for backwards compatibility) # Fall back to scanning parquet (slow, for backwards compatibility)
print(f"No checkpoint file found at {checkpoint_path}, scanning parquet output...", file=sys.stderr) print(f"No checkpoint file found at {checkpoint_path}, scanning parquet output...", file=sys.stderr)
try: try: