make resume with jsonl output fault tolerant.
This commit is contained in:
@@ -624,8 +624,11 @@ class WikiqParser:
|
|||||||
return path.parent / f"{path.stem}.part{part_num}{path.suffix}"
|
return path.parent / f"{path.stem}.part{part_num}{path.suffix}"
|
||||||
|
|
||||||
def _open_checkpoint(self, output_file):
|
def _open_checkpoint(self, output_file):
|
||||||
"""Enable checkpointing for the given output file."""
|
"""Enable checkpointing for Parquet output only.
|
||||||
if (not self.output_jsonl and not self.output_parquet) or output_file == sys.stdout.buffer:
|
|
||||||
|
JSONL doesn't need checkpoint files - resume point is derived from last line.
|
||||||
|
"""
|
||||||
|
if not self.output_parquet or output_file == sys.stdout.buffer:
|
||||||
return
|
return
|
||||||
self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces)
|
self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces)
|
||||||
Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
|
Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
@@ -1481,7 +1484,8 @@ def main():
|
|||||||
else:
|
else:
|
||||||
resume_point = get_resume_point(output_file, partition_namespaces)
|
resume_point = get_resume_point(output_file, partition_namespaces)
|
||||||
else:
|
else:
|
||||||
resume_point = read_checkpoint(get_checkpoint_path(output_file))
|
# JSONL: get resume point from last line of file (no checkpoint)
|
||||||
|
resume_point = get_resume_point(output_file, input_file=filename)
|
||||||
if resume_point is not None:
|
if resume_point is not None:
|
||||||
if isinstance(resume_point, dict):
|
if isinstance(resume_point, dict):
|
||||||
print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr)
|
print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr)
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ This module handles:
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from collections import deque
|
||||||
|
|
||||||
import pyarrow.parquet as pq
|
import pyarrow.parquet as pq
|
||||||
|
|
||||||
@@ -153,31 +154,71 @@ def cleanup_interrupted_resume(output_file, partition_namespaces):
|
|||||||
os.remove(temp_output_file)
|
os.remove(temp_output_file)
|
||||||
|
|
||||||
|
|
||||||
def get_resume_point(output_file, partition_namespaces=False):
|
def get_jsonl_resume_point(output_file, input_file=None):
|
||||||
|
"""Get resume point from last complete line of JSONL file.
|
||||||
|
|
||||||
|
For .jsonl.d directories, derives the file path from input_file using get_output_filename.
|
||||||
|
"""
|
||||||
|
# Handle .jsonl.d directory output
|
||||||
|
if output_file.endswith('.jsonl.d'):
|
||||||
|
if input_file is None:
|
||||||
|
return None
|
||||||
|
if os.path.isdir(output_file):
|
||||||
|
# Import here to avoid circular import
|
||||||
|
from wikiq import get_output_filename
|
||||||
|
jsonl_filename = os.path.basename(get_output_filename(input_file, 'jsonl'))
|
||||||
|
output_file = os.path.join(output_file, jsonl_filename)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not os.path.exists(output_file):
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(output_file) as f:
|
||||||
|
# Stream through file, keeping only last 2 lines in memory
|
||||||
|
for line in reversed(deque(f, maxlen=2)):
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
return (record['articleid'], record['revid'])
|
||||||
|
except (json.JSONDecodeError, KeyError):
|
||||||
|
continue
|
||||||
|
return None
|
||||||
|
except IOError as e:
|
||||||
|
print(f"Warning: Could not read {output_file}: {e}", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_resume_point(output_file, partition_namespaces=False, input_file=None):
|
||||||
"""
|
"""
|
||||||
Find the resume point(s) from existing output.
|
Find the resume point(s) from existing output.
|
||||||
|
|
||||||
First checks for a checkpoint file (fast), then falls back to scanning
|
For JSONL: reads last line of file (no checkpoint needed).
|
||||||
the parquet output (slow, for backwards compatibility).
|
For Parquet: checks checkpoint file, falls back to scanning parquet.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
output_file: Path to the output file.
|
output_file: Path to the output file.
|
||||||
partition_namespaces: Whether the output uses namespace partitioning.
|
partition_namespaces: Whether the output uses namespace partitioning.
|
||||||
|
input_file: Path to input file (needed for .jsonl.d directory output).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
For single files: A tuple (pageid, revid) or (pageid, revid, part), or None.
|
For single files: A tuple (pageid, revid) or (pageid, revid, part), or None.
|
||||||
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
|
For partitioned: A dict mapping namespace -> (pageid, revid, part), or None.
|
||||||
"""
|
"""
|
||||||
|
# For JSONL, read resume point directly from last line (no checkpoint needed)
|
||||||
|
if output_file.endswith('.jsonl') or output_file.endswith('.jsonl.d'):
|
||||||
|
result = get_jsonl_resume_point(output_file, input_file)
|
||||||
|
if result:
|
||||||
|
print(f"Resume point found from JSONL: pageid={result[0]}, revid={result[1]}", file=sys.stderr)
|
||||||
|
return result
|
||||||
|
|
||||||
|
# For Parquet, use checkpoint file (fast)
|
||||||
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
|
checkpoint_path = get_checkpoint_path(output_file, partition_namespaces)
|
||||||
checkpoint_result = read_checkpoint(checkpoint_path, partition_namespaces)
|
checkpoint_result = read_checkpoint(checkpoint_path, partition_namespaces)
|
||||||
if checkpoint_result is not None:
|
if checkpoint_result is not None:
|
||||||
print(f"Resume point found in checkpoint file {checkpoint_path}", file=sys.stderr)
|
print(f"Resume point found in checkpoint file {checkpoint_path}", file=sys.stderr)
|
||||||
return checkpoint_result
|
return checkpoint_result
|
||||||
|
|
||||||
# For JSONL, only checkpoint-based resume is supported
|
|
||||||
if output_file.endswith('.jsonl'):
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Fall back to scanning parquet (slow, for backwards compatibility)
|
# Fall back to scanning parquet (slow, for backwards compatibility)
|
||||||
print(f"No checkpoint file found at {checkpoint_path}, scanning parquet output...", file=sys.stderr)
|
print(f"No checkpoint file found at {checkpoint_path}, scanning parquet output...", file=sys.stderr)
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user