Make resume with JSONL output fault-tolerant.

This commit is contained in:
Nathan TeBlunthuis
2025-12-23 09:09:51 -08:00
parent 9e6b0fb64c
commit 5ebdb26d82
2 changed files with 55 additions and 10 deletions

View File

@@ -624,8 +624,11 @@ class WikiqParser:
return path.parent / f"{path.stem}.part{part_num}{path.suffix}"
def _open_checkpoint(self, output_file):
"""Enable checkpointing for the given output file."""
if (not self.output_jsonl and not self.output_parquet) or output_file == sys.stdout.buffer:
"""Enable checkpointing for Parquet output only.
JSONL doesn't need checkpoint files - resume point is derived from last line.
"""
if not self.output_parquet or output_file == sys.stdout.buffer:
return
self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces)
Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
@@ -1481,7 +1484,8 @@ def main():
else:
resume_point = get_resume_point(output_file, partition_namespaces)
else:
resume_point = read_checkpoint(get_checkpoint_path(output_file))
# JSONL: get resume point from last line of file (no checkpoint)
resume_point = get_resume_point(output_file, input_file=filename)
if resume_point is not None:
if isinstance(resume_point, dict):
print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr)