Make resume with JSONL output fault-tolerant.
This commit is contained in:
@@ -624,8 +624,11 @@ class WikiqParser:
|
||||
return path.parent / f"{path.stem}.part{part_num}{path.suffix}"
|
||||
|
||||
def _open_checkpoint(self, output_file):
|
||||
"""Enable checkpointing for the given output file."""
|
||||
if (not self.output_jsonl and not self.output_parquet) or output_file == sys.stdout.buffer:
|
||||
"""Enable checkpointing for Parquet output only.
|
||||
|
||||
JSONL doesn't need checkpoint files - resume point is derived from last line.
|
||||
"""
|
||||
if not self.output_parquet or output_file == sys.stdout.buffer:
|
||||
return
|
||||
self.checkpoint_path = get_checkpoint_path(output_file, self.partition_namespaces)
|
||||
Path(self.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
@@ -1481,7 +1484,8 @@ def main():
|
||||
else:
|
||||
resume_point = get_resume_point(output_file, partition_namespaces)
|
||||
else:
|
||||
resume_point = read_checkpoint(get_checkpoint_path(output_file))
|
||||
# JSONL: get resume point from last line of file (no checkpoint)
|
||||
resume_point = get_resume_point(output_file, input_file=filename)
|
||||
if resume_point is not None:
|
||||
if isinstance(resume_point, dict):
|
||||
print(f"Resuming from checkpoint for {len(resume_point)} namespaces", file=sys.stderr)
|
||||
|
||||
Reference in New Issue
Block a user