Fix .jsonl.d output: derive the JSONL filename from the input filename.

This commit is contained in:
Nathan TeBlunthuis
2025-12-30 11:26:24 -08:00
parent 93f6ed0ff5
commit 8590e5f920
2 changed files with 92 additions and 7 deletions

View File

@@ -527,6 +527,7 @@ class WikiqParser:
headings: bool = False,
time_limit_seconds: Union[float, None] = None,
max_revisions_per_file: int = 0,
input_filename: Union[str, None] = None,
):
"""
Parameters:
@@ -535,8 +536,10 @@ class WikiqParser:
or a dict mapping namespace -> (pageid, revid) for partitioned output.
For single-file: skip all revisions up to and including this point.
max_revisions_per_file : if > 0, close and rotate output files after this many revisions
input_filename : original input filename (needed for .jsonl.d output to derive output filename)
"""
self.input_file = input_file
self.input_filename = input_filename
self.collapse_user: bool = collapse_user
self.persist: int = persist
@@ -845,13 +848,12 @@ class WikiqParser:
if self.output_jsonl_dir:
# Create directory for JSONL output
Path(self.output_file).mkdir(parents=True, exist_ok=True)
part_num = 0
if self.resume_point is not None and len(self.resume_point) > 2:
part_num = self.resume_point[2]
part_numbers[None] = part_num
jsonl_path = self._get_part_path(
Path(self.output_file) / "data.jsonl", part_num
)
# Derive JSONL filename from input filename
if self.input_filename:
jsonl_basename = os.path.basename(get_output_filename(self.input_filename, 'jsonl'))
else:
jsonl_basename = "data.jsonl"
jsonl_path = Path(self.output_file) / jsonl_basename
writer = JSONLWriter(str(jsonl_path), schema, append=append_mode)
else:
writer = JSONLWriter(self.output_file, schema, append=append_mode)
@@ -1527,6 +1529,7 @@ def main():
headings=args.headings,
time_limit_seconds=time_limit_seconds,
max_revisions_per_file=args.max_revisions_per_file,
input_filename=filename,
)
# Register signal handlers for graceful shutdown (CLI only)