Fix .jsonl.d output: derive the JSONL filename from the input filename instead of a fixed, part-numbered "data.jsonl".
This commit is contained in:
@@ -527,6 +527,7 @@ class WikiqParser:
|
||||
headings: bool = False,
|
||||
time_limit_seconds: Union[float, None] = None,
|
||||
max_revisions_per_file: int = 0,
|
||||
input_filename: Union[str, None] = None,
|
||||
):
|
||||
"""
|
||||
Parameters:
|
||||
@@ -535,8 +536,10 @@ class WikiqParser:
|
||||
or a dict mapping namespace -> (pageid, revid) for partitioned output.
|
||||
For single-file: skip all revisions up to and including this point.
|
||||
max_revisions_per_file : if > 0, close and rotate output files after this many revisions
|
||||
input_filename : original input filename (needed for .jsonl.d output to derive output filename)
|
||||
"""
|
||||
self.input_file = input_file
|
||||
self.input_filename = input_filename
|
||||
|
||||
self.collapse_user: bool = collapse_user
|
||||
self.persist: int = persist
|
||||
@@ -845,13 +848,12 @@ class WikiqParser:
|
||||
if self.output_jsonl_dir:
|
||||
# Create directory for JSONL output
|
||||
Path(self.output_file).mkdir(parents=True, exist_ok=True)
|
||||
part_num = 0
|
||||
if self.resume_point is not None and len(self.resume_point) > 2:
|
||||
part_num = self.resume_point[2]
|
||||
part_numbers[None] = part_num
|
||||
jsonl_path = self._get_part_path(
|
||||
Path(self.output_file) / "data.jsonl", part_num
|
||||
)
|
||||
# Derive JSONL filename from input filename
|
||||
if self.input_filename:
|
||||
jsonl_basename = os.path.basename(get_output_filename(self.input_filename, 'jsonl'))
|
||||
else:
|
||||
jsonl_basename = "data.jsonl"
|
||||
jsonl_path = Path(self.output_file) / jsonl_basename
|
||||
writer = JSONLWriter(str(jsonl_path), schema, append=append_mode)
|
||||
else:
|
||||
writer = JSONLWriter(self.output_file, schema, append=append_mode)
|
||||
@@ -1527,6 +1529,7 @@ def main():
|
||||
headings=args.headings,
|
||||
time_limit_seconds=time_limit_seconds,
|
||||
max_revisions_per_file=args.max_revisions_per_file,
|
||||
input_filename=filename,
|
||||
)
|
||||
|
||||
# Register signal handlers for graceful shutdown (CLI only)
|
||||
|
||||
Reference in New Issue
Block a user