From d822085698b4eed7893009d1693a3f6b821e4f02 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Mon, 22 Dec 2025 20:13:04 -0800 Subject: [PATCH] support .jsonl.d --- src/wikiq/__init__.py | 20 ++++++++++++++++++-- test/wikiq_test_utils.py | 4 ++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/wikiq/__init__.py b/src/wikiq/__init__.py index 929b571..6172fa4 100755 --- a/src/wikiq/__init__.py +++ b/src/wikiq/__init__.py @@ -515,6 +515,7 @@ class WikiqParser: namespaces: Union[list[int], None] = None, revert_radius: int = 15, output_jsonl: bool = False, + output_jsonl_dir: bool = False, output_parquet: bool = False, batch_size: int = 1024, resume_point: Union[tuple, dict, None] = None, @@ -569,6 +570,7 @@ class WikiqParser: # Initialize output self.batch_size = batch_size self.output_jsonl = output_jsonl + self.output_jsonl_dir = output_jsonl_dir self.output_parquet = output_parquet self.output_file = output_file @@ -836,7 +838,19 @@ class WikiqParser: part_numbers[ns] = 0 elif self.output_jsonl: append_mode = self.resume_point is not None - writer = JSONLWriter(self.output_file, schema, append=append_mode) + if self.output_jsonl_dir: + # Create directory for JSONL output + Path(self.output_file).mkdir(parents=True, exist_ok=True) + part_num = 0 + if self.resume_point is not None and len(self.resume_point) > 2: + part_num = self.resume_point[2] + part_numbers[None] = part_num + jsonl_path = self._get_part_path( + Path(self.output_file) / "data.jsonl", part_num + ) + writer = JSONLWriter(str(jsonl_path), schema, append=append_mode) + else: + writer = JSONLWriter(self.output_file, schema, append=append_mode) else: writer = pacsv.CSVWriter( self.output_file, @@ -1437,7 +1451,8 @@ def main(): output = "." # Detect output format from extension - output_jsonl = output.endswith(".jsonl") or output.endswith(".jsonl.d") + output_jsonl_dir = output.endswith(".jsonl.d") + output_jsonl = output.endswith(".jsonl") or output_jsonl_dir output_parquet = output.endswith(".parquet") partition_namespaces = args.partition_namespaces and output_parquet @@ -1495,6 +1510,7 @@ def main(): text=args.text, diff=args.diff, output_jsonl=output_jsonl, + output_jsonl_dir=output_jsonl_dir, output_parquet=output_parquet, partition_namespaces=partition_namespaces, batch_size=args.batch_size, diff --git a/test/wikiq_test_utils.py b/test/wikiq_test_utils.py index ea415ec..77b0caf 100644 --- a/test/wikiq_test_utils.py +++ b/test/wikiq_test_utils.py @@ -43,8 +43,8 @@ class WikiqTester: shutil.rmtree(self.output) # Also clean up resume-related files - for suffix in [".resume_temp", ".checkpoint", ".merged"]: - temp_path = self.output + suffix + for temp_suffix in [".resume_temp", ".checkpoint", ".merged"]: + temp_path = self.output + temp_suffix if os.path.exists(temp_path): if os.path.isfile(temp_path): os.remove(temp_path)