Support `.jsonl.d` output: write JSONL parts into a directory
This commit is contained in:
@@ -515,6 +515,7 @@ class WikiqParser:
|
||||
namespaces: Union[list[int], None] = None,
|
||||
revert_radius: int = 15,
|
||||
output_jsonl: bool = False,
|
||||
output_jsonl_dir: bool = False,
|
||||
output_parquet: bool = False,
|
||||
batch_size: int = 1024,
|
||||
resume_point: Union[tuple, dict, None] = None,
|
||||
@@ -569,6 +570,7 @@ class WikiqParser:
|
||||
# Initialize output
|
||||
self.batch_size = batch_size
|
||||
self.output_jsonl = output_jsonl
|
||||
self.output_jsonl_dir = output_jsonl_dir
|
||||
self.output_parquet = output_parquet
|
||||
self.output_file = output_file
|
||||
|
||||
@@ -836,7 +838,19 @@ class WikiqParser:
|
||||
part_numbers[ns] = 0
|
||||
elif self.output_jsonl:
|
||||
append_mode = self.resume_point is not None
|
||||
writer = JSONLWriter(self.output_file, schema, append=append_mode)
|
||||
if self.output_jsonl_dir:
|
||||
# Create directory for JSONL output
|
||||
Path(self.output_file).mkdir(parents=True, exist_ok=True)
|
||||
part_num = 0
|
||||
if self.resume_point is not None and len(self.resume_point) > 2:
|
||||
part_num = self.resume_point[2]
|
||||
part_numbers[None] = part_num
|
||||
jsonl_path = self._get_part_path(
|
||||
Path(self.output_file) / "data.jsonl", part_num
|
||||
)
|
||||
writer = JSONLWriter(str(jsonl_path), schema, append=append_mode)
|
||||
else:
|
||||
writer = JSONLWriter(self.output_file, schema, append=append_mode)
|
||||
else:
|
||||
writer = pacsv.CSVWriter(
|
||||
self.output_file,
|
||||
@@ -1437,7 +1451,8 @@ def main():
|
||||
output = "."
|
||||
|
||||
# Detect output format from extension
|
||||
output_jsonl = output.endswith(".jsonl") or output.endswith(".jsonl.d")
|
||||
output_jsonl_dir = output.endswith(".jsonl.d")
|
||||
output_jsonl = output.endswith(".jsonl") or output_jsonl_dir
|
||||
output_parquet = output.endswith(".parquet")
|
||||
partition_namespaces = args.partition_namespaces and output_parquet
|
||||
|
||||
@@ -1495,6 +1510,7 @@ def main():
|
||||
text=args.text,
|
||||
diff=args.diff,
|
||||
output_jsonl=output_jsonl,
|
||||
output_jsonl_dir=output_jsonl_dir,
|
||||
output_parquet=output_parquet,
|
||||
partition_namespaces=partition_namespaces,
|
||||
batch_size=args.batch_size,
|
||||
|
||||
@@ -43,8 +43,8 @@ class WikiqTester:
|
||||
shutil.rmtree(self.output)
|
||||
|
||||
# Also clean up resume-related files
|
||||
for suffix in [".resume_temp", ".checkpoint", ".merged"]:
|
||||
temp_path = self.output + suffix
|
||||
for temp_suffix in [".resume_temp", ".checkpoint", ".merged"]:
|
||||
temp_path = self.output + temp_suffix
|
||||
if os.path.exists(temp_path):
|
||||
if os.path.isfile(temp_path):
|
||||
os.remove(temp_path)
|
||||
|
||||
Reference in New Issue
Block a user