refactor and enable jsonl output.

This commit is contained in:
Nathan TeBlunthuis
2025-12-21 23:42:18 -08:00
parent 6988a281dc
commit 3f1a9ba862
7 changed files with 1429 additions and 1242 deletions

View File

@@ -42,8 +42,20 @@ class WikiqTester:
else:
shutil.rmtree(self.output)
if out_format == "parquet":
os.makedirs(self.output, exist_ok=True)
# Also clean up resume-related files
for suffix in [".resume_temp", ".checkpoint", ".merged"]:
temp_path = self.output + suffix
if os.path.exists(temp_path):
if os.path.isfile(temp_path):
os.remove(temp_path)
else:
shutil.rmtree(temp_path)
# For JSONL and Parquet, self.output is a file path. Create parent directory if needed.
if out_format in ("jsonl", "parquet"):
parent_dir = os.path.dirname(self.output)
if parent_dir:
os.makedirs(parent_dir, exist_ok=True)
if suffix is None:
self.wikiq_baseline_name = "{0}.{1}".format(wiki, baseline_format)