From bf6ccbc84a6794802a443ac2978cb3abae5486d9 Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Mon, 25 May 2026 18:48:04 -0700 Subject: [PATCH] datasets/helper.py: use zstandard library for .zst decompression The Python environment runs inside a Singularity container that cannot exec the host's /usr/bin/zstd via subprocess. Replace the subprocess call with the zstandard Python library, which was already a dependency. Other formats (7z, bz2, bz, xz, gz) still use subprocess as before; the duplicate, unanchored .gz branch (gzip -dc) is removed, and files with an unrecognized extension are now opened directly instead of relying on the NameError fallback. Note that the .zst path now returns a UTF-8 text stream, while the subprocess paths still return the raw bytes pipe. Co-Authored-By: Claude Sonnet 4.6 --- datasets/helper.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/datasets/helper.py b/datasets/helper.py index 8f1dfe2..01467bf 100644 --- a/datasets/helper.py +++ b/datasets/helper.py @@ -3,6 +3,9 @@ import re from collections import defaultdict from os import path import glob +import io +import zstandard + def find_dumps(dumpdir, base_pattern): @@ -28,24 +31,28 @@ def open_fileset(files): yield line def open_input_file(input_filename): + # .zst handled via the zstandard library to avoid subprocess/container issues + if re.match(r'.*\.zst$', input_filename): + fh = open(input_filename, 'rb') + dctx = zstandard.ZstdDecompressor() + return io.TextIOWrapper(dctx.stream_reader(fh), encoding='utf-8') + if re.match(r'.*\.7z$', input_filename): - cmd = ["7za", "x", "-so", input_filename, '*'] - elif re.match(r'.*\.gz$', input_filename): - cmd = ["zcat", input_filename] + cmd = ["7za", "x", "-so", input_filename, '*'] elif re.match(r'.*\.bz2$', input_filename): - cmd = ["bzcat", "-dk", input_filename] + cmd = ["bzcat", "-dk", input_filename] elif re.match(r'.*\.bz', input_filename): - cmd = ["bzcat", "-dk", input_filename] + cmd = ["bzcat", "-dk", input_filename] elif re.match(r'.*\.xz', input_filename): - cmd = ["xzcat",'-dk', '-T 20',input_filename] - elif re.match(r'.*\.zst',input_filename): - cmd = ['zstd','-dck', input_filename] - elif re.match(r'.*\.gz',input_filename): - cmd = ['gzip','-dc', 
input_filename] + cmd = ["xzcat", '-dk', '-T 20', input_filename] + elif re.match(r'.*\.gz', input_filename): + cmd = ["zcat", input_filename] + else: + return open(input_filename, 'r') + try: - input_file = Popen(cmd, stdout=PIPE).stdout + return Popen(cmd, stdout=PIPE).stdout except NameError as e: print(e) - input_file = open(input_filename, 'r') - return input_file + return open(input_filename, 'r')