datasets/helper.py: use zstandard library for .zst decompression
The Python environment runs inside a Singularity container that cannot exec the host's /usr/bin/zstd via subprocess. Replace the subprocess call with the zstandard Python library, which was already a dependency. Other formats (bz2, xz, gz) still use subprocess as before. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,9 @@ import re
|
||||
from collections import defaultdict
|
||||
from os import path
|
||||
import glob
|
||||
import io
|
||||
import zstandard
|
||||
|
||||
|
||||
def find_dumps(dumpdir, base_pattern):
|
||||
|
||||
@@ -28,24 +31,28 @@ def open_fileset(files):
|
||||
yield line
|
||||
|
||||
def open_input_file(input_filename):
    """Open a possibly-compressed input file and return a readable stream.

    The compression format is chosen from the file name:

    * ``.zst`` is decompressed in-process with the ``zstandard`` library,
      so no external ``zstd`` binary has to be executed.
    * ``.7z``, ``.bz2``, ``.bz``, ``.xz`` and ``.gz`` are decompressed by
      spawning the matching command-line tool and reading its stdout.
    * Anything else is opened directly as a text file.

    Args:
        input_filename: Path of the file to open.

    Returns:
        A file-like object. For ``.zst`` and uncompressed files this is a
        text stream; for the subprocess-backed formats it is the child
        process's stdout pipe, which yields bytes because ``Popen`` is not
        asked for text mode.
    """
    # .zst handled via the zstandard library to avoid subprocess/container issues
    if re.match(r'.*\.zst$', input_filename):
        dctx = zstandard.ZstdDecompressor()
        fh = open(input_filename, 'rb')
        try:
            return io.TextIOWrapper(dctx.stream_reader(fh), encoding='utf-8')
        except Exception:
            # Do not leak the raw file handle if the reader cannot be built.
            fh.close()
            raise

    if re.match(r'.*\.7z$', input_filename):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif re.match(r'.*\.bz2$', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    elif re.match(r'.*\.bz', input_filename):
        cmd = ["bzcat", "-dk", input_filename]
    elif re.match(r'.*\.xz', input_filename):
        cmd = ["xzcat", '-dk', '-T 20', input_filename]
    elif re.match(r'.*\.gz', input_filename):
        cmd = ["zcat", input_filename]
    else:
        return open(input_filename, 'r')

    try:
        return Popen(cmd, stdout=PIPE).stdout
    except NameError as e:
        # Popen/PIPE are expected to come from a subprocess import elsewhere
        # in the file; if they are missing, fall back to reading the file
        # as-is instead of failing.
        print(e)
        return open(input_filename, 'r')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user