"""Helpers for locating dump files and streaming their (decompressed) lines."""
from subprocess import Popen, PIPE
import re
from collections import defaultdict
from os import path
import glob
import io

try:
    import zstandard
except ImportError:  # only required when a .zst file is actually opened
    zstandard = None

# Preferred compression formats, most preferred first, used when the same
# dump exists in several formats.
_EXT_PRIORITY = ('.zst', '.xz', '.bz2', '.gz')


def find_dumps(dumpdir, base_pattern):
    """Yield one path per dump matched by ``base_pattern`` inside ``dumpdir``.

    Files that differ only in their final extension are treated as the same
    dump.  For each dump the variant whose extension comes first in
    ``_EXT_PRIORITY`` is yielded.  If a dump has none of those extensions
    (for example only ``.7z`` or an uncompressed file), the alphabetically
    smallest available extension is used instead of raising ``IndexError``.

    Args:
        dumpdir: Directory to search.
        base_pattern: Glob pattern, relative to ``dumpdir``.

    Yields:
        Full path of the selected file for each distinct dump.
    """
    files = glob.glob(path.join(dumpdir, base_pattern))

    # Map each extension-less path to the extensions found for it.
    dumpext = defaultdict(list)
    for fpath in files:
        fname, ext = path.splitext(fpath)
        dumpext[fname].append(ext)

    for base, exts in dumpext.items():
        preferred = next((ext for ext in _EXT_PRIORITY if ext in exts), None)
        if preferred is None:
            # No preferred format available; pick deterministically.
            preferred = min(exts)
        yield base + preferred


def open_fileset(files):
    """Yield every line of every file in ``files``, in order.

    Each path is printed before it is read, then opened with
    ``open_input_file``.  The stream is closed once its lines are exhausted
    (or the generator is closed early) so handles and pipes do not pile up
    while walking a long list of files.

    Args:
        files: Iterable of file paths.

    Yields:
        Lines exactly as produced by ``open_input_file`` for each file.
    """
    for fh in files:
        print(fh)
        lines = open_input_file(fh)
        try:
            yield from lines
        finally:
            lines.close()


def open_input_file(input_filename):
    """Open ``input_filename`` for line-wise reading, decompressing if needed.

    The format is chosen from the file name suffix:

    * ``.zst`` is decoded in-process with the ``zstandard`` library and
      returned as a UTF-8 text stream.
    * ``.7z``, ``.bz2``, ``.bz``, ``.xz`` and ``.gz`` are piped through the
      matching command-line tool; the returned object is the subprocess's
      stdout pipe.
    * Anything else is opened as a plain text file.

    NOTE: the subprocess-backed streams are binary pipes (they yield
    ``bytes`` lines), while the ``.zst`` and plain-file streams yield ``str``.

    Args:
        input_filename: Path of the file to open.

    Returns:
        A readable, iterable, closable file-like object.

    Raises:
        ImportError: A ``.zst`` file was requested but ``zstandard`` is not
            installed.
        OSError: The file cannot be opened or the external decompressor
            cannot be started.
    """
    # .zst handled via the zstandard library to avoid subprocess/container issues
    if input_filename.endswith('.zst'):
        if zstandard is None:
            raise ImportError(
                "the 'zstandard' package is required to read .zst files")
        dctx = zstandard.ZstdDecompressor()
        fh = open(input_filename, 'rb')
        return io.TextIOWrapper(dctx.stream_reader(fh), encoding='utf-8')

    # Suffixes are tested with endswith so that a name which merely contains
    # e.g. ".gz" somewhere in the middle is not sent through a decompressor.
    if input_filename.endswith('.7z'):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif input_filename.endswith(('.bz2', '.bz')):
        cmd = ["bzcat", "-dk", input_filename]
    elif input_filename.endswith('.xz'):
        cmd = ["xzcat", '-dk', '-T 20', input_filename]
    elif input_filename.endswith('.gz'):
        cmd = ["zcat", input_filename]
    else:
        return open(input_filename, 'r')

    # A failure to launch the tool (e.g. binary not installed) propagates to
    # the caller; reading the compressed bytes as text would only yield junk.
    return Popen(cmd, stdout=PIPE).stdout