18
0
Files

59 lines
1.7 KiB
Python
Raw Permalink Normal View History

from subprocess import Popen, PIPE
import re
from collections import defaultdict
from os import path
import glob
import io
import zstandard
def find_dumps(dumpdir, base_pattern):
    """Yield one path per dump found in ``dumpdir``, preferring the best format.

    Files matching ``base_pattern`` inside ``dumpdir`` are grouped by their
    path with the final extension removed. For every group the extension that
    comes first in the priority list (``.zst``, ``.xz``, ``.bz2``, ``.gz``) is
    chosen and the corresponding full path is yielded.

    Args:
        dumpdir: Directory to search.
        base_pattern: Glob pattern, relative to ``dumpdir``.

    Yields:
        The path of the preferred file for each dump. Groups that have no
        file with one of the prioritized extensions are skipped (previously
        they raised ``IndexError`` in the middle of iteration).
    """
    files = glob.glob(path.join(dumpdir, base_pattern))

    # build a dictionary of possible extensions for each dump
    dumpext = defaultdict(list)
    for fpath in files:
        fname, ext = path.splitext(fpath)
        dumpext[fname].append(ext)

    ext_priority = ['.zst', '.xz', '.bz2', '.gz']
    for base, exts in dumpext.items():
        # First prioritized extension that exists for this dump, if any.
        best = next((ext for ext in ext_priority if ext in exts), None)
        if best is None:
            # Nothing in a recognized format for this base name.
            continue
        yield base + best
def open_fileset(files):
    """Yield every line from each file in ``files``, one file after another.

    Each filename is printed to stdout before it is read, then opened with
    ``open_input_file``. The handle is closed once its lines are exhausted,
    and also when the generator is closed or an error interrupts iteration,
    so handles no longer accumulate across a long list of files.

    Args:
        files: Iterable of input file paths.

    Yields:
        Lines exactly as produced by the handle ``open_input_file`` returns.
    """
    for filename in files:
        print(filename)
        lines = open_input_file(filename)
        try:
            yield from lines
        finally:
            # Release the file or pipe even if the consumer stops early.
            lines.close()
def open_input_file(input_filename):
    """Open ``input_filename`` for reading, decompressing based on its suffix.

    The suffix is matched at the END of the name only; previously the
    ``.bz``, ``.xz`` and ``.gz`` checks were unanchored and also fired on
    names such as ``data.gz.txt``.

    Args:
        input_filename: Path of the file to open.

    Returns:
        A readable file-like object:
        * ``.zst``: an ``io.TextIOWrapper`` (UTF-8) over a zstandard stream
          reader, yielding ``str``.
        * ``.7z``, ``.bz2``, ``.bz``, ``.xz``, ``.gz``: the stdout pipe of an
          external decompressor started with ``Popen``. NOTE: this pipe is
          opened in binary mode, so it yields ``bytes`` rather than ``str``.
        * anything else: the file opened in text mode.

    Raises:
        OSError: if the file cannot be opened or the external decompressor
            cannot be started (e.g. the program is not installed).
    """
    # .zst handled via the zstandard library to avoid subprocess/container issues
    if input_filename.endswith('.zst'):
        fh = open(input_filename, 'rb')
        dctx = zstandard.ZstdDecompressor()
        return io.TextIOWrapper(dctx.stream_reader(fh), encoding='utf-8')

    if input_filename.endswith('.7z'):
        cmd = ["7za", "x", "-so", input_filename, '*']
    elif input_filename.endswith(('.bz2', '.bz')):
        cmd = ["bzcat", "-dk", input_filename]
    elif input_filename.endswith('.xz'):
        cmd = ["xzcat", '-dk', '-T 20', input_filename]
    elif input_filename.endswith('.gz'):
        cmd = ["zcat", input_filename]
    else:
        return open(input_filename, 'r')

    # The former ``except NameError`` fallback was unreachable: ``cmd`` is
    # always bound here, and a missing program raises OSError, which was never
    # caught. Errors from Popen therefore propagate exactly as before.
    return Popen(cmd, stdout=PIPE).stdout