# cdsc_reddit/datasets/helper.py

from subprocess import Popen, PIPE
import re
from collections import defaultdict
from os import path
import glob

def find_dumps(dumpdir, base_pattern):
    """Yield one path per dump in *dumpdir*, choosing the preferred extension.

    Files matching ``base_pattern`` (a glob pattern) inside ``dumpdir`` are
    grouped by their path with the final extension stripped.  For each group
    the first extension from ``['.zst', '.xz', '.bz2', '.gz']`` that is
    present is chosen and ``base + extension`` is yielded.

    Groups that have none of those extensions are skipped rather than
    raising ``IndexError``, so a stray file matched by the pattern does not
    abort the whole scan.

    Args:
        dumpdir: Directory to search.
        base_pattern: Glob pattern, relative to ``dumpdir``.

    Yields:
        Path strings, one per dump that has a supported extension.
    """
    files = glob.glob(path.join(dumpdir, base_pattern))

    # build a dictionary of possible extensions for each dump
    dumpext = defaultdict(list)
    for fpath in files:
        fname, ext = path.splitext(fpath)
        dumpext[fname].append(ext)

    # Earlier entries win when the same dump exists in several formats.
    ext_priority = ['.zst', '.xz', '.bz2', '.gz']

    for base, exts in dumpext.items():
        best = next((ext for ext in ext_priority if ext in exts), None)
        if best is None:
            # No supported compression format for this base name.
            continue
        yield base + best

def open_fileset(files):
    """Stream the lines of every file in *files*, one file after another.

    Each path is printed to standard output before it is opened with
    ``open_input_file``; the lines of the resulting stream are then yielded
    in order before moving on to the next path.
    """
    for filename in files:
        print(filename)
        yield from open_input_file(filename)

def open_input_file(input_filename):
    """Open *input_filename* for reading, decompressing it if necessary.

    The decompressor is chosen from the filename suffix and run as a
    subprocess whose standard output is returned:

    * ``.7z``  -> ``7za x -so <file> *``
    * ``.gz``  -> ``zcat``
    * ``.bz2`` / ``.bz`` -> ``bzcat -dk``
    * ``.xz``  -> ``xzcat -dk -T 20``
    * ``.zst`` -> ``zstd -dck``

    Any other filename is opened directly in text mode.

    Suffixes are matched only at the end of the name, so a path that merely
    contains e.g. ``.gz`` somewhere in the middle is treated as a plain file.

    Args:
        input_filename: Path of the file to read.

    Returns:
        A readable file object: the subprocess's stdout pipe (which yields
        bytes) for compressed inputs, or a text-mode file for plain inputs.
    """
    # Ordered so that the more specific '.bz2' is checked before '.bz'.
    decompressors = (
        ('.7z', ["7za", "x", "-so", input_filename, '*']),
        ('.gz', ["zcat", input_filename]),
        ('.bz2', ["bzcat", "-dk", input_filename]),
        ('.bz', ["bzcat", "-dk", input_filename]),
        ('.xz', ["xzcat", '-dk', '-T 20', input_filename]),
        ('.zst', ['zstd', '-dck', input_filename]),
    )

    for suffix, cmd in decompressors:
        if input_filename.endswith(suffix):
            return Popen(cmd, stdout=PIPE).stdout

    # Not a recognized compressed format: read the file as-is.
    return open(input_filename, 'r')
Build comments dataset similarly to submissions and improve partitioning scheme 2020-07-07 18:45:43 +00:00			`from subprocess import Popen, PIPE`
			`import re`
			`from collections import defaultdict`
			`from os import path`
			`import glob`

			`def find_dumps(dumpdir, base_pattern):`

			`files = glob.glob(path.join(dumpdir,base_pattern))`

			`# build a dictionary of possible extensions for each dump`
			`dumpext = defaultdict(list)`
			`for fpath in files:`
			`fname, ext = path.splitext(fpath)`
			`dumpext[fname].append(ext)`

git-annex in nathante@mox2.hyak.local:/gscratch/comdata/users/nathante/cdsc-reddit 2020-11-12 00:39:44 +00:00			`ext_priority = ['.zst','.xz','.bz2','.gz']`
Build comments dataset similarly to submissions and improve partitioning scheme 2020-07-07 18:45:43 +00:00
			`for base, exts in dumpext.items():`
Bugfixes in scripts. 2020-07-08 06:29:36 +00:00			`ext = [ext for ext in ext_priority if ext in exts][0]`
			`yield base + ext`
Build comments dataset similarly to submissions and improve partitioning scheme 2020-07-07 18:45:43 +00:00
			`def open_fileset(files):`
			`for fh in files:`
			`print(fh)`
			`lines = open_input_file(fh)`
			`for line in lines:`
			`yield line`

			`def open_input_file(input_filename):`
			`if re.match(r'.*\.7z$', input_filename):`
			`cmd = ["7za", "x", "-so", input_filename, '*']`
			`elif re.match(r'.*\.gz$', input_filename):`
			`cmd = ["zcat", input_filename]`
			`elif re.match(r'.*\.bz2$', input_filename):`
			`cmd = ["bzcat", "-dk", input_filename]`
			`elif re.match(r'.*\.bz', input_filename):`
			`cmd = ["bzcat", "-dk", input_filename]`
			`elif re.match(r'.*\.xz', input_filename):`
			`cmd = ["xzcat",'-dk', '-T 20',input_filename]`
			`elif re.match(r'.*\.zst',input_filename):`
			`cmd = ['zstd','-dck', input_filename]`
Update reddit comments data with daily dumps. 2020-10-03 23:42:22 +00:00			`elif re.match(r'.*\.gz',input_filename):`
			`cmd = ['gzip','-dc', input_filename]`
Build comments dataset similarly to submissions and improve partitioning scheme 2020-07-07 18:45:43 +00:00			`try:`
			`input_file = Popen(cmd, stdout=PIPE).stdout`
			`except NameError as e:`
			`print(e)`
			`input_file = open(input_filename, 'r')`
			`return input_file`