#!/usr/bin/env python3
"""Part 2 for submissions: Spark sort + repartition into the final datasets.

Must be launched from a login node via the Hyak-provided wrapper:

    start_spark_and_run.sh 1 submissions_part2.py
    start_spark_and_run.sh 1 submissions_part2.py --indir=/path/to/parquets

--indir defaults to the temp submissions dir in dumps_helper.py.
--out_by_subreddit and --out_by_author default to the live dataset paths;
override them to write to staging directories first (see add_months.sh).

NOTE(review): --indir, --out_by_subreddit and --out_by_author are the only
flags this entry point defines. There is no --mode flag here; if an append
mode is needed, confirm that sort_and_write supports it before exposing it.
"""

import fire

from dumps_helper import SUBMISSIONS, sort_and_write


def main(indir=None, out_by_subreddit=None, out_by_author=None):
    """Sort and write the submissions dataset via ``sort_and_write``.

    Each argument is forwarded unchanged as a keyword argument; leaving one
    as ``None`` lets ``sort_and_write`` pick its own default (see the module
    docstring for what those defaults are documented to be).

    Args:
        indir: Directory holding the input parquet files, or ``None``.
        out_by_subreddit: Output path for the by-subreddit dataset, or
            ``None``.
        out_by_author: Output path for the by-author dataset, or ``None``.

    Returns:
        Whatever ``sort_and_write`` returns. The value is handed back to
        Fire so the command-line behavior matches a direct call.
    """
    return sort_and_write(
        SUBMISSIONS,
        indir=indir,
        out_by_subreddit=out_by_subreddit,
        out_by_author=out_by_author,
    )


if __name__ == "__main__":
    # A named function (rather than a lambda) gives Fire a real name and
    # docstring to show in its generated --help output.
    fire.Fire(main)