# cdsc_reddit/datasets/comments_part2.py

#!/usr/bin/env python3
"""Part 2 for comments: Spark sort + repartition into the final datasets.

Must be launched from a login node via the Hyak-provided wrapper:
  start_spark_and_run.sh 1 comments_part2.py
  start_spark_and_run.sh 1 comments_part2.py --indir=/path/to/parquets --mode=append

--indir defaults to the temp comments dir in dumps_helper.py.
--mode defaults to 'overwrite'; use 'append' to add a new layer without
touching existing partition files (see add_months.sh).
"""

import fire
from dumps_helper import COMMENTS, sort_and_write


def main(indir=None, mode='overwrite'):
    """Sort and repartition the comments parquets into the final datasets.

    Thin command-line entry point: forwards its arguments unchanged to
    ``sort_and_write(COMMENTS, ...)`` from ``dumps_helper``.

    Args:
        indir: Directory holding the input parquets. ``None`` (the default)
            is passed through as-is; per the module docstring the helper
            then falls back to the temp comments dir in dumps_helper.py.
        mode: Write mode handed to ``sort_and_write``. Defaults to
            'overwrite'; per the module docstring, 'append' adds a new
            layer without touching existing partition files.

    Returns:
        Whatever ``sort_and_write`` returns.
    """
    return sort_and_write(COMMENTS, indir=indir, mode=mode)


if __name__ == "__main__":
    # Expose a named, documented function instead of a lambda so Fire can
    # build its --help output from the signature and docstring. The flags
    # (--indir, --mode) and their defaults are unchanged.
    fire.Fire(main)