# cdsc_reddit/datasets/comments_part2.py

#!/usr/bin/env python3
"""Part 2 for comments: Spark sort + repartition into the final datasets.

Must be launched from a login node via the Hyak-provided wrapper:
  start_spark_and_run.sh 1 comments_part2.py
  start_spark_and_run.sh 1 comments_part2.py --indir=/path/to/parquets --out_by_subreddit=/path/to/staging/by_subreddit --out_by_author=/path/to/staging/by_author

--indir defaults to the temp comments dir in dumps_helper.py.
--out_by_subreddit and --out_by_author default to the live dataset paths;
override them to write to staging directories first (see add_months.sh).
"""

import fire
from dumps_helper import COMMENTS, sort_and_write


def main(indir=None, out_by_subreddit=None, out_by_author=None):
    """Sort the comments parquets and write the final datasets.

    Thin command-line entry point: every argument is forwarded unchanged to
    ``sort_and_write`` along with the ``COMMENTS`` constant, both imported
    from ``dumps_helper``.

    Args:
        indir: Directory holding the input parquet files. ``None`` (the
            default) is passed through so ``sort_and_write`` applies its own
            default.
        out_by_subreddit: Destination for the by-subreddit output. ``None``
            is passed through in the same way.
        out_by_author: Destination for the by-author output. ``None`` is
            passed through in the same way.

    Returns:
        Whatever ``sort_and_write`` returns.
    """
    return sort_and_write(
        COMMENTS,
        indir=indir,
        out_by_subreddit=out_by_subreddit,
        out_by_author=out_by_author,
    )


if __name__ == "__main__":
    # A named, documented function (rather than an anonymous lambda) gives
    # Fire a name and docstring to display for --help; the accepted flags
    # are the same three keyword arguments as before.
    fire.Fire(main)