# cdsc_reddit/datasets/comments_part2.py

#!/usr/bin/env python3
"""Part 2 for comments: use Spark to sort and repartition the per-source
parquet files produced by comments_part1.py into the final by_subreddit /
by_author datasets.

Launched via the Hyak-provided start_spark_and_run.sh wrapper:
  start_spark_and_run.sh 1 comments_part2.py
"""

from dumps_helper import COMMENTS, sort_and_write


def main():
    """Run the part-2 step for the comments dataset.

    Hands the COMMENTS descriptor imported from dumps_helper to
    dumps_helper.sort_and_write, which does all of the actual work.
    """
    sort_and_write(COMMENTS)


# Only run when executed as a script, so importing this module has no
# side effects.
if __name__ == "__main__":
    main()