#!/usr/bin/env bash
#
# Build the sorted, partitioned Reddit parquet datasets from scratch.
#
# Wipes the per-source temp directories, processes every RC_* and RS_* dump
# in the raw_data dumps directory through Part 1 (per-file, parallel), then
# runs the Part 2 Spark sort + repartition for both comments and submissions.
#
# Every command below is independently runnable — to debug a single stage,
# copy the line out and run it directly (the rm lines additionally need the
# TEMP_* variables defined below). Run the whole script end-to-end only when
# you trust each step.
#
# Prerequisites:
#   - raw .zst dumps already staged in the dumpdir locations (see the
#     defaults in dumps_helper.py, or override via --dumpdir)
#   - GNU parallel installed
#   - start_spark_and_run.sh on PATH (Hyak-provided wrapper)
#
# To add new months to an existing build without rebuilding from scratch,
# use add_months.sh.

# Abort on the first failing command (-e), on use of an unset variable (-u),
# and on a failure in any stage of a pipeline (pipefail).
set -euo pipefail

# Run from the script's own directory: the helper scripts, task lists and
# job logs below are all referenced by relative path.
cd -- "$(dirname -- "$0")"

readonly TEMP_COMMENTS="/gscratch/comdata/output/temp/reddit_comments.parquet"
readonly TEMP_SUBMISSIONS="/gscratch/comdata/output/temp/reddit_submissions.parquet"

# Fail fast, before anything is deleted, if a required tool is missing.
# Without this, a missing Spark wrapper would only be noticed after the temp
# output had been wiped and all of Part 1 had already run.
for required_tool in python3 parallel start_spark_and_run.sh; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    printf 'error: required command not found on PATH: %s\n' \
      "$required_tool" >&2
    exit 1
  fi
done

# --- Part 1a: comments ------------------------------------------------------

# wipe any existing comments temp output
# (${VAR:?} aborts instead of expanding to an empty path)
rm -rf -- "${TEMP_COMMENTS:?}"

# generate the per-file parse task list
python3 comments_part1.py gen_task_list

# run all comments parse tasks in parallel; GNU parallel exits non-zero when
# any job fails, which stops the script here under `set -e`
parallel --joblog comments_joblog.txt --results comments_logs < parse_comments_task_list

# --- Part 1b: submissions ---------------------------------------------------

# wipe any existing submissions temp output
rm -rf -- "${TEMP_SUBMISSIONS:?}"

# generate the per-file parse task list
python3 submissions_part1.py gen_task_list

# run all submissions parse tasks in parallel
parallel --joblog submissions_joblog.txt --results submissions_logs < parse_submissions_task_list

# --- Part 2: spark sort + repartition --------------------------------------
# NOTE(review): the leading `1` is passed straight to the Hyak wrapper —
# presumably a node count; confirm against start_spark_and_run.sh usage.

# sort comments and write reddit_comments_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 comments_part2.py

# sort submissions and write reddit_submissions_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 submissions_part2.py