#!/usr/bin/env bash
#
# cdsc_reddit/datasets/build_from_scratch.sh
#
# Build the sorted, partitioned Reddit parquet datasets from scratch.
#
# Wipes the per-source temp directories, processes every RC_* and RS_* dump
# in the raw_data dumps directory through Part 1 (per-file, parallel), then
# runs the Part 2 Spark sort + repartition for both comments and submissions.
#
# Every command below is independently runnable — to debug a single stage,
# copy the line out and run it directly. Run the whole script end-to-end
# only when you trust each step.
#
# Prerequisites:
# - raw .zst dumps already staged in the dumpdir locations (see the
#   parquet_part1.py defaults, or override via --dumpdir)
# - GNU parallel installed
# - start_spark_and_run.sh on PATH (Hyak-provided wrapper)
#
# To add one new month to an existing build instead of rebuilding from
# scratch, use add_new_month.sh.

# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and on a failure anywhere inside a pipeline (pipefail).
set -euo pipefail

# Run from the directory holding this script so the relative references to
# parquet_part1.py, parquet_part2.py and the task-list files resolve.
# `--` keeps a script path that begins with '-' from being read as an option.
cd -- "$(dirname -- "$0")"

# Temp output locations; each one is wiped below before its parse tasks run.
TEMP_COMMENTS="/gscratch/comdata/output/temp/reddit_comments.parquet"
TEMP_SUBMISSIONS="/gscratch/comdata/output/temp/reddit_submissions.parquet"

# --- Preflight --------------------------------------------------------------

# Fail fast: confirm every external command used below is available before
# anything is deleted, so a missing Part 2 wrapper is not discovered only
# after the temp directories are wiped and all of Part 1 has already run.
for tool in python3 parallel start_spark_and_run.sh; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'error: required command not found on PATH: %s\n' "$tool" >&2
    exit 1
  fi
done

# --- Part 1a: comments ------------------------------------------------------

# wipe any existing comments temp output
# (`:?` aborts instead of running rm -rf with an empty path)
rm -rf -- "${TEMP_COMMENTS:?}"

# generate the per-file parse task list
python3 parquet_part1.py comments gen_task_list

# run all comments parse tasks in parallel; per-job status is recorded in
# comments_joblog.txt and per-job output under comments_logs.
# GNU parallel exits non-zero when any job fails, so `set -e` stops here.
parallel --joblog comments_joblog.txt --results comments_logs < parse_comments_task_list

# --- Part 1b: submissions ---------------------------------------------------

# wipe any existing submissions temp output
# (`:?` aborts instead of running rm -rf with an empty path)
rm -rf -- "${TEMP_SUBMISSIONS:?}"

# generate the per-file parse task list
python3 parquet_part1.py submissions gen_task_list

# run all submissions parse tasks in parallel; per-job status is recorded in
# submissions_joblog.txt and per-job output under submissions_logs.
# GNU parallel exits non-zero when any job fails, so `set -e` stops here.
parallel --joblog submissions_joblog.txt --results submissions_logs < parse_submissions_task_list

# --- Part 2: spark sort + repartition --------------------------------------

# sort comments and write reddit_comments_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 parquet_part2.py comments

# sort submissions and write reddit_submissions_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 parquet_part2.py submissions