#!/usr/bin/env bash
#
# Add one or more new months to the existing parquet datasets using a
# layered append. Part 1 runs on a compute node; Part 2 must be launched
# from a login node (it calls salloc via start_spark_and_run.sh).
#
# Usage:
#   add_months.sh YYYY-MM [YYYY-MM ...]
#
# Example:
#   add_months.sh 2025-01 2025-02 2025-03
#
# The new .zst dump files must live at:
#   $COMMENTS_DUMPDIR/RC_YYYY-MM.zst
#   $SUBMISSIONS_DUMPDIR/RS_YYYY-MM.zst
#
# Override the dump directories via environment variables if the new files
# are not in the standard locations:
#
#   COMMENTS_DUMPDIR=/path/to/new/comments \
#   SUBMISSIONS_DUMPDIR=/path/to/new/submissions \
#   ./add_months.sh 2025-01 2025-02
#
# How layering works: Part 2 appends a new set of sorted partition files
# alongside the existing ones. Spark and DuckDB read all layers together
# transparently. Run merge_layers.sh to collapse the layers into one when
# the number of layers gets large. Run build_from_scratch.sh to rebuild
# everything from raw dumps.
#
# NOTE: This script and its workflow are written but not yet tested.
# Remove this notice after a successful end-to-end run.
#
# Every command below is independently runnable for debugging.
# Strict mode: stop on the first failing command, on any use of an unset
# variable, and on a failure in any stage of a pipeline.
set -euo pipefail

# Work from the script's own directory so that the relative task-list,
# joblog and *_part1.py / *_part2.py paths below resolve no matter where the
# caller invoked the script from.
cd -- "$(dirname -- "$0")"

if [[ $# -eq 0 ]]; then
  echo "Usage: $0 YYYY-MM [YYYY-MM ...]" >&2
  exit 1
fi

# Every argument is spliced into a command line that GNU parallel later hands
# to a shell, so reject anything that is not a plain YYYY-MM month before any
# work starts (this also catches typos such as 2025-1 or 2025-13 early).
month_re='^[0-9]{4}-(0[1-9]|1[0-2])$'
for month in "$@"; do
  if [[ ! "$month" =~ $month_re ]]; then
    echo "Error: invalid month '$month' (expected YYYY-MM, e.g. 2025-01)" >&2
    exit 1
  fi
done

# Dump locations; either may be overridden through the environment.
COMMENTS_DUMPDIR="${COMMENTS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/comments}"
SUBMISSIONS_DUMPDIR="${SUBMISSIONS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/submissions}"

# Scratch outputs of Part 1, consumed by Part 2 and deleted at the very end.
# NOTE(review): if an earlier run aborted before the cleanup step, leftovers in
# these directories would presumably be picked up by Part 2 again -- confirm,
# and clear them by hand before re-running if so.
TEMP_COMMENTS="/gscratch/comdata/output/temp/add_months_comments.parquet"
TEMP_SUBMISSIONS="/gscratch/comdata/output/temp/add_months_submissions.parquet"

# Shell-quote the paths once. They are written into the task files as data
# (%s) rather than being pasted into the printf format string, so a '%', '\',
# '"', '$' or space in an overridden directory cannot corrupt a task line.
printf -v comments_dumpdir_q '%q' "$COMMENTS_DUMPDIR"
printf -v submissions_dumpdir_q '%q' "$SUBMISSIONS_DUMPDIR"
printf -v temp_comments_q '%q' "$TEMP_COMMENTS"
printf -v temp_submissions_q '%q' "$TEMP_SUBMISSIONS"

# --- Part 1: parse new months in parallel -----------------------------------

# build task lists for the specified months: one command line per month
for month in "$@"; do
  printf 'python3 comments_part1.py parse_dump RC_%s.zst --dumpdir=%s --outdir=%s\n' \
    "$month" "$comments_dumpdir_q" "$temp_comments_q"
done > add_months_comments_tasks.txt

for month in "$@"; do
  printf 'python3 submissions_part1.py parse_dump RS_%s.zst --dumpdir=%s --outdir=%s\n' \
    "$month" "$submissions_dumpdir_q" "$temp_submissions_q"
done > add_months_submissions_tasks.txt

# parse all new comment months in parallel
# (GNU parallel exits non-zero when any job fails, so `set -e` stops the
# script here instead of appending a partial layer in Part 2)
parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \
  < add_months_comments_tasks.txt

# parse all new submission months in parallel
parallel --joblog add_months_submissions_joblog.txt --results add_months_submissions_logs \
  < add_months_submissions_tasks.txt

# --- Part 2: sort and append new layer (run from a login node) --------------
#
# start_spark_and_run.sh calls salloc; run these two lines from a login node,
# not from within an existing compute allocation.

# append new comment layer to reddit_comments_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 comments_part2.py --indir="$TEMP_COMMENTS" --mode=append

# append new submission layer to reddit_submissions_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 submissions_part2.py --indir="$TEMP_SUBMISSIONS" --mode=append

# --- cleanup: remove temporary Part 1 files ---------------------------------
# Only reached when both appends succeeded. `:?` aborts rather than running
# rm -rf with an empty path, and `--` ends option parsing.
rm -rf -- "${TEMP_COMMENTS:?}" "${TEMP_SUBMISSIONS:?}"