#!/usr/bin/env bash
#
# Add a single new month of dumps to the existing parquet datasets.
#
# Processes only the RC_<YYYY-MM>.zst and RS_<YYYY-MM>.zst files (Part 1),
# leaving the existing per-source temp parquet files untouched, then
# re-runs the Part 2 Spark sort + repartition over the full temp dir so
# the final by_subreddit / by_author datasets pick up the new data.
#
# Usage:
#   add_new_month.sh YYYY-MM
#
# Example:
#   add_new_month.sh 2025-03
#
# Exit status:
#   1 if the month argument is missing or not of the form YYYY-MM;
#   otherwise the status of the first failing step (the script stops at
#   the first command that fails), or 0 if every step succeeds.
#
# Every command below is independently runnable — to debug, copy a line
# out and run it directly (with MONTH set in your shell). For a full
# rebuild instead, see build_from_scratch.sh.
#
# Note on cost: Part 2 always re-sorts the full corpus (the sort is global,
# not incremental), so this gets slightly slower each month. For the
# monthly cadence this is fine; if the sort becomes a bottleneck we'd
# need to rearchitect Part 2 to merge-append instead of re-sort.

# Stop at the first failing command, treat unset variables as errors, and
# make a pipeline fail if any of its stages fails.
set -euo pipefail

# Run from the directory containing this script so the relative *.py paths
# below resolve regardless of the caller's working directory.
cd -- "$(dirname -- "$0")"

MONTH="${1:-}"
if [[ -z "$MONTH" ]]; then
  printf '%s\n' "Usage: $0 YYYY-MM" >&2
  exit 1
fi

# Reject anything that is not YYYY-MM (month 01-12) up front, so a typo such
# as "2025-3" is reported here instead of being turned into a dump file name
# that does not exist.
readonly MONTH_PATTERN='^[0-9]{4}-(0[1-9]|1[0-2])$'
if [[ ! "$MONTH" =~ $MONTH_PATTERN ]]; then
  printf '%s\n' "Error: invalid month '$MONTH' (expected YYYY-MM, e.g. 2025-03)" >&2
  printf '%s\n' "Usage: $0 YYYY-MM" >&2
  exit 1
fi

# --- Part 1: parse the new month's dumps (no wipe) -------------------------

# parse the new comments file
python3 comments_part1.py parse_dump "RC_${MONTH}.zst"

# parse the new submissions file
python3 submissions_part1.py parse_dump "RS_${MONTH}.zst"

# --- Part 2: re-sort the full corpus including the new data ---------------
# NOTE: start_spark_and_run.sh is invoked by bare name, so the shell looks it
# up via PATH rather than in the current directory.

# sort comments and overwrite reddit_comments_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 comments_part2.py

# sort submissions and overwrite reddit_submissions_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 submissions_part2.py