cdsc_reddit/datasets/add_new_month.sh

#!/usr/bin/env bash
#
# Add a single new month of dumps to the existing parquet datasets.
#
# Processes only the RC_<month>.zst and RS_<month>.zst files (Part 1),
# leaving the existing per-source temp parquet files untouched, then
# re-runs the Part 2 Spark sort + repartition over the full temp dir so
# the final by_subreddit / by_author datasets pick up the new data.
#
# Usage:
#   add_new_month.sh YYYY-MM
#
# Example:
#   add_new_month.sh 2025-03
#
# Every command below is independently runnable — to debug, copy a line
# out and run it directly. For a full rebuild instead, see
# build_from_scratch.sh.
#
# Note on cost: Part 2 always re-sorts the full corpus (the sort is global,
# not incremental), so this gets slightly slower each month. For the
# monthly cadence this is fine; if the sort becomes a bottleneck we'd
# need to rearchitect Part 2 to merge-append instead of re-sort.

# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Work from the directory containing this script so the relative
# parquet_part*.py paths used below resolve regardless of the caller's cwd.
# `--` keeps a script path that begins with '-' from being read as an option.
cd -- "$(dirname -- "$0")"

# The single required argument: the month to ingest, formatted YYYY-MM.
# `${1:-}` keeps `set -u` from aborting before the usage message is printed.
MONTH="${1:-}"
if [[ -z "$MONTH" ]]; then
    echo "Usage: $0 YYYY-MM" >&2
    exit 1
fi

# Reject malformed months (e.g. "2025-3", "2025-13", "202503") up front.
# MONTH is interpolated straight into the RC_/RS_ dump file names, so a typo
# would otherwise only surface later as a missing-file failure in Part 1.
# The pattern lives in a variable so bash treats it as a regex, not a string.
month_re='^[0-9]{4}-(0[1-9]|1[0-2])$'
if [[ ! "$MONTH" =~ $month_re ]]; then
    echo "Error: invalid month '$MONTH' (expected YYYY-MM, e.g. 2025-03)" >&2
    echo "Usage: $0 YYYY-MM" >&2
    exit 1
fi

# --- Part 1: parse the new month's dumps (no wipe) -------------------------
#
# Each command passes one dump file name for $MONTH to parquet_part1.py
# together with the parse_dump argument: RC_<month>.zst for comments and
# RS_<month>.zst for submissions. Only the bare file name is given here;
# where the dump is read from and where the parsed output is written are
# decided inside parquet_part1.py and are not visible from this script.
# The commands run in order, and because errexit is enabled a failure in
# either one stops the script before Part 2 starts.

# parse the new comments file (RC_<month>.zst)
python3 parquet_part1.py comments parse_dump "RC_${MONTH}.zst"

# parse the new submissions file (RS_<month>.zst)
python3 parquet_part1.py submissions parse_dump "RS_${MONTH}.zst"

# --- Part 2: re-sort the full corpus including the new data ---------------
#
# Both commands go through start_spark_and_run.sh, which is invoked by bare
# name and therefore has to be resolvable via PATH (the earlier cd into the
# script directory does not by itself make it findable).
# NOTE(review): the literal `1` is the first argument to
# start_spark_and_run.sh; presumably a node/worker count — confirm against
# that script before changing it.
# Comments are sorted first; because errexit is enabled, a failure there
# stops the script before the submissions sort is attempted.

# sort comments and overwrite reddit_comments_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 parquet_part2.py comments

# sort submissions and overwrite reddit_submissions_by_{subreddit,author}.parquet
start_spark_and_run.sh 1 parquet_part2.py submissions