From 0ea57b2377efa7a3eb15d008a4652ffb900cd59a Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill
Date: Mon, 25 May 2026 19:10:16 -0700
Subject: [PATCH] datasets/add_months.sh: fail on leftover files, add --clean to wipe them

Without --clean, the script now exits with a clear error if temp or
staging directories from a previous run exist. Pass --clean to remove
them automatically before starting. README example updated to include
the flag.

Co-Authored-By: Claude Sonnet 4.6
---
Note (review): this patch text was recovered from a copy in which line
breaks and runs of spaces had been collapsed. Line boundaries were
restored so that every hunk matches its @@ header counts and the
diffstat below. Indentation inside lines (added and context) is a best
guess and must be confirmed against datasets/add_months.sh and
datasets/README.md; `git apply --ignore-whitespace` may be needed if the
context whitespace differs.

 datasets/README.md     |  2 +-
 datasets/add_months.sh | 36 ++++++++++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/datasets/README.md b/datasets/README.md
index 544eacb..ac37c79 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -115,7 +115,7 @@ srun -p cpu-g2 -A comdata --nodes=1 --time=72:00:00 -c 112 --mem=400G \
     PYTHON=/gscratch/comdata/users/makohill/cdsc_reddit/venv/bin/python3 \
     COMMENTS_DUMPDIR=/path/to/new/comments \
     SUBMISSIONS_DUMPDIR=/path/to/new/submissions \
-    ./datasets/add_months.sh 2025-01 2025-02 ... YYYY-MM
+    ./datasets/add_months.sh --clean 2025-01 2025-02 ... YYYY-MM
   " 2>&1 | tee /gscratch/comdata/users/makohill/add_months_run.log
 ```
 
diff --git a/datasets/add_months.sh b/datasets/add_months.sh
index 499c00a..e23b761 100755
--- a/datasets/add_months.sh
+++ b/datasets/add_months.sh
@@ -6,11 +6,14 @@
 # add_months_multinode.sh.
 #
 # Usage:
-#   add_months.sh YYYY-MM [YYYY-MM ...]
+#   add_months.sh [--clean] YYYY-MM [YYYY-MM ...]
 #
 # Example:
 #   add_months.sh 2025-01 2025-02 2025-03
 #
+# If temp or staging directories from a previous run exist, the script
+# will exit with an error. Pass --clean to wipe them before starting:
+#
 # The new .zst dump files must live at:
 #   $COMMENTS_DUMPDIR/RC_YYYY-MM.zst
 #   $SUBMISSIONS_DUMPDIR/RS_YYYY-MM.zst
@@ -37,8 +40,14 @@ set -e
 
 cd "$(dirname "$0")"
 
+CLEAN=0
+if [ "${1:-}" = "--clean" ]; then
+    CLEAN=1
+    shift
+fi
+
 if [ $# -eq 0 ]; then
-    echo "Usage: $0 YYYY-MM [YYYY-MM ...]" >&2
+    echo "Usage: $0 [--clean] YYYY-MM [YYYY-MM ...]" >&2
     exit 1
 fi
 
@@ -62,6 +71,29 @@ LIVE_COMMENTS_AUTH="/gscratch/comdata/output/reddit_comments_by_author.parquet"
 LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet"
 LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet"
 
+# --- Check for leftover output from a previous run --------------------------
+
+EXISTING=()
+for d in "$TEMP_COMMENTS" "$TEMP_SUBMISSIONS" \
+         "$STAGING_COMMENTS_SUB" "$STAGING_COMMENTS_AUTH" \
+         "$STAGING_SUBMISSIONS_SUB" "$STAGING_SUBMISSIONS_AUTH"; do
+    [ -e "$d" ] && EXISTING+=("$d")
+done
+
+if [ ${#EXISTING[@]} -gt 0 ]; then
+    if [ $CLEAN -eq 1 ]; then
+        echo "Removing leftover files from previous run..."
+        rm -rf "${EXISTING[@]}"
+        rm -f add_months_tasks.txt add_months_joblog.txt
+        rm -rf add_months_logs/
+    else
+        echo "Error: leftover files from a previous run exist:" >&2
+        printf ' %s\n' "${EXISTING[@]}" >&2
+        echo "Re-run with --clean to remove them before starting." >&2
+        exit 1
+    fi
+fi
+
 # --- Part 1: parse new months in parallel (comments and submissions together) -
 
 printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \