18
0

datasets/add_months.sh: fail on leftover files, add --clean to wipe them

Without --clean, the script now exits with a clear error if temp or
staging directories from a previous run exist. Pass --clean to remove
them automatically before starting. README example updated to include
the flag.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 19:10:16 -07:00
parent 6c6e05c360
commit 0ea57b2377
2 changed files with 35 additions and 3 deletions

View File

@@ -115,7 +115,7 @@ srun -p cpu-g2 -A comdata --nodes=1 --time=72:00:00 -c 112 --mem=400G \
PYTHON=/gscratch/comdata/users/makohill/cdsc_reddit/venv/bin/python3 \ PYTHON=/gscratch/comdata/users/makohill/cdsc_reddit/venv/bin/python3 \
COMMENTS_DUMPDIR=/path/to/new/comments \ COMMENTS_DUMPDIR=/path/to/new/comments \
SUBMISSIONS_DUMPDIR=/path/to/new/submissions \ SUBMISSIONS_DUMPDIR=/path/to/new/submissions \
./datasets/add_months.sh 2025-01 2025-02 ... YYYY-MM ./datasets/add_months.sh --clean 2025-01 2025-02 ... YYYY-MM
" 2>&1 | tee /gscratch/comdata/users/makohill/add_months_run.log " 2>&1 | tee /gscratch/comdata/users/makohill/add_months_run.log
``` ```

View File

@@ -6,11 +6,14 @@
# add_months_multinode.sh. # add_months_multinode.sh.
# #
# Usage: # Usage:
# add_months.sh YYYY-MM [YYYY-MM ...] # add_months.sh [--clean] YYYY-MM [YYYY-MM ...]
# #
# Example: # Example:
# add_months.sh 2025-01 2025-02 2025-03 # add_months.sh 2025-01 2025-02 2025-03
# #
# If temp or staging directories from a previous run exist, the script
# will exit with an error. Pass --clean to wipe them before starting:
#   add_months.sh --clean 2025-01 2025-02 2025-03
#
# The new .zst dump files must live at: # The new .zst dump files must live at:
# $COMMENTS_DUMPDIR/RC_YYYY-MM.zst # $COMMENTS_DUMPDIR/RC_YYYY-MM.zst
# $SUBMISSIONS_DUMPDIR/RS_YYYY-MM.zst # $SUBMISSIONS_DUMPDIR/RS_YYYY-MM.zst
@@ -37,8 +40,14 @@
set -e set -e
cd "$(dirname "$0")" cd "$(dirname "$0")"
# Consume an optional leading --clean flag so that only the YYYY-MM
# arguments remain in "$@". The flag is recognized in first position only.
CLEAN=0
case "${1:-}" in
    --clean)
        CLEAN=1
        shift
        ;;
esac
if [ $# -eq 0 ]; then if [ $# -eq 0 ]; then
echo "Usage: $0 YYYY-MM [YYYY-MM ...]" >&2 echo "Usage: $0 [--clean] YYYY-MM [YYYY-MM ...]" >&2
exit 1 exit 1
fi fi
@@ -62,6 +71,29 @@ LIVE_COMMENTS_AUTH="/gscratch/comdata/output/reddit_comments_by_author.parquet"
LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet" LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet"
LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet" LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet"
# --- Check for leftover output from a previous run --------------------------
# Collect every temp/staging path that already exists on disk. Without
# --clean any hit aborts the run; with --clean the hits are deleted.
EXISTING=()
for d in "$TEMP_COMMENTS" "$TEMP_SUBMISSIONS" \
         "$STAGING_COMMENTS_SUB" "$STAGING_COMMENTS_AUTH" \
         "$STAGING_SUBMISSIONS_SUB" "$STAGING_SUBMISSIONS_AUTH"; do
    # Explicit if (rather than `[ ... ] && ...`) so a missing path never
    # yields a non-zero list status under `set -e`.
    if [ -e "$d" ]; then
        EXISTING+=("$d")
    fi
done

if [ "${CLEAN:-0}" -eq 1 ]; then
    if [ "${#EXISTING[@]}" -gt 0 ]; then
        echo "Removing leftover files from previous run..."
        rm -rf -- "${EXISTING[@]}"
    fi
    # Always wipe the task list, job log and log directory when --clean is
    # given, even if no temp/staging directory was found: a stale copy can
    # be left behind on its own. (Presumably these are written by the
    # parallel step below -- confirm against the rest of the script.)
    # Paths are relative to the script directory.
    rm -f -- add_months_tasks.txt add_months_joblog.txt
    rm -rf -- add_months_logs/
elif [ "${#EXISTING[@]}" -gt 0 ]; then
    echo "Error: leftover files from a previous run exist:" >&2
    printf '  %s\n' "${EXISTING[@]}" >&2
    echo "Re-run with --clean to remove them before starting." >&2
    exit 1
fi
# --- Part 1: parse new months in parallel (comments and submissions together) - # --- Part 1: parse new months in parallel (comments and submissions together) -
printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \ printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \