18
0

datasets/add_months.sh: fail on leftover files, add --clean to wipe them

Without --clean, the script now exits with a clear error if temp or
staging directories from a previous run exist. Pass --clean to remove
them automatically before starting. README example updated to include
the flag.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 19:10:16 -07:00
parent 6c6e05c360
commit 0ea57b2377
2 changed files with 35 additions and 3 deletions

View File

@@ -6,11 +6,14 @@
# add_months_multinode.sh.
#
# Usage:
# add_months.sh [--clean] YYYY-MM [YYYY-MM ...]
#
# Example:
# add_months.sh 2025-01 2025-02 2025-03
#
# If temp or staging directories from a previous run exist, the script
# exits with an error. Pass --clean to remove them before starting:
# add_months.sh --clean 2025-01 2025-02 2025-03
#
# The new .zst dump files must live at:
# $COMMENTS_DUMPDIR/RC_YYYY-MM.zst
# $SUBMISSIONS_DUMPDIR/RS_YYYY-MM.zst
@@ -37,8 +40,14 @@
set -e
cd "$(dirname "$0")"

# Optional leading --clean flag. It is only recognised as the first
# argument; when present it is consumed so that "$@" holds just the
# YYYY-MM month arguments from here on.
CLEAN=0
if [ "${1:-}" = "--clean" ]; then
  CLEAN=1
  shift
fi

# At least one month argument must remain after the optional flag.
# Print a single usage line (including the --clean option) and fail.
if [ $# -eq 0 ]; then
  echo "Usage: $0 [--clean] YYYY-MM [YYYY-MM ...]" >&2
  exit 1
fi
@@ -62,6 +71,29 @@ LIVE_COMMENTS_AUTH="/gscratch/comdata/output/reddit_comments_by_author.parquet"
LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet"
LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet"
# --- Check for leftover output from a previous run --------------------------
# Collect every temp/staging path that already exists on disk.
EXISTING=()
for leftover in \
  "$TEMP_COMMENTS" "$TEMP_SUBMISSIONS" \
  "$STAGING_COMMENTS_SUB" "$STAGING_COMMENTS_AUTH" \
  "$STAGING_SUBMISSIONS_SUB" "$STAGING_SUBMISSIONS_AUTH"; do
  if [[ -e "$leftover" ]]; then
    EXISTING+=("$leftover")
  fi
done

if (( ${#EXISTING[@]} > 0 )); then
  # Without --clean, refuse to continue and list what was found.
  if (( CLEAN != 1 )); then
    echo "Error: leftover files from a previous run exist:" >&2
    printf ' %s\n' "${EXISTING[@]}" >&2
    echo "Re-run with --clean to remove them before starting." >&2
    exit 1
  fi

  # With --clean, remove the found paths plus the task list, job log
  # and log directory in the current working directory.
  echo "Removing leftover files from previous run..."
  rm -rf "${EXISTING[@]}"
  rm -f add_months_tasks.txt add_months_joblog.txt
  rm -rf add_months_logs/
fi
# --- Part 1: parse new months in parallel (comments and submissions together) -
printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \