datasets/add_months.sh: fail on leftover files, add --clean to wipe them
Without --clean, the script now exits with a clear error if temp or staging directories from a previous run exist. Pass --clean to remove them automatically before starting. README example updated to include the flag. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,11 +6,14 @@
|
||||
# add_months_multinode.sh.
|
||||
#
|
||||
# Usage:
|
||||
# add_months.sh YYYY-MM [YYYY-MM ...]
|
||||
# add_months.sh [--clean] YYYY-MM [YYYY-MM ...]
|
||||
#
|
||||
# Example:
|
||||
# add_months.sh 2025-01 2025-02 2025-03
|
||||
#
|
||||
# If temp or staging directories from a previous run exist, the script
|
||||
# will exit with an error. Pass --clean to wipe them before starting:
# add_months.sh --clean 2025-01 2025-02 2025-03
|
||||
#
|
||||
# The new .zst dump files must live at:
|
||||
# $COMMENTS_DUMPDIR/RC_YYYY-MM.zst
|
||||
# $SUBMISSIONS_DUMPDIR/RS_YYYY-MM.zst
|
||||
@@ -37,8 +40,14 @@
|
||||
set -e
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
CLEAN=0
|
||||
if [ "${1:-}" = "--clean" ]; then
|
||||
CLEAN=1
|
||||
shift
|
||||
fi
|
||||
|
||||
if [ $# -eq 0 ]; then
|
||||
echo "Usage: $0 YYYY-MM [YYYY-MM ...]" >&2
|
||||
echo "Usage: $0 [--clean] YYYY-MM [YYYY-MM ...]" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -62,6 +71,29 @@ LIVE_COMMENTS_AUTH="/gscratch/comdata/output/reddit_comments_by_author.parquet"
|
||||
LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet"
|
||||
LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet"
|
||||
|
||||
# --- Check for leftover output from a previous run --------------------------
|
||||
|
||||
EXISTING=()
|
||||
for d in "$TEMP_COMMENTS" "$TEMP_SUBMISSIONS" \
|
||||
"$STAGING_COMMENTS_SUB" "$STAGING_COMMENTS_AUTH" \
|
||||
"$STAGING_SUBMISSIONS_SUB" "$STAGING_SUBMISSIONS_AUTH"; do
|
||||
[ -e "$d" ] && EXISTING+=("$d")
|
||||
done
|
||||
|
||||
if [ ${#EXISTING[@]} -gt 0 ]; then
|
||||
if [ $CLEAN -eq 1 ]; then
|
||||
echo "Removing leftover files from previous run..."
|
||||
rm -rf "${EXISTING[@]}"
|
||||
rm -f add_months_tasks.txt add_months_joblog.txt
|
||||
rm -rf add_months_logs/
|
||||
else
|
||||
echo "Error: leftover files from a previous run exist:" >&2
|
||||
printf ' %s\n' "${EXISTING[@]}" >&2
|
||||
echo "Re-run with --clean to remove them before starting." >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- Part 1: parse new months in parallel (comments and submissions together) -
|
||||
|
||||
printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \
|
||||
|
||||
Reference in New Issue
Block a user