datasets/add_months.sh: fail on leftover files, add --clean to wipe them
Without --clean, the script now exits with a clear error if temp or staging directories from a previous run exist. Pass --clean to remove them automatically before starting. README example updated to include the flag. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -115,7 +115,7 @@ srun -p cpu-g2 -A comdata --nodes=1 --time=72:00:00 -c 112 --mem=400G \
|
|||||||
PYTHON=/gscratch/comdata/users/makohill/cdsc_reddit/venv/bin/python3 \
|
PYTHON=/gscratch/comdata/users/makohill/cdsc_reddit/venv/bin/python3 \
|
||||||
COMMENTS_DUMPDIR=/path/to/new/comments \
|
COMMENTS_DUMPDIR=/path/to/new/comments \
|
||||||
SUBMISSIONS_DUMPDIR=/path/to/new/submissions \
|
SUBMISSIONS_DUMPDIR=/path/to/new/submissions \
|
||||||
./datasets/add_months.sh 2025-01 2025-02 ... YYYY-MM
|
./datasets/add_months.sh --clean 2025-01 2025-02 ... YYYY-MM
|
||||||
" 2>&1 | tee /gscratch/comdata/users/makohill/add_months_run.log
|
" 2>&1 | tee /gscratch/comdata/users/makohill/add_months_run.log
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -6,11 +6,14 @@
|
|||||||
# add_months_multinode.sh.
|
# add_months_multinode.sh.
|
||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
# add_months.sh YYYY-MM [YYYY-MM ...]
|
# add_months.sh [--clean] YYYY-MM [YYYY-MM ...]
|
||||||
#
|
#
|
||||||
# Example:
|
# Example:
|
||||||
# add_months.sh 2025-01 2025-02 2025-03
|
# add_months.sh 2025-01 2025-02 2025-03
|
||||||
#
|
#
|
||||||
|
# If temp or staging directories from a previous run exist, the script
|
||||||
|
# will exit with an error. Pass --clean to wipe them before starting.
|
||||||
|
#
|
||||||
# The new .zst dump files must live at:
|
# The new .zst dump files must live at:
|
||||||
# $COMMENTS_DUMPDIR/RC_YYYY-MM.zst
|
# $COMMENTS_DUMPDIR/RC_YYYY-MM.zst
|
||||||
# $SUBMISSIONS_DUMPDIR/RS_YYYY-MM.zst
|
# $SUBMISSIONS_DUMPDIR/RS_YYYY-MM.zst
|
||||||
@@ -37,8 +40,14 @@
|
|||||||
set -e
|
set -e
|
||||||
cd "$(dirname "$0")"
|
cd "$(dirname "$0")"
|
||||||
|
|
||||||
|
CLEAN=0
|
||||||
|
if [ "${1:-}" = "--clean" ]; then
|
||||||
|
CLEAN=1
|
||||||
|
shift
|
||||||
|
fi
|
||||||
|
|
||||||
if [ $# -eq 0 ]; then
|
if [ $# -eq 0 ]; then
|
||||||
echo "Usage: $0 YYYY-MM [YYYY-MM ...]" >&2
|
echo "Usage: $0 [--clean] YYYY-MM [YYYY-MM ...]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -62,6 +71,29 @@ LIVE_COMMENTS_AUTH="/gscratch/comdata/output/reddit_comments_by_author.parquet"
|
|||||||
LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet"
|
LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet"
|
||||||
LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet"
|
LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet"
|
||||||
|
|
||||||
|
# --- Check for leftover output from a previous run --------------------------
|
||||||
|
|
||||||
|
EXISTING=()
|
||||||
|
for d in "$TEMP_COMMENTS" "$TEMP_SUBMISSIONS" \
|
||||||
|
"$STAGING_COMMENTS_SUB" "$STAGING_COMMENTS_AUTH" \
|
||||||
|
"$STAGING_SUBMISSIONS_SUB" "$STAGING_SUBMISSIONS_AUTH"; do
|
||||||
|
[ -e "$d" ] && EXISTING+=("$d")
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ ${#EXISTING[@]} -gt 0 ]; then
|
||||||
|
if [ $CLEAN -eq 1 ]; then
|
||||||
|
echo "Removing leftover files from previous run..."
|
||||||
|
rm -rf "${EXISTING[@]}"
|
||||||
|
rm -f add_months_tasks.txt add_months_joblog.txt
|
||||||
|
rm -rf add_months_logs/
|
||||||
|
else
|
||||||
|
echo "Error: leftover files from a previous run exist:" >&2
|
||||||
|
printf ' %s\n' "${EXISTING[@]}" >&2
|
||||||
|
echo "Re-run with --clean to remove them before starting." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# --- Part 1: parse new months in parallel (comments and submissions together) -
|
# --- Part 1: parse new months in parallel (comments and submissions together) -
|
||||||
|
|
||||||
printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \
|
printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \
|
||||||
|
|||||||
Reference in New Issue
Block a user