18
0

datasets/: add PYTHON variable to add_months scripts

GNU parallel spawns fresh shells that don't inherit the active venv.
Using an explicit PYTHON path ensures the right interpreter is used in
parallel tasks. Defaults to python3 but can be overridden:

  PYTHON=/path/to/venv/bin/python3 ./add_months.sh ...

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 18:42:05 -07:00
parent 926e9bc364
commit 18925dfe5b
2 changed files with 6 additions and 4 deletions

View File

@@ -34,6 +34,7 @@ MONTHS=("$@")
COMMENTS_DUMPDIR="${COMMENTS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/comments}"
SUBMISSIONS_DUMPDIR="${SUBMISSIONS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/submissions}"
PYTHON="${PYTHON:-python3}"
TEMP_COMMENTS="/gscratch/comdata/output/temp/add_months_comments.parquet"
TEMP_SUBMISSIONS="/gscratch/comdata/output/temp/add_months_submissions.parquet"
@@ -48,10 +49,10 @@ LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.par
# --- Part 1: parse new months in parallel -----------------------------------
printf "python3 comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "${MONTHS[@]}" \
printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "${MONTHS[@]}" \
> add_months_comments_tasks.txt
printf "python3 submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "${MONTHS[@]}" \
printf "$PYTHON submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "${MONTHS[@]}" \
> add_months_submissions_tasks.txt
parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \