From 18925dfe5b6a83722847c34e54d87522de0ee000 Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Mon, 25 May 2026 18:42:05 -0700 Subject: [PATCH] datasets/: add PYTHON variable to add_months scripts GNU parallel spawns fresh shells that don't inherit the active venv. Using an explicit PYTHON path ensures the right interpreter is used in parallel tasks. Defaults to python3 but can be overridden: PYTHON=/path/to/venv/bin/python3 ./add_months.sh ... Co-Authored-By: Claude Sonnet 4.6 --- datasets/add_months.sh | 5 +++-- datasets/add_months_multinode.sh | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/datasets/add_months.sh b/datasets/add_months.sh index 0065d24..540940e 100755 --- a/datasets/add_months.sh +++ b/datasets/add_months.sh @@ -44,6 +44,7 @@ fi COMMENTS_DUMPDIR="${COMMENTS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/comments}" SUBMISSIONS_DUMPDIR="${SUBMISSIONS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/submissions}" +PYTHON="${PYTHON:-python3}" # Part 1 temp dirs (per-month parquets, parsed from .zst) TEMP_COMMENTS="/gscratch/comdata/output/temp/add_months_comments.parquet" @@ -63,10 +64,10 @@ LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.par # --- Part 1: parse new months in parallel ----------------------------------- -printf "python3 comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \ +printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \ > add_months_comments_tasks.txt -printf "python3 submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "$@" \ +printf "$PYTHON submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "$@" \ > add_months_submissions_tasks.txt parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \ diff --git a/datasets/add_months_multinode.sh b/datasets/add_months_multinode.sh index e16b373..6dca436 100755 --- a/datasets/add_months_multinode.sh +++ b/datasets/add_months_multinode.sh @@ -34,6 +34,7 @@ MONTHS=("$@") COMMENTS_DUMPDIR="${COMMENTS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/comments}" SUBMISSIONS_DUMPDIR="${SUBMISSIONS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/submissions}" +PYTHON="${PYTHON:-python3}" TEMP_COMMENTS="/gscratch/comdata/output/temp/add_months_comments.parquet" TEMP_SUBMISSIONS="/gscratch/comdata/output/temp/add_months_submissions.parquet" @@ -48,10 +49,10 @@ LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.par # --- Part 1: parse new months in parallel ----------------------------------- -printf "python3 comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "${MONTHS[@]}" \ +printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "${MONTHS[@]}" \ > add_months_comments_tasks.txt -printf "python3 submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "${MONTHS[@]}" \ +printf "$PYTHON submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "${MONTHS[@]}" \ > add_months_submissions_tasks.txt parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \