diff --git a/datasets/add_months.sh b/datasets/add_months.sh index 0065d24..540940e 100755 --- a/datasets/add_months.sh +++ b/datasets/add_months.sh @@ -44,6 +44,7 @@ fi COMMENTS_DUMPDIR="${COMMENTS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/comments}" SUBMISSIONS_DUMPDIR="${SUBMISSIONS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/submissions}" +PYTHON="${PYTHON:-python3}" # Part 1 temp dirs (per-month parquets, parsed from .zst) TEMP_COMMENTS="/gscratch/comdata/output/temp/add_months_comments.parquet" @@ -63,10 +64,10 @@ LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.par # --- Part 1: parse new months in parallel ----------------------------------- -printf "python3 comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \ +printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \ > add_months_comments_tasks.txt -printf "python3 submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "$@" \ +printf "$PYTHON submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "$@" \ > add_months_submissions_tasks.txt parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \ diff --git a/datasets/add_months_multinode.sh b/datasets/add_months_multinode.sh index e16b373..6dca436 100755 --- a/datasets/add_months_multinode.sh +++ b/datasets/add_months_multinode.sh @@ -34,6 +34,7 @@ MONTHS=("$@") COMMENTS_DUMPDIR="${COMMENTS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/comments}" SUBMISSIONS_DUMPDIR="${SUBMISSIONS_DUMPDIR:-/gscratch/comdata/raw_data/reddit_dumps/submissions}" +PYTHON="${PYTHON:-python3}" TEMP_COMMENTS="/gscratch/comdata/output/temp/add_months_comments.parquet" TEMP_SUBMISSIONS="/gscratch/comdata/output/temp/add_months_submissions.parquet" @@ -48,10 +49,10 @@ LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.par # --- Part 1: parse new months in parallel ----------------------------------- -printf "python3 comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "${MONTHS[@]}" \ +printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "${MONTHS[@]}" \ > add_months_comments_tasks.txt -printf "python3 submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "${MONTHS[@]}" \ +printf "$PYTHON submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "${MONTHS[@]}" \ > add_months_submissions_tasks.txt parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \