datasets/add_months.sh: run comments and submissions Part 1 together
Combine the task lists and run a single parallel call so all 32 files (16 comments + 16 submissions) parse simultaneously.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -62,19 +62,16 @@ LIVE_COMMENTS_AUTH="/gscratch/comdata/output/reddit_comments_by_author.parquet"
|
||||
LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet"
|
||||
LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet"
|
||||
|
||||
# --- Part 1: parse new months in parallel -----------------------------------
|
||||
# --- Part 1: parse new months in parallel (comments and submissions together) -
|
||||
|
||||
printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \
|
||||
> add_months_comments_tasks.txt
|
||||
> add_months_tasks.txt
|
||||
|
||||
printf "$PYTHON submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "$@" \
|
||||
> add_months_submissions_tasks.txt
|
||||
>> add_months_tasks.txt
|
||||
|
||||
parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \
|
||||
< add_months_comments_tasks.txt
|
||||
|
||||
parallel --joblog add_months_submissions_joblog.txt --results add_months_submissions_logs \
|
||||
< add_months_submissions_tasks.txt
|
||||
parallel --joblog add_months_joblog.txt --results add_months_logs \
|
||||
< add_months_tasks.txt
|
||||
|
||||
# --- Part 2: sort new months into staging (Spark, single fat node) ----------
|
||||
|
||||
@@ -126,6 +123,8 @@ find "$STAGING_SUBMISSIONS_AUTH" -maxdepth 1 -type f -exec cp {} "$LIVE_SUBMISSI
|
||||
#
|
||||
# Run after confirming the copy succeeded and the live datasets look right.
|
||||
|
||||
rm -f add_months_tasks.txt add_months_joblog.txt
|
||||
rm -rf add_months_logs/
|
||||
rm -rf "$TEMP_COMMENTS" "$TEMP_SUBMISSIONS"
|
||||
rm -rf "$STAGING_COMMENTS_SUB" "$STAGING_COMMENTS_AUTH"
|
||||
rm -rf "$STAGING_SUBMISSIONS_SUB" "$STAGING_SUBMISSIONS_AUTH"
|
||||
|
||||
Reference in New Issue
Block a user