From 4854d4f537d332ed3c1cb51201496f4c40c3478d Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Mon, 25 May 2026 18:51:56 -0700 Subject: [PATCH] datasets/add_months.sh: run comments and submissions Part 1 together Combine task lists and run a single parallel call so all 32 files (16 comments + 16 submissions) parse simultaneously. Co-Authored-By: Claude Sonnet 4.6 --- datasets/add_months.sh | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/datasets/add_months.sh b/datasets/add_months.sh index 540940e..499c00a 100755 --- a/datasets/add_months.sh +++ b/datasets/add_months.sh @@ -62,19 +62,16 @@ LIVE_COMMENTS_AUTH="/gscratch/comdata/output/reddit_comments_by_author.parquet" LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet" LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet" -# --- Part 1: parse new months in parallel ----------------------------------- +# --- Part 1: parse new months in parallel (comments and submissions together) - printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \ - > add_months_comments_tasks.txt + > add_months_tasks.txt printf "$PYTHON submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "$@" \ - > add_months_submissions_tasks.txt + >> add_months_tasks.txt -parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \ - < add_months_comments_tasks.txt - -parallel --joblog add_months_submissions_joblog.txt --results add_months_submissions_logs \ - < add_months_submissions_tasks.txt +parallel --joblog add_months_joblog.txt --results add_months_logs \ + < add_months_tasks.txt # --- Part 2: sort new months into staging (Spark, single fat node) ---------- @@ -126,6 +123,8 @@ find "$STAGING_SUBMISSIONS_AUTH" -maxdepth 1 -type f -exec cp {} "$LIVE_SUBMISSI # # Run after confirming the copy succeeded and the live datasets look right. +rm -f add_months_tasks.txt add_months_joblog.txt +rm -rf add_months_logs/ rm -rf "$TEMP_COMMENTS" "$TEMP_SUBMISSIONS" rm -rf "$STAGING_COMMENTS_SUB" "$STAGING_COMMENTS_AUTH" rm -rf "$STAGING_SUBMISSIONS_SUB" "$STAGING_SUBMISSIONS_AUTH"