# Patch for datasets/add_months.sh.
#
# Part 1: both printf commands now write to a single task file,
# add_months_tasks.txt (the comments tasks create it with '>', the
# submissions tasks append with '>>'), and one `parallel` invocation reads
# that file, using add_months_joblog.txt and add_months_logs.
# Cleanup: additionally removes add_months_tasks.txt, add_months_joblog.txt
# and add_months_logs/.
#
# NOTE(review): this patch was rebuilt from a copy whose line breaks and
# indentation had been collapsed onto one line. Two things are assumptions
# to confirm against the target file before applying:
#   * the 4-space indent on the redirection continuation lines;
#   * the blank line after the Part 1 header is kept as context, because the
#     hunk counts (-62,19 +62,16) require it, although the collapsed copy
#     showed a stray '-' at that position.
diff --git a/datasets/add_months.sh b/datasets/add_months.sh
index 540940e..499c00a 100755
--- a/datasets/add_months.sh
+++ b/datasets/add_months.sh
@@ -62,19 +62,16 @@ LIVE_COMMENTS_AUTH="/gscratch/comdata/output/reddit_comments_by_author.parquet"
 LIVE_SUBMISSIONS_SUB="/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet"
 LIVE_SUBMISSIONS_AUTH="/gscratch/comdata/output/reddit_submissions_by_author.parquet"
 
-# --- Part 1: parse new months in parallel -----------------------------------
+# --- Part 1: parse new months in parallel (comments and submissions together)
 
 printf "$PYTHON comments_part1.py parse_dump RC_%s.zst --dumpdir=\"$COMMENTS_DUMPDIR\" --outdir=\"$TEMP_COMMENTS\"\n" "$@" \
-    > add_months_comments_tasks.txt
+    > add_months_tasks.txt
 
 printf "$PYTHON submissions_part1.py parse_dump RS_%s.zst --dumpdir=\"$SUBMISSIONS_DUMPDIR\" --outdir=\"$TEMP_SUBMISSIONS\"\n" "$@" \
-    > add_months_submissions_tasks.txt
+    >> add_months_tasks.txt
 
-parallel --joblog add_months_comments_joblog.txt --results add_months_comments_logs \
-    < add_months_comments_tasks.txt
-
-parallel --joblog add_months_submissions_joblog.txt --results add_months_submissions_logs \
-    < add_months_submissions_tasks.txt
+parallel --joblog add_months_joblog.txt --results add_months_logs \
+    < add_months_tasks.txt
 
 # --- Part 2: sort new months into staging (Spark, single fat node) ----------
 
@@ -126,6 +123,8 @@ find "$STAGING_SUBMISSIONS_AUTH" -maxdepth 1 -type f -exec cp {} "$LIVE_SUBMISSI
 #
 # Run after confirming the copy succeeded and the live datasets look right.
 
+rm -f add_months_tasks.txt add_months_joblog.txt
+rm -rf add_months_logs/
 rm -rf "$TEMP_COMMENTS" "$TEMP_SUBMISSIONS"
 rm -rf "$STAGING_COMMENTS_SUB" "$STAGING_COMMENTS_AUTH"
 rm -rf "$STAGING_SUBMISSIONS_SUB" "$STAGING_SUBMISSIONS_AUTH"