#!/usr/bin/env bash
#
# Collapse all accumulated layers in the final parquet datasets into a
# single clean layer. Use this after several incremental adds via
# add_months.sh when you want to reduce the number of partition files.
#
# Reads the existing by_subreddit / by_author datasets, re-sorts everything,
# writes to temp paths, then atomically replaces the originals via rename.
# The old directories are removed once the new ones are in place.
#
# If the process is interrupted after writing the .merging directories but
# before the renames complete, re-run — the .merging directories will be
# overwritten and the originals are still intact. If interrupted after the
# renames, the .old directories are left behind; delete them manually once
# satisfied with the output.
#
# To add new months without merging, use add_months.sh.
# To rebuild everything from raw dumps, use build_from_scratch.sh.
#
# NOTE: This script and its workflow are written but not yet tested.
# Remove this notice after a successful end-to-end run.
#
# Every command below is independently runnable for debugging.

# Abort on the first failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Work from the directory that contains this script, presumably so the
# relative *.py arguments below resolve no matter where the caller invoked
# it from. The `--` guards keep a script path that begins with `-` from
# being parsed as an option by cd or dirname.
cd -- "$(dirname -- "$0")"

# NOTE(review): start_spark_and_run.sh is invoked by bare name, so the shell
# looks it up on PATH rather than in this directory — confirm it is on PATH
# (or switch to ./start_spark_and_run.sh if it lives next to this script).
# The meaning of the leading "1" argument is defined by that helper and is
# not visible from this file.

# merge and collapse comments layers
start_spark_and_run.sh 1 comments_merge.py

# merge and collapse submissions layers
start_spark_and_run.sh 1 submissions_merge.py