cdsc_reddit/datasets/merge_layers.sh

#!/usr/bin/env bash
#
# Collapse all accumulated layers in the final parquet datasets into a
# single clean layer. Use this after several incremental adds via
# add_months.sh when you want to reduce the number of partition files.
#
# Reads the existing by_subreddit / by_author datasets, re-sorts everything,
# writes to temp paths, then atomically replaces the originals via rename.
# The old directories are removed once the new ones are in place.
#
# If the process is interrupted after writing the .merging directories but
# before the renames complete, re-run — the .merging directories will be
# overwritten and the originals are still intact. If interrupted after the
# renames, the .old directories are left behind; delete them manually once
# satisfied with the output.
#
# To add new months without merging, use add_months.sh.
# To rebuild everything from raw dumps, use build_from_scratch.sh.
#
# NOTE: This script and its workflow are written but not yet tested.
# Remove this notice after a successful end-to-end run.
#
# Every command below is independently runnable for debugging.

# Stop at the first failing command: if the comments merge fails, the
# submissions merge below is not attempted.
set -o errexit

# Run from the directory that contains this script so the relative
# *_merge.py arguments below do not depend on the caller's working directory.
cd "$(dirname "${0}")"

# Step 1: collapse the accumulated comments layers.
# NOTE(review): the leading `1` is passed straight to start_spark_and_run.sh;
# presumably a node/worker count -- confirm against that script.
start_spark_and_run.sh 1 comments_merge.py

# Step 2: collapse the accumulated submissions layers (same invocation shape).
start_spark_and_run.sh 1 submissions_merge.py