18
0

datasets/: stage new layer before touching live datasets in add_months

Replace the previous approach of appending directly to the live datasets
(mode='append') with a safer staging workflow: Part 2 writes the new sorted
layer to temporary staging directories, the user verifies the output, and
then a separate copy step adds the files to the live datasets. The live
datasets are never touched until the copy step, and the copy only adds
files — nothing is deleted or overwritten.

- sort_and_write gains out_by_subreddit/out_by_author params (replacing the
  mode param; writes now always use mode='overwrite') so Part 2 can target
  staging paths
- comments_part2.py, submissions_part2.py: expose new params via Fire
- add_months.sh: rewritten with explicit staging dirs, verify checkpoint,
  and find-based copy step

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 18:17:38 -07:00
parent 2d1d760142
commit 6b18840604
4 changed files with 89 additions and 33 deletions

View File

@@ -256,13 +256,14 @@ def gen_task_list(config, script_name, dumpdir=None, tasklist=None):
# --- Part 2: spark sort + repartition --------------------------------------
def sort_and_write(config, indir=None, mode='overwrite'):
def sort_and_write(config, indir=None, out_by_subreddit=None, out_by_author=None):
"""Read a directory of per-source parquets, sort and repartition twice
(once by subreddit, once by author), and write the two final datasets.
(once by subreddit, once by author), and write the two output datasets.
indir defaults to config['outdir']. mode is passed to parquet write and
may be 'overwrite' (default, used by build_from_scratch) or 'append'
(used by add_months to layer new data alongside existing files).
indir defaults to config['outdir'].
out_by_subreddit and out_by_author default to config['output_by_subreddit']
and config['output_by_author']. Override them to write to staging directories
instead of the live datasets (see add_months.sh).
Pyspark is imported lazily so Part 1 callers don't pay the Spark startup
cost.
@@ -270,6 +271,9 @@ def sort_and_write(config, indir=None, mode='overwrite'):
from pyspark.sql import SparkSession, functions as f
indir = indir or config['outdir']
out_by_subreddit = out_by_subreddit or config['output_by_subreddit']
out_by_author = out_by_author or config['output_by_author']
spark = SparkSession.builder.appName(config['app_name']).getOrCreate()
df = spark.read.parquet(indir, compression='snappy')
@@ -286,12 +290,12 @@ def sort_and_write(config, indir=None, mode='overwrite'):
sub_keys = config['subreddit_sort_keys']
df_sub = df.repartition('subreddit').sort(sub_keys, ascending=True)
df_sub = df_sub.sortWithinPartitions(sub_keys, ascending=True)
df_sub.write.parquet(config['output_by_subreddit'], mode=mode, compression='snappy')
df_sub.write.parquet(out_by_subreddit, mode='overwrite', compression='snappy')
auth_keys = config['author_sort_keys']
df_auth = df.repartition('author').sort(auth_keys, ascending=True)
df_auth = df_auth.sortWithinPartitions(auth_keys, ascending=True)
df_auth.write.parquet(config['output_by_author'], mode=mode, compression='snappy')
df_auth.write.parquet(out_by_author, mode='overwrite', compression='snappy')
def merge_layers(config):