datasets/: replace add_new_month with layered append workflow

Add add_months.sh and merge_layers.sh implementing a layered append strategy for incremental dataset updates. Each incremental run appends new sorted partition files alongside existing ones rather than re-sorting the full corpus, which is prohibitively slow at this dataset scale.

- dumps_helper.py: sort_and_write gains indir/mode params; new merge_layers function collapses accumulated layers via atomic rename
- comments_part2.py, submissions_part2.py: expose --indir/--mode via Fire
- add_months.sh: new layered append script (not yet tested)
- merge_layers.sh: new layer collapse script (not yet tested)
- comments_merge.py, submissions_merge.py: Spark entry points for merge
- add_new_month.sh: deleted (full re-sort each add is redundant with build_from_scratch at corpus scale)
- README.md: document three workflows; flag untested sections

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-25 17:59:36 -07:00
parent 1851132a06
commit 2d1d760142
10 changed files with 273 additions and 85 deletions
--- a/datasets/dumps_helper.py
+++ b/datasets/dumps_helper.py
@@ -10,6 +10,7 @@ task-list generator, and the Spark sort are all shared here.
 """

 import os
+import shutil
 from datetime import datetime
 from itertools import islice

@@ -255,16 +256,23 @@ def gen_task_list(config, script_name, dumpdir=None, tasklist=None):

 # --- Part 2: spark sort + repartition --------------------------------------

-def sort_and_write(config):
-    """Read the directory of per-source parquets, sort and repartition
-    twice (once by subreddit, once by author), and write the two final
-    datasets. Pyspark is imported lazily so Part 1 callers don't pay the
-    Spark startup cost."""
+def sort_and_write(config, indir=None, mode='overwrite'):
+    """Read a directory of per-source parquets, sort and repartition twice
+    (once by subreddit, once by author), and write the two final datasets.
+
+    indir defaults to config['outdir']. mode is passed to parquet write and
+    may be 'overwrite' (default, used by build_from_scratch) or 'append'
+    (used by add_months to layer new data alongside existing files).
+
+    Pyspark is imported lazily so Part 1 callers don't pay the Spark startup
+    cost.
+    """
    from pyspark.sql import SparkSession, functions as f

+    indir = indir or config['outdir']
    spark = SparkSession.builder.appName(config['app_name']).getOrCreate()

-    df = spark.read.parquet(config['outdir'], compression='snappy')
+    df = spark.read.parquet(indir, compression='snappy')

    df = df.withColumn("subreddit_2", f.lower(f.col('subreddit')))
    df = df.drop('subreddit')
@@ -278,9 +286,54 @@ def sort_and_write(config):
    sub_keys = config['subreddit_sort_keys']
    df_sub = df.repartition('subreddit').sort(sub_keys, ascending=True)
    df_sub = df_sub.sortWithinPartitions(sub_keys, ascending=True)
-    df_sub.write.parquet(config['output_by_subreddit'], mode='overwrite', compression='snappy')
+    df_sub.write.parquet(config['output_by_subreddit'], mode=mode, compression='snappy')

    auth_keys = config['author_sort_keys']
    df_auth = df.repartition('author').sort(auth_keys, ascending=True)
    df_auth = df_auth.sortWithinPartitions(auth_keys, ascending=True)
-    df_auth.write.parquet(config['output_by_author'], mode='overwrite', compression='snappy')
+    df_auth.write.parquet(config['output_by_author'], mode=mode, compression='snappy')
+
+
+def merge_layers(config):
+    """Collapse all accumulated layers in the final datasets into a single
+    clean layer.
+
+    Reads the existing by_subreddit dataset (which contains every layer),
+    re-sorts it twice (by subreddit keys and by author keys), writes both
+    results to sibling '<output>.merging' directories, and only then swaps
+    them into place with renames.
+
+    Interruption behaviour:
+      * Before the renames start, the originals are untouched; a leftover
+        '.merging' directory is simply overwritten by the next run.
+      * Each individual rename is atomic on POSIX, but a swap is two renames
+        (final -> .old, .merging -> final), so the final path is briefly
+        absent in between. If the second rename fails, the original is moved
+        back before the error propagates.
+      * If the process dies after a swap but before cleanup, a '.old'
+        directory is left behind; delete it manually once satisfied.
+
+    Raises FileExistsError up front, before any Spark work is done, if a
+    '.old' directory from an earlier interrupted run is still present.
+
+    Pyspark is imported lazily so Part 1 callers don't pay the Spark startup
+    cost.
+
+    NOTE(review): os.rename / shutil.rmtree act on the local filesystem; this
+    assumes the configured output paths are local directories rather than
+    hdfs:// or s3:// URIs -- confirm against the deployment.
+    """
+    from pyspark.sql import SparkSession
+
+    # Drop any trailing slash so the suffixed names are siblings of the
+    # dataset directory rather than entries inside it.
+    final_sub = config['output_by_subreddit'].rstrip('/')
+    final_auth = config['output_by_author'].rstrip('/')
+    tmp_sub, old_sub = final_sub + '.merging', final_sub + '.old'
+    tmp_auth, old_auth = final_auth + '.merging', final_auth + '.old'
+
+    # Fail fast: a stale backup would make the rename below fail only after
+    # both (expensive) Spark writes had already completed.
+    for leftover in (old_sub, old_auth):
+        if os.path.exists(leftover):
+            raise FileExistsError(
+                f"{leftover} already exists (left over from an interrupted "
+                "merge?); inspect and remove it before merging layers")
+
+    spark = SparkSession.builder.appName(config['app_name'] + ' merge layers').getOrCreate()
+
+    # Both final datasets are expected to hold identical rows, so one read
+    # (from by_subreddit) feeds both outputs.
+    df = spark.read.parquet(final_sub)
+
+    sub_keys = config['subreddit_sort_keys']
+    df_sub = df.repartition('subreddit').sort(sub_keys, ascending=True)
+    df_sub = df_sub.sortWithinPartitions(sub_keys, ascending=True)
+    df_sub.write.parquet(tmp_sub, mode='overwrite', compression='snappy')
+
+    auth_keys = config['author_sort_keys']
+    df_auth = df.repartition('author').sort(auth_keys, ascending=True)
+    df_auth = df_auth.sortWithinPartitions(auth_keys, ascending=True)
+    df_auth.write.parquet(tmp_auth, mode='overwrite', compression='snappy')
+
+    # Both writes must be finished before anything is renamed: df is read
+    # lazily from final_sub, so moving that directory earlier would break the
+    # second write.
+    _swap_in_dir(final_sub, tmp_sub, old_sub)
+    _swap_in_dir(final_auth, tmp_auth, old_auth)
+
+    # Remove the backups only once both datasets have been replaced.
+    shutil.rmtree(old_sub)
+    shutil.rmtree(old_auth)
+
+
+def _swap_in_dir(final, staged, backup):
+    """Replace directory `final` with `staged`, parking the previous contents
+    at `backup`. If promoting `staged` fails, the previous contents are moved
+    back so `final` is not left missing; the original error is re-raised."""
+    os.rename(final, backup)
+    try:
+        os.rename(staged, final)
+    except OSError:
+        os.rename(backup, final)
+        raise