datasets/: stage new layer before touching live datasets in add_months
Replace the approach of writing with mode='append' directly to the live datasets with a safer staging workflow: Part 2 writes the new sorted layer to temporary staging directories, the user verifies it, and then a separate copy step adds the files to the live datasets. Live datasets are never touched until the copy step, and the copy only adds files — nothing is deleted or overwritten.

- sort_and_write gains out_by_subreddit/out_by_author params (replacing the mode param) so Part 2 can target staging paths
- comments_part2.py, submissions_part2.py: expose the new params via Fire
- add_months.sh: rewritten with explicit staging dirs, a verify checkpoint, and a find-based copy step

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -256,13 +256,14 @@ def gen_task_list(config, script_name, dumpdir=None, tasklist=None):
|
||||
|
||||
# --- Part 2: spark sort + repartition --------------------------------------
|
||||
|
||||
def sort_and_write(config, indir=None, mode='overwrite'):
|
||||
def sort_and_write(config, indir=None, out_by_subreddit=None, out_by_author=None):
|
||||
"""Read a directory of per-source parquets, sort and repartition twice
|
||||
(once by subreddit, once by author), and write the two final datasets.
|
||||
(once by subreddit, once by author), and write the two output datasets.
|
||||
|
||||
indir defaults to config['outdir']. mode is passed to parquet write and
|
||||
may be 'overwrite' (default, used by build_from_scratch) or 'append'
|
||||
(used by add_months to layer new data alongside existing files).
|
||||
indir defaults to config['outdir'].
|
||||
out_by_subreddit and out_by_author default to config['output_by_subreddit']
|
||||
and config['output_by_author']. Override them to write to staging directories
|
||||
instead of the live datasets (see add_months.sh).
|
||||
|
||||
Pyspark is imported lazily so Part 1 callers don't pay the Spark startup
|
||||
cost.
|
||||
@@ -270,6 +271,9 @@ def sort_and_write(config, indir=None, mode='overwrite'):
|
||||
from pyspark.sql import SparkSession, functions as f
|
||||
|
||||
indir = indir or config['outdir']
|
||||
out_by_subreddit = out_by_subreddit or config['output_by_subreddit']
|
||||
out_by_author = out_by_author or config['output_by_author']
|
||||
|
||||
spark = SparkSession.builder.appName(config['app_name']).getOrCreate()
|
||||
|
||||
df = spark.read.parquet(indir, compression='snappy')
|
||||
@@ -286,12 +290,12 @@ def sort_and_write(config, indir=None, mode='overwrite'):
|
||||
sub_keys = config['subreddit_sort_keys']
|
||||
df_sub = df.repartition('subreddit').sort(sub_keys, ascending=True)
|
||||
df_sub = df_sub.sortWithinPartitions(sub_keys, ascending=True)
|
||||
df_sub.write.parquet(config['output_by_subreddit'], mode=mode, compression='snappy')
|
||||
df_sub.write.parquet(out_by_subreddit, mode='overwrite', compression='snappy')
|
||||
|
||||
auth_keys = config['author_sort_keys']
|
||||
df_auth = df.repartition('author').sort(auth_keys, ascending=True)
|
||||
df_auth = df_auth.sortWithinPartitions(auth_keys, ascending=True)
|
||||
df_auth.write.parquet(config['output_by_author'], mode=mode, compression='snappy')
|
||||
df_auth.write.parquet(out_by_author, mode='overwrite', compression='snappy')
|
||||
|
||||
|
||||
def merge_layers(config):
|
||||
|
||||
Reference in New Issue
Block a user