datasets/: replace add_new_month with layered append workflow
Add add_months.sh and merge_layers.sh implementing a layered append strategy for incremental dataset updates. Each incremental run appends new sorted partition files alongside existing ones rather than re-sorting the full corpus, which is prohibitively slow at this dataset scale.

- dumps_helper.py: sort_and_write gains indir/mode params; new merge_layers function collapses accumulated layers via atomic rename
- comments_part2.py, submissions_part2.py: expose --indir/--mode via Fire
- add_months.sh: new layered append script (not yet tested)
- merge_layers.sh: new layer collapse script (not yet tested)
- comments_merge.py, submissions_merge.py: Spark entry points for merge
- add_new_month.sh: deleted (full re-sort each add is redundant with build_from_scratch at corpus scale)
- README.md: document three workflows; flag untested sections

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -10,6 +10,7 @@ task-list generator, and the Spark sort are all shared here.
|
||||
"""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
from itertools import islice
|
||||
|
||||
@@ -255,16 +256,23 @@ def gen_task_list(config, script_name, dumpdir=None, tasklist=None):
|
||||
|
||||
# --- Part 2: spark sort + repartition --------------------------------------
|
||||
|
||||
def sort_and_write(config):
|
||||
"""Read the directory of per-source parquets, sort and repartition
|
||||
twice (once by subreddit, once by author), and write the two final
|
||||
datasets. Pyspark is imported lazily so Part 1 callers don't pay the
|
||||
Spark startup cost."""
|
||||
def sort_and_write(config, indir=None, mode='overwrite'):
|
||||
"""Read a directory of per-source parquets, sort and repartition twice
|
||||
(once by subreddit, once by author), and write the two final datasets.
|
||||
|
||||
indir defaults to config['outdir']. mode is passed to parquet write and
|
||||
may be 'overwrite' (default, used by build_from_scratch) or 'append'
|
||||
(used by add_months to layer new data alongside existing files).
|
||||
|
||||
Pyspark is imported lazily so Part 1 callers don't pay the Spark startup
|
||||
cost.
|
||||
"""
|
||||
from pyspark.sql import SparkSession, functions as f
|
||||
|
||||
indir = indir or config['outdir']
|
||||
spark = SparkSession.builder.appName(config['app_name']).getOrCreate()
|
||||
|
||||
df = spark.read.parquet(config['outdir'], compression='snappy')
|
||||
df = spark.read.parquet(indir, compression='snappy')
|
||||
|
||||
df = df.withColumn("subreddit_2", f.lower(f.col('subreddit')))
|
||||
df = df.drop('subreddit')
|
||||
@@ -278,9 +286,54 @@ def sort_and_write(config):
|
||||
sub_keys = config['subreddit_sort_keys']
|
||||
df_sub = df.repartition('subreddit').sort(sub_keys, ascending=True)
|
||||
df_sub = df_sub.sortWithinPartitions(sub_keys, ascending=True)
|
||||
df_sub.write.parquet(config['output_by_subreddit'], mode='overwrite', compression='snappy')
|
||||
df_sub.write.parquet(config['output_by_subreddit'], mode=mode, compression='snappy')
|
||||
|
||||
auth_keys = config['author_sort_keys']
|
||||
df_auth = df.repartition('author').sort(auth_keys, ascending=True)
|
||||
df_auth = df_auth.sortWithinPartitions(auth_keys, ascending=True)
|
||||
df_auth.write.parquet(config['output_by_author'], mode='overwrite', compression='snappy')
|
||||
df_auth.write.parquet(config['output_by_author'], mode=mode, compression='snappy')
|
||||
|
||||
|
||||
def merge_layers(config):
    """Collapse all accumulated layers in the final datasets into a single
    clean layer.

    Reads the existing by_subreddit dataset (which contains all layers),
    re-sorts it twice (once with the subreddit sort keys, once with the
    author sort keys), writes both results to sibling '.merging' paths, and
    only then swaps each one into place: final -> '.old', '.merging' ->
    final. The '.old' directories are deleted after both swaps succeed.

    Interrupting before the renames leaves the originals untouched (at
    worst a partial '.merging' directory remains, which the next run
    overwrites). Each rename is a single filesystem operation, but the
    four renames together are NOT atomic: an interruption in the middle can
    leave one dataset merged and the other not, or leave '.old' directories
    behind. Inspect the datasets and delete any '.old' directory manually
    before re-running.

    Raises FileExistsError up front, before any Spark work starts, if a
    '.old' directory from an earlier interrupted run is still present;
    otherwise the rename onto it would fail only after the full sort had
    already been paid for.

    NOTE(review): os.rename / shutil.rmtree act on the local filesystem, so
    this assumes the configured output paths are plain local paths rather
    than hdfs:// or s3:// URIs -- confirm against the cluster setup.

    Pyspark is imported lazily so Part 1 callers don't pay the Spark startup
    cost.
    """
    final_sub = config['output_by_subreddit']
    final_auth = config['output_by_author']
    tmp_sub, old_sub = final_sub + '.merging', final_sub + '.old'
    tmp_auth, old_auth = final_auth + '.merging', final_auth + '.old'

    # Fail fast: a stale backup means a previous run was interrupted
    # mid-swap, and renaming onto it would fail after the expensive sort.
    for stale in (old_sub, old_auth):
        if os.path.exists(stale):
            raise FileExistsError(
                f"{stale} already exists (left over from an interrupted "
                "merge?); inspect it and delete it before re-running"
            )

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName(config['app_name'] + ' merge layers').getOrCreate()

    # Both final datasets have identical rows; read from by_subreddit.
    df = spark.read.parquet(final_sub)

    layouts = (
        ('subreddit', config['subreddit_sort_keys'], tmp_sub),
        ('author', config['author_sort_keys'], tmp_auth),
    )
    for partition_col, sort_keys, staged in layouts:
        ordered = df.repartition(partition_col).sort(sort_keys, ascending=True)
        ordered = ordered.sortWithinPartitions(sort_keys, ascending=True)
        ordered.write.parquet(staged, mode='overwrite', compression='snappy')

    # Swap only after BOTH staged datasets are fully written, and delete the
    # backups only after BOTH swaps succeeded, so a failure part-way through
    # never destroys the last good copy.
    _swap_into_place(final_sub, tmp_sub, old_sub)
    _swap_into_place(final_auth, tmp_auth, old_auth)
    shutil.rmtree(old_sub)
    shutil.rmtree(old_auth)


def _swap_into_place(final, staged, backup):
    """Move final aside to backup, then move staged to final; put the
    backup back if the second rename fails."""
    os.rename(final, backup)
    try:
        os.rename(staged, final)
    except OSError:
        # final is missing at this point; restore it before propagating.
        os.rename(backup, final)
        raise
|
||||
|
||||
Reference in New Issue
Block a user