Replace the four per-type scripts (comments/submissions x part1/part2) with two merged scripts that share all of their plumbing — only the schema and JSON parser differ between types. Drop the per-source part rolling; one parquet per input zst, since Spark handles big parquet files via internal row groups. Add two thin runner scripts for the two common workflows: build_from_scratch.sh wipes the temp dirs and processes everything, add_new_month.sh takes YYYY-MM and parses just that month before re-running the Spark sort. Every step in the runners is a separate command so individual stages can be copied out and run standalone for debugging. Also fixes several lurking bugs in the original code: the hardcoded /gscratch/comdata/users/nathante/ output path in comments Part 2; the df2 = df.sortWithinPartitions typo in submissions Part 2 that threw away the preceding global sort; references to a missing parse_submissions.sh in the old .sh runners; and the asymmetry where comments_2_parquet_part1.py wasn't per-file/fire-driven the way submissions_2_parquet_part1.py was. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
74 lines
3.1 KiB
Python
Executable File
74 lines
3.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Stage 2 of the dump-to-parquet pipeline: read the per-source parquet
# files produced by parquet_part1.py, sort by subreddit and by author
# (two passes), and write the final repartitioned parquet datasets.
#
# CLI:
#   parquet_part2.py comments
#   parquet_part2.py submissions
#
# This is a Spark job; launch via start_spark_and_run.sh.
|
|
|
|
import fire
|
|
import pyspark
|
|
from pyspark.sql import functions as f
|
|
from pyspark.sql import SparkSession
|
|
|
|
|
|
# Per-dump-type settings consumed by sort_and_write(): where the stage-1
# parquet lives, where the two sorted outputs go, the column order used
# for each sort pass, and the Spark application name.
TYPES = {
    "comments": {
        "input_dir": "/gscratch/comdata/output/temp/reddit_comments.parquet",
        "output_by_subreddit": "/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",
        "output_by_author": "/gscratch/comdata/output/reddit_comments_by_author.parquet",
        "subreddit_sort_keys": [
            "subreddit",
            "CreatedAt",
            "link_id",
            "parent_id",
            "Year",
            "Month",
            "Day",
        ],
        "author_sort_keys": [
            "author",
            "CreatedAt",
            "subreddit",
            "link_id",
            "parent_id",
            "Year",
            "Month",
            "Day",
        ],
        "app_name": "Reddit comments to parquet",
    },
    "submissions": {
        "input_dir": "/gscratch/comdata/output/temp/reddit_submissions.parquet",
        "output_by_subreddit": "/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet",
        "output_by_author": "/gscratch/comdata/output/reddit_submissions_by_author.parquet",
        "subreddit_sort_keys": ["subreddit", "CreatedAt", "id"],
        "author_sort_keys": ["author", "CreatedAt", "id"],
        "app_name": "Reddit submissions to parquet",
    },
}
|
|
|
|
|
|
def _write_sorted(df, partition_col, sort_keys, output_path):
    """Repartition by partition_col, sort by sort_keys, and overwrite output_path."""
    # NOTE(review): the global sort() re-shuffles the data, so the preceding
    # repartition() and the trailing sortWithinPartitions() look redundant.
    # They are kept to preserve the existing job plan -- confirm before removing.
    out = df.repartition(partition_col)
    out = out.sort(sort_keys, ascending=True)
    out = out.sortWithinPartitions(sort_keys, ascending=True)
    out.write.parquet(output_path, mode='overwrite', compression='snappy')


def sort_and_write(dump_type):
    """Sort one dump type's stage-1 parquet by subreddit and by author.

    Reads the parquet dataset at the configured ``input_dir``, lowercases
    ``subreddit``, renames ``created_utc`` to ``CreatedAt``, derives
    ``Month``/``Year``/``Day`` from ``CreatedAt``, and then writes two
    sorted copies (``output_by_subreddit`` and ``output_by_author``),
    overwriting whatever is already at those paths.

    Args:
        dump_type: Key into ``TYPES`` ('comments' or 'submissions').

    Raises:
        KeyError: If ``dump_type`` is not a key of ``TYPES``.
    """
    config = TYPES[dump_type]

    # Settings are passed through the session builder. The previous
    # ``pyspark.SparkConf().set(...)`` calls each created a fresh SparkConf
    # that was never attached to the session, so they had no effect.
    spark = (
        SparkSession.builder
        .appName(config['app_name'])
        .config("spark.sql.shuffle.partitions", "2000")
        .config('spark.sql.crossJoin.enabled', "true")
        .config('spark.debug.maxToStringFields', "200")
        .getOrCreate()
    )

    # ``compression`` is a write-side option, so it is not passed on read.
    df = spark.read.parquet(config['input_dir'])

    # Lowercase the subreddit name. Done as add/drop/rename (rather than an
    # in-place withColumn) so the column ends up last, as it did before.
    df = df.withColumn("subreddit_2", f.lower(f.col('subreddit')))
    df = df.drop('subreddit')
    df = df.withColumnRenamed('subreddit_2', 'subreddit')

    # Date parts derived from the renamed timestamp column; the comments
    # sort keys reference Year/Month/Day.
    df = df.withColumnRenamed("created_utc", "CreatedAt")
    df = df.withColumn("Month", f.month(f.col("CreatedAt")))
    df = df.withColumn("Year", f.year(f.col("CreatedAt")))
    df = df.withColumn("Day", f.dayofmonth(f.col("CreatedAt")))

    # sort + repartition by subreddit
    _write_sorted(df, 'subreddit', config['subreddit_sort_keys'], config['output_by_subreddit'])

    # sort + repartition by author
    _write_sorted(df, 'author', config['author_sort_keys'], config['output_by_author'])
|
|
|
|
|
|
# Script entry point: python-fire maps the single positional CLI argument
# onto sort_and_write's ``dump_type`` parameter.
if __name__ == "__main__":
    fire.Fire(sort_and_write)
|