#!/usr/bin/env python3
# Stage 2 of the dump-to-parquet pipeline: read the per-source parquet
# files produced by parquet_part1.py, sort by subreddit and by author
# (two passes), and write the final repartitioned parquet datasets.
#
# CLI:
#   parquet_part2.py comments
#   parquet_part2.py submissions
#
# This is a Spark job; launch via start_spark_and_run.sh.

import fire
import pyspark
from pyspark.sql import functions as f
from pyspark.sql import SparkSession

# Per-dump-type settings: where to read the stage-1 parquet from, where to
# write the two sorted outputs, the sort keys for each pass, and the Spark
# application name.
TYPES = {
    'comments': {
        'input_dir': "/gscratch/comdata/output/temp/reddit_comments.parquet",
        'output_by_subreddit': "/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",
        'output_by_author': "/gscratch/comdata/output/reddit_comments_by_author.parquet",
        'subreddit_sort_keys': ["subreddit", "CreatedAt", "link_id", "parent_id", "Year", "Month", "Day"],
        'author_sort_keys': ["author", "CreatedAt", "subreddit", "link_id", "parent_id", "Year", "Month", "Day"],
        'app_name': "Reddit comments to parquet",
    },
    'submissions': {
        'input_dir': "/gscratch/comdata/output/temp/reddit_submissions.parquet",
        'output_by_subreddit': "/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet",
        'output_by_author': "/gscratch/comdata/output/reddit_submissions_by_author.parquet",
        'subreddit_sort_keys': ["subreddit", "CreatedAt", "id"],
        'author_sort_keys': ["author", "CreatedAt", "id"],
        'app_name': "Reddit submissions to parquet",
    },
}


def _write_sorted(df, partition_col, sort_keys, output_path):
    """Repartition *df* by *partition_col*, sort it by *sort_keys*, and
    write it to *output_path* as snappy-compressed parquet (overwriting)."""
    # NOTE(review): the global sort() re-shuffles the data by range, so the
    # preceding repartition() does not decide the final file layout, and the
    # trailing sortWithinPartitions() repeats an ordering sort() already
    # established. The sequence is kept as-is to preserve the existing
    # pipeline; confirm which layout is actually wanted before simplifying.
    out = df.repartition(partition_col)
    out = out.sort(sort_keys, ascending=True)
    out = out.sortWithinPartitions(sort_keys, ascending=True)
    out.write.parquet(output_path, mode='overwrite', compression='snappy')


def sort_and_write(dump_type):
    """Sort one stage-1 parquet dataset by subreddit and by author.

    Reads the parquet dataset configured for *dump_type*, lowercases the
    ``subreddit`` column, renames ``created_utc`` to ``CreatedAt``, derives
    ``Month``/``Year``/``Day`` columns from it, and writes two sorted copies
    (one keyed by subreddit, one keyed by author), overwriting any existing
    output.

    Args:
        dump_type: A key of ``TYPES`` ('comments' or 'submissions').

    Raises:
        KeyError: If *dump_type* is not a key of ``TYPES``.
    """
    try:
        config = TYPES[dump_type]
    except KeyError:
        # Same exception type as a plain lookup, but say what is accepted.
        raise KeyError(
            "unknown dump type {!r}; expected one of {}".format(
                dump_type, sorted(TYPES))
        ) from None

    # Settings must go through the session builder to take effect: calling
    # .set() on a freshly constructed pyspark.SparkConf() only mutates a
    # throwaway object that no session ever reads.
    # NOTE(review): Spark 3.x appears to spell the last key
    # "spark.sql.debug.maxToStringFields" -- confirm against the cluster's
    # Spark version.
    spark = (
        SparkSession.builder
        .appName(config['app_name'])
        .config("spark.sql.shuffle.partitions", 2000)
        .config('spark.sql.crossJoin.enabled', "true")
        .config('spark.debug.maxToStringFields', 200)
        .getOrCreate()
    )

    # Compression is detected from the files on read; 'compression' is a
    # write-side option, so it is not passed to the reader.
    df = spark.read.parquet(config['input_dir'])

    # Lowercase subreddit names via a temporary column. Adding, dropping and
    # renaming (rather than overwriting in place) leaves the lowercased
    # column last in the schema, which is the column order of the output.
    df = df.withColumn("subreddit_2", f.lower(f.col('subreddit')))
    df = df.drop('subreddit')
    df = df.withColumnRenamed('subreddit_2', 'subreddit')

    # Calendar fields derived from the creation timestamp.
    df = df.withColumnRenamed("created_utc", "CreatedAt")
    df = df.withColumn("Month", f.month(f.col("CreatedAt")))
    df = df.withColumn("Year", f.year(f.col("CreatedAt")))
    df = df.withColumn("Day", f.dayofmonth(f.col("CreatedAt")))

    # Pass 1: sort + repartition by subreddit.
    _write_sorted(df, 'subreddit', config['subreddit_sort_keys'],
                  config['output_by_subreddit'])

    # Pass 2: sort + repartition by author.
    _write_sorted(df, 'author', config['author_sort_keys'],
                  config['output_by_author'])


if __name__ == "__main__":
    fire.Fire(sort_and_write)