1
0

set memory usage.

This commit is contained in:
Nathan TeBlunthuis 2024-12-01 14:55:38 -08:00
parent a31d8b26eb
commit 9911f758f9

View File

@ -247,6 +247,8 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
# Request 900g of executor memory via the session builder. A SparkSession
# instance has no `config` method (only the builder does), so the option
# must be supplied before getOrCreate().
# NOTE(review): getOrCreate() reuses an already-running session if one
# exists; in that case the executor memory setting presumably will not
# resize existing executors -- confirm how this job is launched.
spark = (
    SparkSession.builder
    .config('spark.executor.memory', '900g')
    .getOrCreate()
)
df = spark.read.parquet(input_parquet)
df = df.repartition(2000,tf_name)
df = df.sort([tf_name,'week','subreddit'])