From abe217d2d54d346e06df8fceaec33a2d0f17ae27 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Sun, 1 Dec 2024 15:21:51 -0800 Subject: [PATCH] fix configuration code --- ngrams/term_frequencies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py index 5a7b166..741e914 100755 --- a/ngrams/term_frequencies.py +++ b/ngrams/term_frequencies.py @@ -246,9 +246,10 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr from pyspark.sql import functions as f from pyspark.sql import SparkSession - spark = SparkSession.builder.getOrCreate() - spark.config('spark.executor.memory','900g') + spark = SparkSession.builder.config(map={'spark.executor.memory':'900g'}).getOrCreate() + +getOrCreate() df = spark.read.parquet(input_parquet) df = df.repartition(2000,tf_name) df = df.sort([tf_name,'week','subreddit'])