From 5d70d3eb6d6a4ecde5d5eee5f722ab6c5ab521bd Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 4 Dec 2024 10:43:13 -0800 Subject: [PATCH] improve spark configuration. --- ngrams/term_frequencies.py | 3 +-- ngrams/top_comment_phrases.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py index 07ba90e..197dc1f 100755 --- a/ngrams/term_frequencies.py +++ b/ngrams/term_frequencies.py @@ -247,8 +247,7 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr from pyspark.sql import functions as f from pyspark.sql import SparkSession - spark = SparkSession.builder.config(map={'spark.executor.memory':'900g'}).getOrCreate() - spark = SparkSession.builder.config(map={'spark.executor.cores':128}).getOrCreate() + spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate() df = spark.read.parquet(input_parquet) df = df.repartition(2000,tf_name) df = df.sort([tf_name,'week','subreddit']) diff --git a/ngrams/top_comment_phrases.py b/ngrams/top_comment_phrases.py index 00fcc9e..d6807a4 100755 --- a/ngrams/top_comment_phrases.py +++ b/ngrams/top_comment_phrases.py @@ -4,7 +4,7 @@ from pyspark.sql import Window from pyspark.sql import SparkSession import numpy as np -spark = SparkSession.builder.getOrCreate() +spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate() df = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_comment_ngrams_10p_sample/") df2 = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_post_ngrams_10p_sample/") df = df.union(df2)