From 5a131053afdd4c881bfad153d0d2d2dd980ecc4b Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Sun, 1 Dec 2024 15:41:47 -0800 Subject: [PATCH] spark config tweaks. --- ngrams/term_frequencies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py index c410462..4bf5497 100755 --- a/ngrams/term_frequencies.py +++ b/ngrams/term_frequencies.py @@ -14,7 +14,6 @@ from nltk.util import ngrams import string from random import random from pathlib import Path -os.environ["_JAVA_OPTIONS"]="-Xmx920g" # remove urls # taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url @@ -248,6 +247,7 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr from pyspark.sql import functions as f from pyspark.sql import SparkSession spark = SparkSession.builder.config(map={'spark.executor.memory':'900g'}).getOrCreate() + spark = SparkSession.builder.config(map={'spark.executor.cores':128}).getOrCreate() df = spark.read.parquet(input_parquet) df = df.repartition(2000,tf_name) df = df.sort([tf_name,'week','subreddit'])