1
0

improve spark configuration.

This commit is contained in:
Nathan TeBlunthuis 2024-12-04 10:43:13 -08:00
parent 89d03dd956
commit 5d70d3eb6d
2 changed files with 2 additions and 3 deletions

View File

@ -247,8 +247,7 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr
from pyspark.sql import functions as f from pyspark.sql import functions as f
from pyspark.sql import SparkSession from pyspark.sql import SparkSession
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g'}).getOrCreate() spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate()
spark = SparkSession.builder.config(map={'spark.executor.cores':128}).getOrCreate()
df = spark.read.parquet(input_parquet) df = spark.read.parquet(input_parquet)
df = df.repartition(2000,tf_name) df = df.repartition(2000,tf_name)
df = df.sort([tf_name,'week','subreddit']) df = df.sort([tf_name,'week','subreddit'])

View File

@ -4,7 +4,7 @@ from pyspark.sql import Window
from pyspark.sql import SparkSession from pyspark.sql import SparkSession
import numpy as np import numpy as np
spark = SparkSession.builder.getOrCreate() spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate()
df = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_comment_ngrams_10p_sample/") df = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_comment_ngrams_10p_sample/")
df2 = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_post_ngrams_10p_sample/") df2 = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_post_ngrams_10p_sample/")
df = df.union(df2) df = df.union(df2)