Set Spark executor memory usage.
This commit is contained in:
parent
a31d8b26eb
commit
9911f758f9
@ -247,6 +247,8 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr
|
|||||||
from pyspark.sql import functions as f
|
from pyspark.sql import functions as f
|
||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
spark = SparkSession.builder.getOrCreate()
|
spark = SparkSession.builder.getOrCreate()
|
||||||
|
spark.config('spark.executor.memory','900g')
|
||||||
|
|
||||||
df = spark.read.parquet(input_parquet)
|
df = spark.read.parquet(input_parquet)
|
||||||
df = df.repartition(2000,tf_name)
|
df = df.repartition(2000,tf_name)
|
||||||
df = df.sort([tf_name,'week','subreddit'])
|
df = df.sort([tf_name,'week','subreddit'])
|
||||||
|
Loading…
Reference in New Issue
Block a user