diff --git a/ngrams/term_frequencies.py b/ngrams/term_frequencies.py
index b83f55d..5a7b166 100755
--- a/ngrams/term_frequencies.py
+++ b/ngrams/term_frequencies.py
@@ -247,6 +247,8 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr
     from pyspark.sql import functions as f
     from pyspark.sql import SparkSession
-    spark = SparkSession.builder.getOrCreate()
+    # Executor memory is a launch-time setting, so it has to go on the builder.
+    spark = SparkSession.builder.config('spark.executor.memory', '900g').getOrCreate()
+
     df = spark.read.parquet(input_parquet)
     df = df.repartition(2000,tf_name)
     df = df.sort([tf_name,'week','subreddit'])