improve spark configuration.
This commit is contained in:
parent
89d03dd956
commit
5d70d3eb6d
@ -247,8 +247,7 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr
|
||||
|
||||
from pyspark.sql import functions as f
|
||||
from pyspark.sql import SparkSession
|
||||
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g'}).getOrCreate()
|
||||
spark = SparkSession.builder.config(map={'spark.executor.cores':128}).getOrCreate()
|
||||
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate()
|
||||
df = spark.read.parquet(input_parquet)
|
||||
df = df.repartition(2000,tf_name)
|
||||
df = df.sort([tf_name,'week','subreddit'])
|
||||
|
@ -4,7 +4,7 @@ from pyspark.sql import Window
|
||||
from pyspark.sql import SparkSession
|
||||
import numpy as np
|
||||
|
||||
spark = SparkSession.builder.getOrCreate()
|
||||
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate()
|
||||
df = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_comment_ngrams_10p_sample/")
|
||||
df2 = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_post_ngrams_10p_sample/")
|
||||
df = df.union(df2)
|
||||
|
Loading…
Reference in New Issue
Block a user