improve spark configuration.
This commit is contained in:
parent
89d03dd956
commit
5d70d3eb6d
@ -247,8 +247,7 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr
|
|||||||
|
|
||||||
from pyspark.sql import functions as f
|
from pyspark.sql import functions as f
|
||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g'}).getOrCreate()
|
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate()
|
||||||
spark = SparkSession.builder.config(map={'spark.executor.cores':128}).getOrCreate()
|
|
||||||
df = spark.read.parquet(input_parquet)
|
df = spark.read.parquet(input_parquet)
|
||||||
df = df.repartition(2000,tf_name)
|
df = df.repartition(2000,tf_name)
|
||||||
df = df.sort([tf_name,'week','subreddit'])
|
df = df.sort([tf_name,'week','subreddit'])
|
||||||
|
@ -4,7 +4,7 @@ from pyspark.sql import Window
|
|||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
spark = SparkSession.builder.getOrCreate()
|
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate()
|
||||||
df = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_comment_ngrams_10p_sample/")
|
df = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_comment_ngrams_10p_sample/")
|
||||||
df2 = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_post_ngrams_10p_sample/")
|
df2 = spark.read.text("/gscratch/comdata/output/reddit_ngrams/reddit_post_ngrams_10p_sample/")
|
||||||
df = df.union(df2)
|
df = df.union(df2)
|
||||||
|
Loading…
Reference in New Issue
Block a user