Spark config tweaks: remove the _JAVA_OPTIONS heap override and set spark.executor.cores in sort_tf.
This commit is contained in:
parent
224fb89317
commit
5a131053af
@ -14,7 +14,6 @@ from nltk.util import ngrams
|
|||||||
import string
|
import string
|
||||||
from random import random
|
from random import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
os.environ["_JAVA_OPTIONS"]="-Xmx920g"
|
|
||||||
|
|
||||||
# remove urls
|
# remove urls
|
||||||
# taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
|
# taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
|
||||||
@ -248,6 +247,7 @@ def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subr
|
|||||||
from pyspark.sql import functions as f
|
from pyspark.sql import functions as f
|
||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g'}).getOrCreate()
|
spark = SparkSession.builder.config(map={'spark.executor.memory':'900g'}).getOrCreate()
|
||||||
|
spark = SparkSession.builder.config(map={'spark.executor.cores':128}).getOrCreate()
|
||||||
df = spark.read.parquet(input_parquet)
|
df = spark.read.parquet(input_parquet)
|
||||||
df = df.repartition(2000,tf_name)
|
df = df.repartition(2000,tf_name)
|
||||||
df = df.sort([tf_name,'week','subreddit'])
|
df = df.sort([tf_name,'week','subreddit'])
|
||||||
|
Loading…
Reference in New Issue
Block a user