1
0
This commit is contained in:
Nathan TeBlunthuis 2024-12-19 23:34:55 -08:00
parent 638ab78375
commit a8a92d30df

View File

@@ -7,6 +7,7 @@ from functools import partial
def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=None, min_df=None, max_df=None):
#spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate()
spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet(inpath)
df = df.repartition(128*15, ['subreddit',term_colname])
df = df.filter(~ f.col(term_colname).isin(exclude))