diff --git a/similarities/tfidf.py b/similarities/tfidf.py index 2cfc077..9617ddd 100755 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -8,7 +8,7 @@ from functools import partial def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=None, min_df=None, max_df=None): spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate() df = spark.read.parquet(inpath) - df = df.repartition(1280*15, cols=['subreddit',term_colname]) + df = df.repartition(128*15, cols=['subreddit',term_colname]) df = df.filter(~ f.col(term_colname).isin(exclude)) if included_subreddits is not None: