1
0

correct tf_name

This commit is contained in:
Nathan TeBlunthuis 2024-12-01 14:38:48 -08:00
parent e40cc45d40
commit a31d8b26eb

View File

@ -241,15 +241,16 @@ def weekly_tf(partition,
author_writer.close()
def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subreddit.parquet/",
output_parquet="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/"):
output_parquet="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/",
tf_name='term'):
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet(input_parquet)
df = df.repartition(2000,'term')
df = df.sort(['term','week','subreddit'])
df = df.sortWithinPartitions(['term','week','subreddit'])
df = df.repartition(2000,tf_name)
df = df.sort([tf_name,'week','subreddit'])
df = df.sortWithinPartitions([tf_name,'week','subreddit'])
df.write.parquet(output_parquet,mode='overwrite',compression='snappy')
def gen_task_list(mwe_pass='first',