correct tf_name
This commit is contained in:
parent
e40cc45d40
commit
a31d8b26eb
@ -241,15 +241,16 @@ def weekly_tf(partition,
|
|||||||
author_writer.close()
|
author_writer.close()
|
||||||
|
|
||||||
def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subreddit.parquet/",
|
def sort_tf(input_parquet="/gscratch/comdata/output/temp_reddit_comments_by_subreddit.parquet/",
|
||||||
output_parquet="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/"):
|
output_parquet="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/",
|
||||||
|
tf_name='term'):
|
||||||
|
|
||||||
from pyspark.sql import functions as f
|
from pyspark.sql import functions as f
|
||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
spark = SparkSession.builder.getOrCreate()
|
spark = SparkSession.builder.getOrCreate()
|
||||||
df = spark.read.parquet(input_parquet)
|
df = spark.read.parquet(input_parquet)
|
||||||
df = df.repartition(2000,'term')
|
df = df.repartition(2000,tf_name)
|
||||||
df = df.sort(['term','week','subreddit'])
|
df = df.sort([tf_name,'week','subreddit'])
|
||||||
df = df.sortWithinPartitions(['term','week','subreddit'])
|
df = df.sortWithinPartitions([tf_name,'week','subreddit'])
|
||||||
df.write.parquet(output_parquet,mode='overwrite',compression='snappy')
|
df.write.parquet(output_parquet,mode='overwrite',compression='snappy')
|
||||||
|
|
||||||
def gen_task_list(mwe_pass='first',
|
def gen_task_list(mwe_pass='first',
|
||||||
|
Loading…
Reference in New Issue
Block a user