From c59d251d1969c2ffce21642fa9796c70c349543c Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Tue, 31 Dec 2024 14:37:50 -0800
Subject: [PATCH] write clusters and read with spark instead of creating data frame.

---
 timeseries/cluster_timeseries.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/timeseries/cluster_timeseries.py b/timeseries/cluster_timeseries.py
index 2286ab0..0d0eefa 100644
--- a/timeseries/cluster_timeseries.py
+++ b/timeseries/cluster_timeseries.py
@@ -29,7 +29,8 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit
     ts = ts.join(spk_densities, on='subreddit', how='inner')
 
     clusters = load_clusters(term_clusters_path, author_clusters_path)
-    spk_clusters = spark.createDataFrame(clusters)
+    clusters.to_parquet("/tmp/clusters.parquet")
+    spk_clusters = spark.read.parquet("/tmp/clusters.parquet")
     ts = ts.join(spk_clusters, on='subreddit', how='inner')
 
     ts.write.parquet(output, mode='overwrite')
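
Note on the pattern: the hunk above replaces spark.createDataFrame(clusters) with a round trip through a parquet file on local disk, which Spark then reads back as a DataFrame. The sketch below is a minimal, self-contained illustration of that pattern, not the project's code: the SparkSession setup and the toy clusters frame are assumptions, and pandas.DataFrame.to_parquet requires pyarrow (or fastparquet) to be installed.

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical stand-in for the frame returned by load_clusters():
# one cluster label per subreddit.
clusters = pd.DataFrame({
    "subreddit": ["askscience", "politics"],
    "cluster": [3, 17],
})

# Write the pandas frame to parquet and let Spark read the file back,
# instead of converting it in-process with spark.createDataFrame(clusters).
clusters.to_parquet("/tmp/clusters.parquet")
spk_clusters = spark.read.parquet("/tmp/clusters.parquet")

spk_clusters.show()

Reading the parquet file lets Spark take the schema from the file itself rather than inferring it from pandas dtypes, which is presumably the point of the change; the fixed /tmp path is taken directly from the patch.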