diff --git a/timeseries/cluster_timeseries.py b/timeseries/cluster_timeseries.py index 2286ab0..0d0eefa 100644 --- a/timeseries/cluster_timeseries.py +++ b/timeseries/cluster_timeseries.py @@ -29,7 +29,8 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit ts = ts.join(spk_densities, on='subreddit', how='inner') clusters = load_clusters(term_clusters_path, author_clusters_path) - spk_clusters = spark.createDataFrame(clusters) + clusters.to_parquet("/tmp/clusters.parquet") + spk_clusters = spark.read.parquet("/tmp/clusters.parquet") ts = ts.join(spk_clusters, on='subreddit', how='inner') ts.write.parquet(output, mode='overwrite')