diff --git a/timeseries/cluster_timeseries.py b/timeseries/cluster_timeseries.py index 0d0eefa..7ffcf0f 100644 --- a/timeseries/cluster_timeseries.py +++ b/timeseries/cluster_timeseries.py @@ -30,7 +30,7 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit clusters = load_clusters(term_clusters_path, author_clusters_path) clusters.to_parquet("/tmp/clusters.parquet") - clusters = spark.read.parquet("/tmp/clusters.parquet") + spk_clusters = spark.read.parquet("/tmp/clusters.parquet") ts = ts.join(spk_clusters, on='subreddit', how='inner') ts.write.parquet(output, mode='overwrite')