1
0

Write clusters to a parquet file and read them back with Spark, instead of creating a Spark DataFrame directly from the in-memory clusters.

This commit is contained in:
Nathan TeBlunthuis 2024-12-31 14:37:50 -08:00
parent a8a86c2440
commit c59d251d19

View File

@@ -29,7 +29,8 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit
ts = ts.join(spk_densities, on='subreddit', how='inner')
clusters = load_clusters(term_clusters_path, author_clusters_path)
spk_clusters = spark.createDataFrame(clusters)
clusters.to_parquet("/tmp/clusters.parquet")
clusters = spark.read.parquet("/tmp/clusters.parquet")
ts = ts.join(spk_clusters, on='subreddit', how='inner')
ts.write.parquet(output, mode='overwrite')