1
0

Write clusters to a parquet file and read them back with Spark, instead of creating a Spark DataFrame directly from the in-memory clusters.

This commit is contained in:
Nathan TeBlunthuis 2024-12-31 14:37:50 -08:00
parent a8a86c2440
commit c59d251d19

View File

@@ -29,7 +29,8 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit
ts = ts.join(spk_densities, on='subreddit', how='inner')
clusters = load_clusters(term_clusters_path, author_clusters_path)
spk_clusters = spark.createDataFrame(clusters)
clusters.to_parquet("/tmp/clusters.parquet")
clusters = spark.read.parquet("/tmp/clusters.parquet")
ts = ts.join(spk_clusters, on='subreddit', how='inner')
ts.write.parquet(output, mode='overwrite')