
add script for pulling cluster timeseries

Nate E TeBlunthuis 2021-05-10 18:24:22 -07:00
parent a013f6718b
commit 47ba04aa97


@@ -2,11 +2,11 @@ import pandas as pd
 import numpy as np
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
-from choose_clusters import load_clusters, load_densities
+from .choose_clusters import load_clusters, load_densities
 import fire
 from pathlib import Path
 
-def main(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
+def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
          author_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather",
          term_densities_path="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
          author_densities_path="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather",
@@ -34,4 +34,4 @@ def main(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_
     ts.write.parquet(output, mode='overwrite')
 
 if __name__ == "__main__":
-    fire.Fire(main)
+    fire.Fire(build_cluster_timeseries)
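
Since fire.Fire(build_cluster_timeseries) turns the function's keyword arguments into a command-line interface, the renamed entry point can be driven either from Python or from the shell. The sketch below is illustrative only: the package and module names (timeseries.cluster_timeseries) are assumptions not shown in this diff; only the function name and the default paths come from the change above.

# Minimal usage sketch; module path timeseries.cluster_timeseries is assumed.
from timeseries.cluster_timeseries import build_cluster_timeseries

# Calling with no arguments uses the /gscratch/comdata/... defaults
# shown in the function signature above.
build_cluster_timeseries()

# Because fire.Fire() maps keyword arguments to command-line flags,
# a roughly equivalent shell invocation would be:
#   python3 -m timeseries.cluster_timeseries \
#       --term_clusters_path=/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather
# The -m (module) form matters now that the script uses the relative
# import `from .choose_clusters import ...`, which fails when the file
# is executed directly as a standalone script.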