# Topic modelling over hosts with gensim NMF.
#
# Part 1 fits an NMF model on per-host bags of tags.
# Part 2 fits an NMF model on a sparse (cluster x host) count matrix built
# from `tf_idf`.
#
# Names expected to be defined earlier in the file: `all_tag_posts_filtered`,
# `clusters`, `tf_idf`, `pl` (polars) and `lil_matrix` (scipy.sparse).
from gensim.corpora.dictionary import Dictionary
from gensim.models import Nmf

# --- Part 1: bag-of-words per host -----------------------------------------
# One row per (post, tag); map each tag to its cluster, then join back on the
# cluster so every post carries the tags listed for that cluster in
# `clusters`.  Finally collect the de-duplicated tags of each host in a list.
host_bow_clusters = (
    all_tag_posts_filtered.explode("tags")
    .rename({"tags": "tag"})
    .join(clusters, on="tag", how="inner")
    .drop("tag")
    .join(clusters, on="cluster", how="inner")
    .drop("cluster")
    .unique(["host", "id", "tag"])
    .group_by("host")
    .agg([pl.col("tag")])
)

bow_str = host_bow_clusters["tag"].to_list()
# Named `tag_dictionary` rather than `dict` so the builtin is not shadowed.
tag_dictionary = Dictionary(bow_str)
bow = [tag_dictionary.doc2bow(doc) for doc in bow_str]
nmf = Nmf(bow, num_topics=10)

# --- Part 2: cluster x host count matrix from tf_idf -----------------------
host_names = tf_idf["host"].unique().sort().to_list()
n_servers = len(host_names)
host_name_lookup = {name: idx for idx, name in enumerate(host_names)}

# Row count is driven by the largest cluster id, not by the number of
# distinct clusters (previous alternative: len(tf_idf.unique("cluster"))).
n_clusters = tf_idf["cluster"].max() + 1

# One representative tag per cluster id, keyed by the id itself.
# `unique` does not preserve row order unless asked to, so positional
# indexing into its result does not reliably map position i to cluster i;
# build an explicit id -> tag lookup once instead of per iteration.
_cluster_reps = clusters.unique("cluster", keep="first", maintain_order=True)
_tag_by_cluster = {
    cluster_id: tag
    for cluster_id, tag in zip(
        _cluster_reps["cluster"].to_list(), _cluster_reps["tag"].to_list()
    )
}
# id2word must have exactly one entry per matrix row; cluster ids that have
# no row in `clusters` get a placeholder label.
id_names = {i: _tag_by_cluster.get(i, f"cluster_{i}") for i in range(n_clusters)}

# Rows are clusters (the "terms"), columns are hosts (the "documents").
m = lil_matrix((n_clusters, n_servers), dtype=int)
for row in tf_idf.iter_rows(named=True):
    m[row["cluster"], host_name_lookup[row["host"]]] = row["count"]

# Not used by the model below; kept because the original script built it.
host_dictionary = Dictionary([host_names])
# NOTE(review): Nmf is given the matrix as CSC with terms on rows and
# documents on columns - confirm against the gensim Nmf `corpus` docs.
nmf = Nmf(corpus=m.tocsc(), num_topics=128, id2word=id_names)