32 lines
1.1 KiB
Python
32 lines
1.1 KiB
Python
# gensim dict
|
|
from gensim.corpora.dictionary import Dictionary
|
|
from gensim.models import Nmf
|
|
|
|
# Build one list of tags per host, where every original tag is replaced by all
# tags that share a cluster with it.
host_bow_clusters = (
    all_tag_posts_filtered
    # One row per (row, tag) instead of one row holding a list of tags.
    .explode("tags")
    .rename({"tags": "tag"})
    # tag -> cluster: keep only tags that appear in `clusters`.
    .join(clusters, on="tag", how="inner")
    .drop("tag")
    # cluster -> every tag of that cluster.
    .join(clusters, on="cluster", how="inner")
    .drop("cluster")
    # The expansion can yield the same tag several times for one (host, id).
    .unique(subset=["host", "id", "tag"])
    .group_by("host")
    .agg(pl.col("tag"))
)
|
|
|
|
# One document per host: the list of tag strings collected in host_bow_clusters.
bow_str = host_bow_clusters["tag"].to_list()

# Vocabulary mapping each tag string to an integer token id. It is not named
# `dict`, because that would shadow the builtin for everything that runs after
# this point.
tag_dictionary = Dictionary(bow_str)

# Sparse (token_id, count) representation of every host's tag list.
bow = [tag_dictionary.doc2bow(doc) for doc in bow_str]

# The vocabulary is passed as id2word so that topics are reported with tag
# strings rather than bare integer ids.
nmf = Nmf(bow, num_topics=10, id2word=tag_dictionary)
|
|
|
|
##
|
|
#tf_idf
|
|
# Sorted, de-duplicated host names; a host's position is its matrix column.
host_names = tf_idf["host"].unique().sort().to_list()
n_servers = len(host_names)
host_name_lookup = {host: col for col, host in enumerate(host_names)}

# Cluster ids are used directly as matrix row indices below, so the row count
# is max id + 1 rather than the number of distinct clusters (which would be
# too small if the ids have gaps).
n_clusters = tf_idf["cluster"].max() + 1

# Label every cluster id with one of its tags. The label is looked up by the
# cluster id itself: rows returned by unique() are not ordered by id, so
# indexing them by position would attach labels to the wrong clusters.
cluster_reps = clusters.unique(subset="cluster", keep="first", maintain_order=True)
cluster_labels = {
    cluster_id: tag
    for cluster_id, tag in zip(
        cluster_reps["cluster"].to_list(),
        cluster_reps["tag"].to_list(),
    )
}
# The model sizes its vocabulary from this mapping, so every row id gets an
# entry; ids that have no tag in `clusters` fall back to their own number.
id_names = {i: cluster_labels.get(i, str(i)) for i in range(n_clusters)}

# Rows are clusters (the "tokens"), columns are hosts (the "documents").
# NOTE(review): dtype=int truncates fractional values -- confirm that the
# "count" column of tf_idf really holds integers.
m = lil_matrix((n_clusters, n_servers), dtype=int)
for row in tf_idf.iter_rows(named=True):
    m[row["cluster"], host_name_lookup[row["host"]]] = row["count"]

# Vocabulary over the host names. It is not consumed by the model below and is
# not named `dict`, to avoid shadowing the builtin.
host_dictionary = Dictionary([host_names])

# Factorize the (clusters x hosts) matrix; CSC is the sparse layout passed to
# Nmf as the corpus.
nmf = Nmf(corpus=m.tocsc(), num_topics=128, id2word=id_names)
|