"""Explore tag clusters through the similarity of their per-host usage.

The script builds a (cluster x host) count matrix from the tag TF-IDF table,
computes the cosine similarity between cluster rows, and offers two greedy
helpers that grow a list of cluster ids from that similarity matrix.
"""
import numpy as np
import polars as pl
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity

tf_idf = pl.read_ipc("data/scratch/tag_tfidf.feather")
common_tags = pl.read_ipc("data/scratch/common_tags.feather")
clusters = pl.read_ipc("data/scratch/tag_clusters.feather").join(
    common_tags.rename({"tags": "tag"}), on="tag", how="inner"
)

n_clusters = tf_idf["cluster"].max() + 1
host_to_index = {
    host: i for i, host in enumerate(tf_idf["host"].unique().sort().to_list())
}
# One display name per cluster: the tag with the highest "count".
# keep="first" + maintain_order=True make the preceding sort decide which row
# survives; the default keep="any" gives no such guarantee.
# NOTE(review): indexing this list by cluster id (as done at the bottom of the
# script) assumes cluster ids are contiguous from 0 and all survive the inner
# join above -- confirm against the data.
cluster_names = (
    clusters.sort("count", descending=True)
    .unique(subset="cluster", keep="first", maintain_order=True)
    .sort("cluster")["tag"]
    .to_list()
)
n_servers = len(host_to_index)

# Rows are clusters, columns are hosts, values are the "count" column.
m = lil_matrix((n_clusters, n_servers), dtype=int)
for row in tf_idf.iter_rows(named=True):
    m[row["cluster"], host_to_index[row["host"]]] = row["count"]

# Cosine similarity between the rows of m, i.e. between clusters.
sim = cosine_similarity(m.tocsr())


def _allowed_clusters(min_count):
    """Return the set of cluster ids having a row with ``count >= min_count``."""
    return set(clusters.filter(pl.col("count") >= min_count)["cluster"].to_list())


def _first_candidate(order, allowed, chosen):
    """Return the first id in ``order`` that is allowed and not yet chosen.

    Returns ``None`` when no such id exists.
    """
    taken = set(chosen)
    for idx in order.tolist():
        if idx in allowed and idx not in taken:
            return idx
    return None


def find_variety(sim, terms, n=20):
    """Greedily extend ``terms`` with up to ``n`` further cluster ids.

    Each step ranks all clusters by the variance of their similarity to the
    ids chosen so far (ascending) and appends the first one that has a row
    with ``count >= 2000`` in ``clusters`` and is not already chosen.

    Args:
        sim: Square cluster-by-cluster similarity matrix (NumPy array).
        terms: Sequence of seed cluster ids. It is not modified. When empty,
            the built-in seeds ``[952, 800]`` are used.
        n: Maximum number of ids to add.

    Returns:
        A new list: the seeds followed by the added ids. It is shorter than
        ``len(seeds) + n`` only if the allowed clusters run out.
    """
    allowed = _allowed_clusters(2000)
    chosen = list(terms)  # copy so the caller's list is left untouched
    if not chosen:
        # Earlier seed set noted in the original source: 40, 695, 188, 791
        # ("ai, caturday, books, politics").
        chosen = [952, 800]
    for _ in range(n):
        order = np.argsort(np.var(sim[chosen], axis=0))
        pick = _first_candidate(order, allowed, chosen)
        if pick is None:
            break  # every allowed cluster is already chosen
        chosen.append(pick)
    return chosen


def find_similar_obscure(sim, selected, n=20):
    """Greedily extend ``selected`` with the clusters most similar to it.

    Each step ranks all clusters by their summed similarity to the ids chosen
    so far (descending) and appends the first one that has a row with
    ``count >= 100`` in ``clusters`` and is not already chosen.

    Args:
        sim: Square cluster-by-cluster similarity matrix (NumPy array).
        selected: Sequence of seed cluster ids. It is not modified.
        n: Number of ids to add on top of ``len(selected)``.

    Returns:
        A new list: the seeds followed by up to ``len(selected) + n`` added
        ids (fewer only if the allowed clusters run out).
    """
    allowed = _allowed_clusters(100)
    chosen = list(selected)  # copy so the caller's list is left untouched
    # NOTE(review): the step count is len(selected) + n, exactly as in the
    # original loop, so the result holds 2 * len(selected) + n ids. Confirm
    # whether plain ``n`` was intended.
    for _ in range(len(chosen) + n):
        order = np.argsort(-np.sum(sim[chosen], axis=0))
        pick = _first_candidate(order, allowed, chosen)
        if pick is None:
            break  # every allowed cluster is already chosen
        chosen.append(pick)
    return chosen


# Exploration calls. These are bare expressions, so they only display a value
# when run in an interactive session or notebook.
cluster_name_array = np.array(cluster_names)

cluster_name_array[find_variety(sim, [], n=25)]
cluster_name_array[find_similar_obscure(sim, [337, 1242, 1250], n=10)]
cluster_name_array[find_variety(sim, [337, 1242, 1250], n=10)]

# Pair each picked cluster id with its display name.
picked = find_similar_obscure(sim, [337, 1242, 940, 1108, 1454, 612, 260], n=10)
list(zip(cluster_name_array[picked], picked))

cluster_name_array[
    find_similar_obscure(sim, [337, 1242, 1108, 1454, 612, 260, 424], n=10)
]