52 lines
2.2 KiB
Python
52 lines
2.2 KiB
Python
import polars as pl
|
|
from scipy.sparse import lil_matrix
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
import numpy as np
|
|
|
|
# Input tables, read from the scratch directory.
# `tf_idf` is used below through its "cluster", "host" and "count" columns.
tf_idf = pl.read_ipc("data/scratch/tag_tfidf.feather")
common_tags = pl.read_ipc("data/scratch/common_tags.feather")

# Keep only the cluster rows whose tag also appears in `common_tags`; that
# table names the column "tags", so it is renamed to join on "tag".
clusters = pl.read_ipc("data/scratch/tag_clusters.feather").join(
    common_tags.rename({"tags": "tag"}), on="tag", how="inner"
)

# Cluster ids are used directly as row indices, hence max id + 1 rows.
n_clusters = tf_idf["cluster"].max() + 1
# Stable host -> column index mapping (hosts sorted for determinism).
host_to_index = {
    host: i for i, host in enumerate(tf_idf["host"].unique().sort().to_list())
}

# One display name per cluster: the tag with the highest "count".
# `unique` defaults to keep="any" without preserving order, so the row that
# survives is not guaranteed to be the first one after the descending sort.
# keep="first" + maintain_order=True makes the choice explicit and stable.
# NOTE(review): this list is later indexed by cluster id, which assumes every
# id in 0..n_clusters-1 survives the inner join above -- confirm.
cluster_names = (
    clusters.sort("count", descending=True)
    .unique("cluster", keep="first", maintain_order=True)
    .sort("cluster")["tag"]
    .to_list()
)
n_servers = len(host_to_index)

# Cluster x host count matrix; LIL format because it is filled cell by cell.
m = lil_matrix((n_clusters, n_servers), dtype=int)
for row in tf_idf.iter_rows(named=True):
    m[row["cluster"], host_to_index[row["host"]]] = row["count"]

# Pairwise cosine similarity between the rows of `m`, i.e. between clusters.
sim = cosine_similarity(m.tocsr())
|
|
|
|
def find_variety(sim, terms, n=20):
    """Greedily extend a list of cluster indices using similarity variance.

    At each step the variance of the similarities between every cluster and
    the clusters chosen so far is computed, and the not-yet-chosen cluster
    with the smallest variance is appended. Only clusters that have a row in
    the module-level ``clusters`` table with ``count >= 2000`` are eligible.

    Args:
        sim: 2-D array of pairwise similarities, indexable by cluster index
            on both axes (e.g. the module-level ``sim``).
        terms: Sequence of starting cluster indices. If empty, the built-in
            seed ``[952, 800]`` is used. The argument is not modified.
        n: Maximum number of clusters to add.

    Returns:
        A new list of ints: the starting indices followed by up to ``n``
        added indices. Fewer than ``n`` are added if the eligible clusters
        run out.
    """
    # A set gives O(1) membership tests and removes the duplicate ids that
    # arise when a cluster has several rows in `clusters`.
    allowed = set(clusters.filter(pl.col("count") >= 2000)["cluster"].to_list())

    if len(terms) == 0:
        # Default seed. An earlier variant of this seed also listed
        # 40, 695, 188, 791, with the note "ai, caturday, books, politics".
        chosen = [952, 800]
    else:
        # Copy so the caller's list is never mutated.
        chosen = [int(t) for t in terms]
    taken = set(chosen)

    for _ in range(n):
        # Per-cluster variance of similarity to the chosen clusters.
        spread = np.var(sim[chosen], axis=0)
        pick = next(
            (
                idx
                for idx in np.argsort(spread).tolist()
                if idx in allowed and idx not in taken
            ),
            None,
        )
        if pick is None:
            # No eligible cluster left; return what was found so far.
            break
        chosen.append(pick)
        taken.add(pick)

    return chosen
|
|
|
|
def find_similar_obscure(sim, selected, n=20):
    """Greedily extend ``selected`` with the clusters most similar to it.

    At each step the similarities between every cluster and the clusters
    chosen so far are summed, and the not-yet-chosen cluster with the largest
    sum is appended. Only clusters that have a row in the module-level
    ``clusters`` table with ``count >= 100`` are eligible.

    Args:
        sim: 2-D array of pairwise similarities, indexable by cluster index
            on both axes (e.g. the module-level ``sim``).
        selected: Sequence of starting cluster indices. Not modified.
        n: Controls how many clusters are added (see the note below).

    Returns:
        A new list of ints: the starting indices followed by up to
        ``len(selected) + n`` added indices. Fewer are added if the eligible
        clusters run out.
    """
    # A set gives O(1) membership tests and removes duplicate cluster ids.
    allowed = set(clusters.filter(pl.col("count") >= 100)["cluster"].to_list())

    # NOTE(review): the number of additions is len(selected) + n, not n.
    # This is kept as in the original; confirm it is intended.
    additions = len(selected) + n

    # Copy so the caller's list is never mutated.
    chosen = [int(t) for t in selected]
    taken = set(chosen)

    # Debug output kept from the original: the negated similarity sum for
    # cluster 706 (a hard-coded probe, guarded so a smaller matrix does not
    # raise IndexError), followed by the full negated score vector.
    initial_scores = -np.sum(sim[chosen], axis=0)
    if initial_scores.shape[0] > 706:
        print(initial_scores[706])
    print(initial_scores)

    for _ in range(additions):
        # Negated so that an ascending argsort yields the most similar first.
        scores = -np.sum(sim[chosen], axis=0)
        pick = next(
            (
                idx
                for idx in np.argsort(scores).tolist()
                if idx in allowed and idx not in taken
            ),
            None,
        )
        if pick is None:
            # No eligible cluster left; return what was found so far.
            break
        chosen.append(pick)
        taken.add(pick)

    return chosen
|
|
|
|
# Exploratory queries. These are bare expressions, so their values are only
# displayed in an interactive session (REPL / notebook).

# NOTE(review): the original call was find_variety(sim, 25), which passes an
# int as `terms` and fails on len(25). It presumably meant "default seed,
# 25 additions" -- confirm.
np.array(cluster_names)[find_variety(sim, [], n=25)]

# The original called find_variety_obscure, which is not defined in this
# file; find_similar_obscure is the only matching function.
np.array(cluster_names)[find_similar_obscure(sim, [337, 1242, 1250], n=10)]
np.array(cluster_names)[find_variety(sim, [337, 1242, 1250], n=10)]

# Pair each suggested cluster name with its index.
_terms = find_similar_obscure(sim, [337, 1242, 940, 1108, 1454, 612, 260], n=10)
list(zip(np.array(cluster_names)[_terms], _terms))

np.array(cluster_names)[find_similar_obscure(sim, [337, 1242, 1108, 1454, 612, 260, 424], n=10)]