# junior-sheer/codebase/old/hclust.py

import polars as pl
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Input frames produced by earlier pipeline steps. Later code in this file
# reads the "cluster", "host" and "count" columns of `tf_idf`.
tf_idf = pl.read_ipc("data/scratch/tag_tfidf.feather")
common_tags = pl.read_ipc("data/scratch/common_tags.feather")

# Keep only the cluster assignments whose tag also appears in `common_tags`;
# that frame names its tag column "tags", so it is renamed to match the join key.
_tag_clusters = pl.read_ipc("data/scratch/tag_clusters.feather")
_common_by_tag = common_tags.rename({"tags": "tag"})
clusters = _tag_clusters.join(_common_by_tag, on="tag", how="inner")

# Cluster ids are used directly as matrix row indices further down, so the
# row count is the largest id plus one.
n_clusters = tf_idf["cluster"].max() + 1

# Stable host -> column index mapping (hosts in sorted order).
host_to_index = {host: i for i, host in enumerate(tf_idf["host"].unique().sort().to_list())}

# One display name per cluster: the tag with the highest count in that cluster.
# `keep="first"` is required here: polars' default `keep="any"` gives no
# guarantee about which row of each cluster survives, which would make the
# preceding sort by count pointless.
# NOTE(review): this list is later indexed by cluster id, which assumes every
# id in 0..n_clusters-1 is present in `clusters` after the inner join — confirm.
cluster_names = (
  clusters.sort("count", descending=True)
  .unique("cluster", keep="first", maintain_order=True)
  .sort("cluster")["tag"]
  .to_list()
)
n_servers = len(host_to_index)

# Sparse (cluster x host) matrix of counts. LIL format is used while filling
# because it supports cheap element-wise assignment.
m = lil_matrix((n_clusters, n_servers), dtype=int)
for cluster_id, host, count in tf_idf.select(["cluster", "host", "count"]).iter_rows():
  m[cluster_id, host_to_index[host]] = count

# Pairwise cosine similarity between cluster rows; converted to CSR first
# since that is the format suited to row-wise arithmetic.
_counts_csr = m.tocsr()
sim = cosine_similarity(_counts_csr)

def find_variety(sim, terms, n=20, *, min_count=2000, allowed_index=None):
  """Greedily extend a seed set of cluster ids by lowest similarity variance.

  Each round computes, for every column of ``sim``, the variance of the
  similarities in the rows of the clusters chosen so far, and appends the
  eligible, not-yet-chosen cluster whose variance is smallest.

  Args:
    sim: 2-D array of pairwise cluster similarities. Rows are selected by
      cluster id and columns are ranked, so it is expected to be square.
    terms: seed cluster ids. The sequence is never modified. When it is
      empty, the hard-coded default seeds ``[952, 800]`` are used.
    n: maximum number of cluster ids to add.
    min_count: minimum value of the "count" column a cluster needs in the
      module-level ``clusters`` frame to be eligible. Only consulted when
      ``allowed_index`` is None.
    allowed_index: optional explicit iterable of eligible cluster ids; when
      given, the module-level ``clusters`` frame is not touched.

  Returns:
    A new list holding the seeds followed by the picked ids (plain ints) in
    pick order. It is shorter than ``len(seeds) + n`` if the eligible
    candidates run out first.
  """
  if allowed_index is None:
    allowed_index = clusters.filter(pl.col("count") >= min_count)["cluster"].to_list()
  # Sets make the per-candidate membership tests O(1).
  allowed = set(allowed_index)

  # NOTE(review): the original carried the comment "ai, caturday, books,
  # politics" next to an alternative seed set (40, 695, 188, 791); which ids
  # map to which tags cannot be verified from this file.
  chosen = list(terms) if len(terms) > 0 else [952, 800]
  taken = set(chosen)

  for _ in range(n):
    # Ascending argsort: lowest-variance columns come first.
    ranking = np.argsort(np.var(sim[chosen], axis=0)).tolist()
    pick = next((idx for idx in ranking if idx in allowed and idx not in taken), None)
    if pick is None:
      # No eligible cluster left; stop instead of raising IndexError.
      break
    chosen.append(pick)
    taken.add(pick)
  return chosen

def find_similar_obscure(sim, selected, n=20, *, min_count=100, allowed_index=None):
  """Greedily extend a seed set with the clusters most similar to it overall.

  Each round sums the similarity rows of the clusters chosen so far and
  appends the eligible, not-yet-chosen cluster with the largest total.

  Args:
    sim: 2-D array of pairwise cluster similarities. Rows are selected by
      cluster id and columns are ranked, so it is expected to be square.
    selected: seed cluster ids. The sequence is never modified.
    n: extra rounds on top of one round per seed (see the note below).
    min_count: minimum value of the "count" column a cluster needs in the
      module-level ``clusters`` frame to be eligible. Only consulted when
      ``allowed_index`` is None.
    allowed_index: optional explicit iterable of eligible cluster ids; when
      given, the module-level ``clusters`` frame is not touched.

  Returns:
    A new list holding the seeds followed by the picked ids (plain ints) in
    pick order. It is shorter than expected if the eligible candidates run
    out first.
  """
  if allowed_index is None:
    allowed_index = clusters.filter(pl.col("count") >= min_count)["cluster"].to_list()
  # Sets make the per-candidate membership tests O(1).
  allowed = set(allowed_index)

  chosen = list(selected)
  taken = set(chosen)

  # NOTE(review): the loop runs len(selected) + n rounds, so up to that many
  # ids are appended rather than n. Kept as in the original — confirm intent.
  for _ in range(len(selected) + n):
    # Negating the sums turns the ascending argsort into "largest total first".
    ranking = np.argsort(-np.sum(sim[chosen], axis=0)).tolist()
    pick = next((idx for idx in ranking if idx in allowed and idx not in taken), None)
    if pick is None:
      # No eligible cluster left; stop instead of raising IndexError.
      break
    chosen.append(pick)
    taken.add(pick)
  return chosen

# Exploratory look-ups. These are bare expressions, so their values are only
# displayed when the file is run in an interactive session.

# NOTE(review): the original passed 25 as `terms`, which fails on len();
# assumed the intent was 25 additions on top of the default seeds — confirm.
np.array(cluster_names)[find_variety(sim, [], n=25)]

np.array(cluster_names)[find_similar_obscure(sim, [337, 1242, 1250], n=10)]
np.array(cluster_names)[find_variety(sim, [337, 1242, 1250], n=10)]

# (name, id) pairs for the selected clusters.
_obscure_ids = find_similar_obscure(sim, [337, 1242, 940, 1108, 1454, 612, 260], n=10)
list(zip(np.array(cluster_names)[_obscure_ids], _obscure_ids))

np.array(cluster_names)[find_similar_obscure(sim, [337, 1242, 1108, 1454, 612, 260, 424], n=10)]