# junior-sheer/codebase/scratch/gensim.py
# Exploratory LSI topic modeling over hashtag clusters. Assumes the
# `all_tag_posts_filtered`, `clusters`, and `cluster_names` frames are
# already loaded from earlier scratch work.

import numpy as np
import polars as pl
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from sklearn.metrics.pairwise import cosine_similarity
# One "document" per host: explode post tags, map each tag to its cluster,
# swap in the cluster's name, and collect the names per host.
host_bow_clusters = all_tag_posts_filtered.explode("tags").rename({"tags": "tag"}).join(
    clusters, on="tag", how="inner"
).drop("tag").join(
    cluster_names, on="cluster", how="inner"
).drop("cluster").unique(["host", "id", "tag"]).group_by("host").agg([
    pl.col("tag")
])
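# Quick peek at the corpus we just built: one row per host, with a list of
# cluster names standing in for that host's tags.
print(host_bow_clusters.head(3))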
bow_str = host_bow_clusters["tag"].to_list()
dictionary = Dictionary(bow_str)  # renamed from `dict` to avoid shadowing the builtin
bow = [dictionary.doc2bow(x) for x in bow_str]
lsi_model = LsiModel(bow, id2word=dictionary, num_topics=100)
lsi_vectors = [lsi_model[doc] for doc in bow]
lsi_model.print_topics()
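# A closer look at a single component: show_topic returns the top
# (term, weight) pairs for one LSI topic, easier to read than print_topics().
for term, weight in lsi_model.show_topic(0, topn=10):
    print(f"{term}: {weight:.3f}")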
topic_matrix = lsi_model.get_topics()  # shape: (num_topics, num_terms)
topic_matrix_bool = topic_matrix > 0
cluster_names.filter(pl.col("tag") == "vote")  # spot-check one cluster name
cluster_name_list = cluster_names["tag"].to_list()
# Sum of positive weights minus sum of negative weights per term,
# i.e. the total absolute weight each term carries across topics.
topic_cum_weights = np.sum(np.where(topic_matrix < 0, 0, topic_matrix), axis=0) - np.sum(np.where(topic_matrix > 0, 0, topic_matrix), axis=0)
# Terms ranked by total absolute topic weight. topic_matrix columns follow
# dictionary term ids (not cluster_names row order), so map ids back through
# the dictionary instead of indexing cluster_names directly.
np.array([dictionary[i] for i in range(len(dictionary))])[np.argsort(-np.sum(np.abs(topic_matrix), axis=0))]
from scipy.spatial.distance import pdist, squareform

# Pairwise distances between terms in topic space (columns of topic_matrix).
distances = squareform(pdist(np.transpose(topic_matrix), "euclidean"))
np.clip(topic_matrix, 0, None)  # positive part of the weights (REPL check)
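# Sketch: nearest neighbours of one term in topic space. Assumes "vote" is
# in the vocabulary (it is looked up as a cluster name above). Columns of
# topic_matrix follow dictionary term ids, so go through token2id.
vote_id = dictionary.token2id["vote"]
print([dictionary[int(i)] for i in np.argsort(distances[vote_id])[1:6]])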
def opposite_sum(a):
    # Sum of positive entries minus sum of negative entries, i.e. sum(|a|).
    return np.sum(np.where(a > 0, a, 0)) - np.sum(np.where(a < 0, a, 0))
def get_information_gain(model, dictionary, word: str, words_yes: list[str], words_no: list[str]):
    # Score a candidate word by how differently it projects when added to the
    # picked topics ("yes") versus the anti-topics ("no").
    words_yes = words_yes + [word]
    words_no = words_no + [word]  # was `words_no = words_yes`, which made yes == no
    yes = np.array([x[1] for x in model[dictionary.doc2bow(words_yes)]])
    no = np.array([x[1] for x in model[dictionary.doc2bow(words_no)]])
    if len(words_yes) == 1:
        # Nothing picked yet: fall back to how polarized the "no" projection is.
        return opposite_sum(no)
    return np.sum(np.abs(yes - no))
get_information_gain(lsi_model, dictionary, "turkey", [], [])  # single-word smoke test
# Baseline information gain for every cluster name, with no context yet.
ig = [get_information_gain(lsi_model, dictionary, x, [], []) for x in cluster_name_list]
u_matrix = lsi_model.projection.u  # left singular vectors (term-topic matrix)
singular_values = lsi_model.projection.s  # singular values
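# Rough check on the rank: how quickly do the singular values decay relative
# to the first one? If the tail is tiny, 100 topics may be more than needed.
print(singular_values[:10] / singular_values[0])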
# Idea: construct two lists: one of selected topics, one of "anti-topics"!
picked_topics = []
unpicked_topics = ["movies", "genealogy", "nfl", "horror", "aiart", "media"]
ig = []
for x in cluster_name_list:
    if x not in picked_topics + unpicked_topics:
        ig.append(get_information_gain(lsi_model, dictionary, x, picked_topics, unpicked_topics))
    else:
        ig.append(0)
cluster_name_list[np.argmax(ig)]  # best next pick
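# Sketch: iterate the greedy pick for a few rounds, moving each winner into
# picked_topics so later scores condition on it. (A hypothetical extension
# of the single pass above, not something the original script runs.)
for _ in range(5):
    scores = [
        get_information_gain(lsi_model, dictionary, x, picked_topics, unpicked_topics)
        if x not in picked_topics + unpicked_topics else 0
        for x in cluster_name_list
    ]
    picked_topics.append(cluster_name_list[int(np.argmax(scores))])
print(picked_topics)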
from gensim.similarities import Similarity, MatrixSimilarity
# A hand-written test profile (overwritten below by a real account's tags).
my_profile = dictionary.doc2bow([
    "polars", "fediverse", "mastodon", "quartopub", "influencer", "julia", "introduction",
    "chicago"
])
# Pull one real account's tagged posts and map its tags to cluster names.
x = pl.scan_ipc("data/tags-2020-2022.feather").filter(
    pl.col("host") == "elekk.xyz"
).filter(pl.col("acct") == "hoppet").collect()
y = x.explode("tags").with_columns(pl.col("tags").str.to_lowercase()).rename({"tags": "tag"}).join(
    clusters, on="tag", how="inner"
).drop("tag").join(
    cluster_names, on="cluster", how="inner"
).drop("cluster")
my_profile = dictionary.doc2bow(y["tag"].to_list())
my_vector = [x[1] for x in lsi_model[my_profile]]
def build_host_topic_matrix(v):
    # v is a list of LSI vectors, i.e. lists of (topic_id, weight) pairs;
    # keep only the weights so each row becomes a plain dense vector.
    a = []
    for x in v:
        a.append([y[1] for y in x])
    return a
similarities = cosine_similarity([my_vector], build_host_topic_matrix(lsi_vectors))[0]
sim_df = pl.DataFrame({
    "host": host_bow_clusters["host"],
    "similarity": similarities
}).sort("similarity", descending=True)
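# The ten hosts whose cluster-name profiles are most similar to this account.
print(sim_df.head(10))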
# --- Earlier experiments, kept for reference ---
#[cosine_similarity(my_vector, x) for x in build_host_topic_matrix(lsi_vectors)]
#cosine_similarity([my_vector], build_host_topic_matrix(lsi_vectors))
# topic_matrix_bool = topic_matrix > 0
# topic_cum_weights = np.sum(np.where(topic_matrix < 0, 0, topic_matrix), axis=0) - np.sum(np.where(topic_matrix > 0, 0, topic_matrix), axis=0)
# cluster_name_list = cluster_names["tag"].to_list()
# return np.array(cluster_name_list)[np.argsort(-topic_cum_weights)]
#n_components, comp_labels = scipy.sparse.csgraph.connected_components(S, directed=False)
#m, tag_index = document_matrix(common_tags["tags"].to_list(), all_tag_posts_filtered["tags"].to_list())
#dictionary = Dictionary(all_tag_posts_filtered["tags"].to_list())
#bow = [dictionary.doc2bow(x) for x in all_tag_posts_filtered["tags"].to_list()]#[0:100000]]
#tf_idf_model = TfidfModel(bow)#, dictionary=dictionary)
#tf_idf = [tf_idf_model[doc] for doc in bow]
#hdp = HdpModel(bow, dictionary)
# tf-idf on m
#from sklearn.feature_extraction.text import TfidfTransformer
#posts_tf_idf = TfidfTransformer().fit_transform(m)
"""
pairs = all_tag_posts_filtered.with_columns(pl.col("tags").map_elements(pairwise_sets).alias("pairs")).explode("pairs").select(pl.col(["host", "acct", "pairs"])).unique()
pairs_counts = pairs.group_by("pairs").count().sort("count", descending=True).with_columns(
pl.col("pairs").map_elements(lambda x: x.split(",")).alias("pairs")
)
total_pairs = len(pairs)
df = pairs_counts.with_columns(
pl.col("pairs").list.get(0).alias("first"),
pl.col("pairs").list.get(1).alias("last")
).drop(["pairs"]).join(
account_paired_tag_counts.rename({"tags":"first","count":"first_count"}),on="first",how="inner"
).join(
account_paired_tag_counts.rename({"tags":"last","count":"last_count"}),on="last",how="inner"
)
"""