# junior-sheer/codebase/scratch/gensim.py
# Exploratory LSI topic modeling over hashtag clusters. Assumes the
# `all_tag_posts_filtered`, `clusters`, and `cluster_names` frames are
# already loaded from earlier scratch work.

import numpy as np
import polars as pl
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from sklearn.metrics.pairwise import cosine_similarity
# One "document" per host: explode post tags, map each tag to its cluster,
# swap in the cluster's name, and collect the names per host.
host_bow_clusters = all_tag_posts_filtered.explode("tags").rename({"tags": "tag"}).join(
    clusters, on="tag", how="inner"
).drop("tag").join(
    cluster_names, on="cluster", how="inner"
).drop("cluster").unique(["host", "id", "tag"]).group_by("host").agg([
    pl.col("tag")
])
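# Quick peek at the corpus we just built: one row per host, with a list of
# cluster names standing in for that host's tags.
print(host_bow_clusters.head(3))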
bow_str = host_bow_clusters["tag"].to_list()
dictionary = Dictionary(bow_str)  # renamed from `dict` to avoid shadowing the builtin
bow = [dictionary.doc2bow(x) for x in bow_str]
lsi_model = LsiModel(bow, id2word=dictionary, num_topics=100)
lsi_vectors = [lsi_model[doc] for doc in bow]
lsi_model.print_topics()
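# A closer look at a single component: show_topic returns the top
# (term, weight) pairs for one LSI topic, easier to read than print_topics().
for term, weight in lsi_model.show_topic(0, topn=10):
    print(f"{term}: {weight:.3f}")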
topic_matrix = lsi_model.get_topics()  # shape: (num_topics, num_terms)
topic_matrix_bool = topic_matrix > 0
cluster_names.filter(pl.col("tag") == "vote")  # spot-check one cluster name
cluster_name_list = cluster_names["tag"].to_list()
# Sum of positive weights minus sum of negative weights per term,
# i.e. the total absolute weight each term carries across topics.
topic_cum_weights = np.sum(np.where(topic_matrix < 0, 0, topic_matrix), axis=0) - np.sum(np.where(topic_matrix > 0, 0, topic_matrix), axis=0)
# Terms ranked by total absolute topic weight. topic_matrix columns follow
# dictionary term ids (not cluster_names row order), so map ids back through
# the dictionary instead of indexing cluster_names directly.
np.array([dictionary[i] for i in range(len(dictionary))])[np.argsort(-np.sum(np.abs(topic_matrix), axis=0))]
from scipy.spatial.distance import pdist, squareform

# Pairwise distances between terms in topic space (columns of topic_matrix).
distances = squareform(pdist(np.transpose(topic_matrix), "euclidean"))
np.clip(topic_matrix, 0, None)  # positive part of the weights (REPL check)
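# Sketch: nearest neighbours of one term in topic space. Assumes "vote" is
# in the vocabulary (it is looked up as a cluster name above). Columns of
# topic_matrix follow dictionary term ids, so go through token2id.
vote_id = dictionary.token2id["vote"]
print([dictionary[int(i)] for i in np.argsort(distances[vote_id])[1:6]])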
def opposite_sum(a):
    # Sum of positive entries minus sum of negative entries, i.e. sum(|a|).
    return np.sum(np.where(a > 0, a, 0)) - np.sum(np.where(a < 0, a, 0))
def get_information_gain(model, dictionary, word: str, words_yes: list[str], words_no: list[str]):
    # Score a candidate word by how differently it projects when added to the
    # picked topics ("yes") versus the anti-topics ("no").
    words_yes = words_yes + [word]
    words_no = words_no + [word]  # was `words_no = words_yes`, which made yes == no
    yes = np.array([x[1] for x in model[dictionary.doc2bow(words_yes)]])
    no = np.array([x[1] for x in model[dictionary.doc2bow(words_no)]])
    if len(words_yes) == 1:
        # Nothing picked yet: fall back to how polarized the "no" projection is.
        return opposite_sum(no)
    return np.sum(np.abs(yes - no))
get_information_gain(lsi_model, dictionary, "turkey", [], [])  # single-word smoke test
# Baseline information gain for every cluster name, with no context yet.
ig = [get_information_gain(lsi_model, dictionary, x, [], []) for x in cluster_name_list]
u_matrix = lsi_model.projection.u  # left singular vectors (term-topic matrix)
singular_values = lsi_model.projection.s  # singular values
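# Rough check on the rank: how quickly do the singular values decay relative
# to the first one? If the tail is tiny, 100 topics may be more than needed.
print(singular_values[:10] / singular_values[0])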
# Idea: construct two lists: one of selected topics, one of "anti-topics"!
picked_topics = []
unpicked_topics = ["movies", "genealogy", "nfl", "horror", "aiart", "media"]
ig = []
for x in cluster_name_list:
    if x not in picked_topics + unpicked_topics:
        ig.append(get_information_gain(lsi_model, dictionary, x, picked_topics, unpicked_topics))
    else:
        ig.append(0)
cluster_name_list[np.argmax(ig)]  # best next pick
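# Sketch: iterate the greedy pick for a few rounds, moving each winner into
# picked_topics so later scores condition on it. (A hypothetical extension
# of the single pass above, not something the original script runs.)
for _ in range(5):
    scores = [
        get_information_gain(lsi_model, dictionary, x, picked_topics, unpicked_topics)
        if x not in picked_topics + unpicked_topics else 0
        for x in cluster_name_list
    ]
    picked_topics.append(cluster_name_list[int(np.argmax(scores))])
print(picked_topics)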
from gensim.similarities import Similarity, MatrixSimilarity
# A hand-written test profile (overwritten below by a real account's tags).
my_profile = dictionary.doc2bow([
    "polars", "fediverse", "mastodon", "quartopub", "influencer", "julia", "introduction",
    "chicago"
])
# Pull one real account's tagged posts and map its tags to cluster names.
x = pl.scan_ipc("data/tags-2020-2022.feather").filter(
    pl.col("host") == "elekk.xyz"
).filter(pl.col("acct") == "hoppet").collect()
y = x.explode("tags").with_columns(pl.col("tags").str.to_lowercase()).rename({"tags": "tag"}).join(
    clusters, on="tag", how="inner"
).drop("tag").join(
    cluster_names, on="cluster", how="inner"
).drop("cluster")
my_profile = dictionary.doc2bow(y["tag"].to_list())
my_vector = [x[1] for x in lsi_model[my_profile]]
def build_host_topic_matrix(v):
    # v is a list of LSI vectors, i.e. lists of (topic_id, weight) pairs;
    # keep only the weights so each row becomes a plain dense vector.
    a = []
    for x in v:
        a.append([y[1] for y in x])
    return a
similarities = cosine_similarity([my_vector], build_host_topic_matrix(lsi_vectors))[0]
sim_df = pl.DataFrame({
    "host": host_bow_clusters["host"],
    "similarity": similarities
}).sort("similarity", descending=True)
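# The ten hosts whose cluster-name profiles are most similar to this account.
print(sim_df.head(10))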
# --- Earlier experiments, kept for reference ---
#[cosine_similarity(my_vector, x) for x in build_host_topic_matrix(lsi_vectors)]
#cosine_similarity([my_vector], build_host_topic_matrix(lsi_vectors))
# topic_matrix_bool = topic_matrix > 0
# topic_cum_weights = np.sum(np.where(topic_matrix < 0, 0, topic_matrix), axis=0) - np.sum(np.where(topic_matrix > 0, 0, topic_matrix), axis=0)
# cluster_name_list = cluster_names["tag"].to_list()
# return np.array(cluster_name_list)[np.argsort(-topic_cum_weights)]
#n_components, comp_labels = scipy.sparse.csgraph.connected_components(S, directed=False)
#m, tag_index = document_matrix(common_tags["tags"].to_list(), all_tag_posts_filtered["tags"].to_list())
#dictionary = Dictionary(all_tag_posts_filtered["tags"].to_list())
#bow = [dictionary.doc2bow(x) for x in all_tag_posts_filtered["tags"].to_list()]#[0:100000]]
#tf_idf_model = TfidfModel(bow)#, dictionary=dictionary)
#tf_idf = [tf_idf_model[doc] for doc in bow]
#hdp = HdpModel(bow, dictionary)
# tf-idf on m
#from sklearn.feature_extraction.text import TfidfTransformer
#posts_tf_idf = TfidfTransformer().fit_transform(m)
"""
pairs = all_tag_posts_filtered.with_columns(pl.col("tags").map_elements(pairwise_sets).alias("pairs")).explode("pairs").select(pl.col(["host", "acct", "pairs"])).unique()
pairs_counts = pairs.group_by("pairs").count().sort("count", descending=True).with_columns(
pl.col("pairs").map_elements(lambda x: x.split(",")).alias("pairs")
)
total_pairs = len(pairs)
df = pairs_counts.with_columns(
pl.col("pairs").list.get(0).alias("first"),
pl.col("pairs").list.get(1).alias("last")
).drop(["pairs"]).join(
account_paired_tag_counts.rename({"tags":"first","count":"first_count"}),on="first",how="inner"
).join(
account_paired_tag_counts.rename({"tags":"last","count":"last_count"}),on="last",how="inner"
)
"""