import numpy as np
import polars as pl
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# One bag-of-words document per host: explode posts into tags, map tags to
# clusters, map clusters back to their representative tag names, and collect
# the deduplicated tag list per host.
host_bow_clusters = (
    all_tag_posts_filtered.explode("tags")
    .rename({"tags": "tag"})
    .join(clusters, on="tag", how="inner")
    .drop("tag")
    .join(cluster_names, on="cluster", how="inner")
    .drop("cluster")
    .unique(["host", "id", "tag"])
    .group_by("host")
    .agg([pl.col("tag")])
)

# Train a 100-topic LSI model on the per-host tag documents.
bow_str = host_bow_clusters["tag"].to_list()
dictionary = Dictionary(bow_str)
bow = [dictionary.doc2bow(x) for x in bow_str]
lsi_model = LsiModel(bow, id2word=dictionary, num_topics=100)
lsi_vectors = [lsi_model[doc] for doc in bow]
lsi_model.print_topics()

topic_matrix = lsi_model.get_topics()  # shape: (num_topics, num_terms)
topic_matrix_bool = topic_matrix > 0

cluster_names.filter(pl.col("tag") == "vote")  # inspect a single cluster name

cluster_name_list = cluster_names["tag"].to_list()

# Total topic weight per term: sum of positive weights minus the sum of the
# (non-positive) negative weights, i.e. the same as summing absolute weights.
topic_cum_weights = np.sum(np.where(topic_matrix < 0, 0, topic_matrix), axis=0) - np.sum(
    np.where(topic_matrix > 0, 0, topic_matrix), axis=0
)

# Tags ranked by how much total topic weight they carry.
np.array(cluster_names["tag"])[np.argsort(-np.sum(np.abs(topic_matrix), axis=0))]

# Pairwise distances between terms in topic space.
distances = squareform(pdist(np.transpose(topic_matrix), "euclidean"))

np.clip(topic_matrix, 0, None)  # positive topic weights only


def opposite_sum(a):
    # Sum of positive entries minus sum of negative entries = sum of |entries|.
    return np.sum(np.where(a > 0, a, 0)) - np.sum(np.where(a < 0, a, 0))


def get_information_gain(model, dictionary, word: str, words_yes: list[str], words_no: list[str]):
    # Score how much asking "are you interested in `word`?" would move the
    # LSI profile: compare the vector with the word accepted against the
    # vector of the already-picked topics alone. `words_no` is threaded
    # through by the caller so already-rejected topics can be skipped.
    yes_words = words_yes + [word]
    yes = np.array([x[1] for x in model[dictionary.doc2bow(yes_words)]])
    no = np.array([x[1] for x in model[dictionary.doc2bow(words_yes)]])
    if len(yes_words) == 1:
        # Nothing picked yet: score the word by the total weight of its own vector.
        return opposite_sum(yes)
    return np.sum(np.abs(yes - no))


get_information_gain(lsi_model, dictionary, "turkey", [], [])

ig = [get_information_gain(lsi_model, dictionary, x, [], []) for x in cluster_name_list]

u_matrix = lsi_model.projection.u  # left singular vectors (term-topic matrix)
singular_values = lsi_model.projection.s  # singular values

# Idea: construct two lists: one of selected topics, one of "anti-topics"!
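# A small sketch (not in the original analysis): one way to inspect a single
# LSI topic is to list its strongest positive and negative tags side by side.
# Assumes `topic_matrix` and `dictionary` from above; `describe_topic` is a
# hypothetical helper name.
def describe_topic(topic_id: int, top_n: int = 8):
    row = topic_matrix[topic_id]
    order = np.argsort(row)  # ascending by weight
    top_positive = [(dictionary[i], row[i]) for i in order[::-1][:top_n]]
    top_negative = [(dictionary[i], row[i]) for i in order[:top_n]]
    return top_positive, top_negative

# e.g. describe_topic(0) should roughly agree with the first line of
# lsi_model.print_topics(), which ranks terms by absolute weight instead.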
# Greedy "twenty questions": among topics not yet answered, pick the one with
# the highest information gain given the current yes/no lists.
picked_topics = []
unpicked_topics = ["movies", "genealogy", "nfl", "horror", "aiart", "media"]
ig = []
for x in cluster_name_list:
    if x not in picked_topics + unpicked_topics:
        ig.append(get_information_gain(lsi_model, dictionary, x, picked_topics, unpicked_topics))
    else:
        ig.append(0)
cluster_name_list[np.argmax(ig)]  # the next best topic to ask about

# Hand-picked profile (replaced just below by tags scraped from a real account).
my_profile = dictionary.doc2bow(
    ["polars", "fediverse", "mastodon", "quartopub", "influencer", "julia", "introduction", "chicago"]
)

# Build a profile from one account's actual tag history instead.
my_posts = (
    pl.scan_ipc("data/tags-2020-2022.feather")
    .filter(pl.col("host") == "elekk.xyz")
    .filter(pl.col("acct") == "hoppet")
    .collect()
)
my_tags = (
    my_posts.explode("tags")
    .with_columns(pl.col("tags").str.to_lowercase())
    .rename({"tags": "tag"})
    .join(clusters, on="tag", how="inner")
    .drop("tag")
    .join(cluster_names, on="cluster", how="inner")
    .drop("cluster")
)
my_profile = dictionary.doc2bow(my_tags["tag"].to_list())
my_vector = [x[1] for x in lsi_model[my_profile]]


def build_host_topic_matrix(v):
    # Strip the (topic_id, value) tuples down to plain rows of values.
    return [[y[1] for y in x] for x in v]


# Rank hosts by cosine similarity between the profile vector and each host's
# LSI vector.
similarities = cosine_similarity([my_vector], build_host_topic_matrix(lsi_vectors))[0]
sim_df = pl.DataFrame(
    {"host": host_bow_clusters["host"], "similarity": similarities}
).sort("similarity", descending=True)

# Earlier experiments, kept for reference:
#
# n_components, comp_labels = scipy.sparse.csgraph.connected_components(S, directed=False)
#
# m, tag_index = document_matrix(common_tags["tags"].to_list(), all_tag_posts_filtered["tags"].to_list())
#
# dictionary = Dictionary(all_tag_posts_filtered["tags"].to_list())
# bow = [dictionary.doc2bow(x) for x in all_tag_posts_filtered["tags"].to_list()]
# tf_idf_model = TfidfModel(bow)
# tf_idf = [tf_idf_model[doc] for doc in bow]
# hdp = HdpModel(bow, dictionary)
#
# tf-idf on m:
# from sklearn.feature_extraction.text import TfidfTransformer
# posts_tf_idf = TfidfTransformer().fit_transform(m)

"""
Earlier pairwise co-occurrence counting approach:

pairs = all_tag_posts_filtered.with_columns(
    pl.col("tags").map_elements(pairwise_sets).alias("pairs")
).explode("pairs").select(pl.col(["host", "acct", "pairs"])).unique()
pairs_counts = pairs.group_by("pairs").count().sort("count", descending=True).with_columns(
    pl.col("pairs").map_elements(lambda x: x.split(",")).alias("pairs")
)
total_pairs = len(pairs)
df = pairs_counts.with_columns(
    pl.col("pairs").list.get(0).alias("first"),
    pl.col("pairs").list.get(1).alias("last"),
).drop(["pairs"]).join(
    account_paired_tag_counts.rename({"tags": "first", "count": "first_count"}), on="first", how="inner"
).join(
    account_paired_tag_counts.rename({"tags": "last", "count": "last_count"}), on="last", how="inner"
)
"""
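# A sketch (an assumption, not part of the original run) of how the pieces
# above could be glued into an interactive "twenty questions" recommender.
# `ask_user` is a hypothetical stand-in for however answers would actually be
# collected; everything else (`lsi_model`, `dictionary`, `cluster_name_list`,
# `get_information_gain`, `build_host_topic_matrix`, `lsi_vectors`,
# `host_bow_clusters`) is reused from the code above.
def ask_user(topic: str) -> bool:
    return input(f"Are you interested in '{topic}'? [y/n] ").strip().lower().startswith("y")


def recommend_hosts(n_questions: int = 5, top_n: int = 10):
    picked: list[str] = []
    unpicked: list[str] = []
    for _ in range(n_questions):
        # Greedily ask about the unanswered topic with the highest information gain.
        gains = [
            get_information_gain(lsi_model, dictionary, t, picked, unpicked)
            if t not in picked + unpicked
            else 0
            for t in cluster_name_list
        ]
        topic = cluster_name_list[int(np.argmax(gains))]
        (picked if ask_user(topic) else unpicked).append(topic)
    if not picked:
        return None  # every answer was "no": nothing to build a profile from
    profile_vec = [v for _, v in lsi_model[dictionary.doc2bow(picked)]]
    sims = cosine_similarity([profile_vec], build_host_topic_matrix(lsi_vectors))[0]
    return (
        pl.DataFrame({"host": host_bow_clusters["host"], "similarity": sims})
        .sort("similarity", descending=True)
        .head(top_n)
    )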