import numpy as np
import polars as pl
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from sklearn.metrics.pairwise import cosine_similarity

# Assumes `all_tag_posts_filtered` (posts with a list-typed "tags" column),
# `clusters` (tag -> cluster id), and `cluster_names` (cluster id -> a
# representative "tag" name) were built in earlier steps.

# One row per host: the deduplicated list of cluster names attached to the
# host's posts (tags are mapped to clusters, then clusters to their
# representative names).
host_bow_clusters = all_tag_posts_filtered.explode("tags").rename({"tags": "tag"}).join(
    clusters, on="tag", how="inner"
).drop("tag").join(
    cluster_names, on="cluster", how="inner"
).drop("cluster").unique(["host", "id", "tag"]).group_by("host").agg([
    pl.col("tag")
])
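
# Quick look at the per-host documents; each row pairs a host with its list
# of cluster names (column names per the joins above).
host_bow_clusters.head()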

# Bag-of-words over cluster names, then a 100-topic LSI model.
bow_docs = host_bow_clusters["tag"].to_list()
dictionary = Dictionary(bow_docs)
bow = [dictionary.doc2bow(x) for x in bow_docs]
lsi_model = LsiModel(bow, id2word=dictionary, num_topics=100)
lsi_vectors = [lsi_model[doc] for doc in bow]
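
# One way to eyeball a single topic's strongest cluster names; show_topic
# returns (term, weight) pairs for the given topic.
lsi_model.show_topic(0, topn=10)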

# Inspect all topics as weighted combinations of cluster names.
lsi_model.print_topics()

# (num_topics, vocab_size) matrix of topic weights.
topic_matrix = lsi_model.get_topics()
topic_matrix_bool = topic_matrix > 0

# Spot-check the cluster named "vote", then grab all cluster names.
cluster_names.filter(pl.col("tag") == "vote")
cluster_name_list = cluster_names["tag"].to_list()

# Per-term total weight magnitude across topics: positive mass minus negative
# mass, i.e. the same as np.sum(np.abs(topic_matrix), axis=0).
topic_cum_weights = np.sum(np.where(topic_matrix < 0, 0, topic_matrix), axis=0) - np.sum(np.where(topic_matrix > 0, 0, topic_matrix), axis=0)

# Terms ranked by total absolute topic weight. Columns of topic_matrix follow
# the gensim dictionary's id order, so map ids back to terms via the
# dictionary rather than indexing into cluster_names directly.
np.array([dictionary[j] for j in np.argsort(-np.sum(np.abs(topic_matrix), axis=0))])

# Pairwise euclidean distances between terms in topic space (columns of
# topic_matrix).
from scipy.spatial.distance import pdist, squareform

distances = squareform(pdist(np.transpose(topic_matrix), 'euclidean'))
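
# A small lookup sketch (assumed helper, not in the original): find the terms
# closest to a given term in topic space, using the distance matrix above.
def nearest_terms(term: str, k: int = 10):
    i = dictionary.token2id[term]
    order = np.argsort(distances[i])
    return [dictionary[j] for j in order[1:k + 1]]  # skip the term itself

# nearest_terms("linux")  # example term is illustrative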

# Positive part of the topic matrix (note: the result is not assigned).
np.clip(topic_matrix, 0, None)

def opposite_sum(a):
    # Positive mass minus negative mass, i.e. the sum of absolute values.
    return np.sum(np.where(a > 0, a, 0)) - np.sum(np.where(a < 0, a, 0))

def get_information_gain(model, dictionary, word: str, words_yes: list[str], words_no: list[str]):
    # Score a candidate `word`: project the picked topics plus the candidate
    # and the rejected topics into LSI space, and measure how far apart the
    # two vectors are.
    words_yes = words_yes + [word]
    yes = np.array([x[1] for x in model[dictionary.doc2bow(words_yes)]])
    no = np.array([x[1] for x in model[dictionary.doc2bow(words_no)]])
    if len(words_no) == 0:
        # Nothing rejected yet, so there is no vector to compare against;
        # fall back to the candidate vector's total topic mass.
        return opposite_sum(yes)
    return np.sum(np.abs(yes - no))

get_information_gain(lsi_model, dictionary, "turkey", [], [])

# Baseline score for every cluster name with nothing picked or rejected yet.
ig = [get_information_gain(lsi_model, dictionary, x, [], []) for x in cluster_name_list]

u_matrix = lsi_model.projection.u  # U matrix (term-topic matrix)
singular_values = lsi_model.projection.s  # singular values
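
# Rough look at how much variance each topic captures: cumulative share of
# squared singular values (standard SVD bookkeeping, not in the original).
np.cumsum(singular_values ** 2) / np.sum(singular_values ** 2)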

# Idea: construct two lists, one of selected topics and one of "anti-topics",
# then score every remaining candidate against them.
picked_topics = []
unpicked_topics = ["movies", "genealogy", "nfl", "horror", "aiart", "media"]
ig = []
for x in cluster_name_list:
    if x not in picked_topics + unpicked_topics:
        ig.append(get_information_gain(lsi_model, dictionary, x, picked_topics, unpicked_topics))
    else:
        ig.append(0)

# The highest-scoring candidate is the next topic to ask about.
cluster_name_list[np.argmax(ig)]
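
# A minimal sketch (assumed helper, not in the original) of repeating the
# greedy step: rescore the remaining candidates against the current yes/no
# lists and return the best one.
def pick_next_topic(picked: list[str], unpicked: list[str]) -> str:
    scores = [
        get_information_gain(lsi_model, dictionary, x, picked, unpicked)
        if x not in picked + unpicked
        else 0
        for x in cluster_name_list
    ]
    return cluster_name_list[int(np.argmax(scores))]

# pick_next_topic(["linux"], ["nfl"])  # hypothetical answers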

# gensim's MatrixSimilarity is an alternative to sklearn's cosine_similarity
# for the host ranking below.
from gensim.similarities import Similarity, MatrixSimilarity

# A hand-picked interest profile (overwritten below by one derived from a
# real account's tags).
my_profile = dictionary.doc2bow([
    "polars", "fediverse", "mastodon", "quartopub", "influencer", "julia", "introduction",
    "chicago"
])

# Pull one account's tagged posts to use as a test profile.
x = pl.scan_ipc("data/tags-2020-2022.feather").filter(
    (pl.col("host") == "elekk.xyz") & (pl.col("acct") == "hoppet")
).collect()

# Map the account's tags to cluster names, mirroring the host pipeline above.
y = x.explode("tags").with_columns(pl.col("tags").str.to_lowercase()).rename({"tags": "tag"}).join(
    clusters, on="tag", how="inner"
).drop("tag").join(
    cluster_names, on="cluster", how="inner"
).drop("cluster")

my_profile = dictionary.doc2bow(y["tag"].to_list())

my_vector = [x[1] for x in lsi_model[my_profile]]

def build_host_topic_matrix(v):
    # v is a list of sparse LSI vectors, i.e. lists of (topic_id, weight)
    # pairs; keep just the weights.
    return [[y[1] for y in x] for x in v]

# Rank hosts by cosine similarity between the profile vector and each host's
# LSI vector; rows of lsi_vectors line up with host_bow_clusters["host"].
similarities = cosine_similarity([my_vector], build_host_topic_matrix(lsi_vectors))[0]
sim_df = pl.DataFrame({
    "host": host_bow_clusters["host"],
    "similarity": similarities
}).sort("similarity", descending=True)
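
# Equivalent ranking via gensim's MatrixSimilarity, a sketch using the import
# above; index[query] returns cosine similarities in corpus order.
index = MatrixSimilarity(lsi_vectors, num_features=lsi_model.num_topics)
gensim_sims = index[lsi_model[my_profile]]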

# --- Earlier experiments, kept for reference ---

#n_components, comp_labels = scipy.sparse.csgraph.connected_components(S, directed=False)

#m, tag_index = document_matrix(common_tags["tags"].to_list(), all_tag_posts_filtered["tags"].to_list())

# LSI/HDP over raw tags instead of cluster names:
#dict = Dictionary(all_tag_posts_filtered["tags"].to_list())
#bow = [dict.doc2bow(x) for x in all_tag_posts_filtered["tags"].to_list()]#[0:100000]]
#tf_idf_model = TfidfModel(bow)#, dictionary=dict)
#tf_idf = [tf_idf_model[doc] for doc in bow]
#hdp = HdpModel(bow, dict)

# tf-idf on m:
#from sklearn.feature_extraction.text import TfidfTransformer
#posts_tf_idf = TfidfTransformer().fit_transform(m)
"""
|
|
pairs = all_tag_posts_filtered.with_columns(pl.col("tags").map_elements(pairwise_sets).alias("pairs")).explode("pairs").select(pl.col(["host", "acct", "pairs"])).unique()
|
|
pairs_counts = pairs.group_by("pairs").count().sort("count", descending=True).with_columns(
|
|
pl.col("pairs").map_elements(lambda x: x.split(",")).alias("pairs")
|
|
)
|
|
total_pairs = len(pairs)
|
|
df = pairs_counts.with_columns(
|
|
pl.col("pairs").list.get(0).alias("first"),
|
|
pl.col("pairs").list.get(1).alias("last")
|
|
).drop(["pairs"]).join(
|
|
account_paired_tag_counts.rename({"tags":"first","count":"first_count"}),on="first",how="inner"
|
|
).join(
|
|
account_paired_tag_counts.rename({"tags":"last","count":"last_count"}),on="last",how="inner"
|
|
)
|
|
""" |