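"""Build tag and server embeddings for a Mastodon server recommender.

Loads the joinmastodon server list, builds a tf-idf matrix of tag use
per server via `federated_design`, embeds servers and tags with a sparse
SVD, and exports 2-D projections and factor matrices for the recommender
front end (Feather files in data/scratch/, JSON in recommender/data/).
"""
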
import json

import numpy as np
import polars as pl
from scipy.sparse.linalg import svds
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

import federated_design


class ReccModel:
    def __init__(self, data_dir="data/"):
        self.data_dir = data_dir
        jm = pl.read_json(f"{self.data_dir}joinmastodon-2023-08-25.json")
        jm_servers = set(jm["domain"].unique().to_list())
        self.td = federated_design.TagData(self.data_dir, jm_servers, 256, min_server_accounts=2)
        # Build the tf-idf matrix (rows are tags, columns are servers).
        self.tfidf = self.td.bm(n_server_accounts=0, n_servers=2, n_accounts=5)
        self.mat = federated_design.built_tfidf_matrix(self.tfidf, self.td.tag_to_index, self.td.host_to_index)
        # Number of servers each tag appears on (nonzero entries per row).
        self.tag_use_counts = np.array([self.mat.getrow(i).getnnz() for i in range(self.mat.shape[0])])
        # Keep only tags used on at least two servers...
        self.has_info = (self.tag_use_counts >= 2).tolist()
        self.tag_names = np.array(list(self.td.tag_to_index.keys()))[self.has_info]
        # ...and only servers that still use at least one remaining tag.
        self.server_has_info = (np.sum(self.mat[self.has_info], axis=0) > 0).tolist()[0]
        self.server_names = np.array(list(self.td.host_to_index.keys()))[self.server_has_info]
        # Transpose to servers x tags and apply both filters; any
        # normalization is deferred to svd().
        self.m_selected = self.mat.T.tocsr()[:, self.has_info][self.server_has_info]

    def svd(self, k=50, norm_axis=None):
        """Sparse SVD of the server-by-tag matrix.

        If norm_axis is given, rows (1) or columns (0) are L2-normalized
        first. scipy's svds returns singular values in ascending order,
        so the strongest components sit at the end.
        """
        m = self.m_selected
        if norm_axis is not None:
            m = normalize(m, norm="l2", axis=norm_axis)
        u, s, v = svds(m, k=k, which="LM")
        return u, s, v
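
    # Shape reference: with m_selected of shape (n_servers, n_tags),
    # svds gives u: (n_servers, k), s: (k,), v: (k, n_tags), so
    # u @ np.diag(s) @ v approximates m_selected.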

    def top_tags(self):
        """2-D PCA projection of the tag embeddings, as a DataFrame."""
        _, s, v = self.svd(k=25, norm_axis=1)
        # Scale each tag's component loadings by the singular values.
        tag_stuff = np.diag(s) @ v
        pca = PCA(n_components=2)
        tag_pca = pca.fit_transform(tag_stuff.T)
        return pl.DataFrame({
            "tag": self.tag_names,
            "x": tag_pca[:, 0],
            "y": tag_pca[:, 1],
            "variance": np.var(tag_stuff, axis=0),
            "count": self.tag_use_counts[self.has_info].tolist(),
            "index": np.arange(len(self.tag_names)),
        })

    def top_servers(self):
        """Server positions from the two strongest SVD components."""
        u, s, _ = self.svd(k=25)
        server_stuff = normalize((u @ np.diag(s)).T, norm="l2")
        # svds orders components by ascending singular value, so rows
        # -1 and -2 are the two strongest components.
        return pl.DataFrame({
            "server": self.server_names,
            "x": server_stuff[-1],
            "y": server_stuff[-2],
            "index": np.arange(len(self.server_names)),
        })


# This one seems pretty good!
def sim_from_tag_index(rm: ReccModel, index=1000):
    """Cosine similarity between one tag's embedding and every server's."""
    u, s, v = rm.svd(k=25, norm_axis=0)
    m = (np.diag(s) @ v).T  # tag embeddings, one row per tag
    pos = m[index]
    server_matrix = u @ np.diag(s)  # server embeddings, one row per server
    server_sim = cosine_similarity(pos.reshape(1, -1), server_matrix)
    return server_sim
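

# A minimal usage sketch (the helper name is hypothetical, mirroring the
# scratch snippet at the bottom of this file): rank servers for a tag.
def top_servers_for_tag(rm: ReccModel, index: int, n: int = 10):
    sim = sim_from_tag_index(rm, index=index)[0]
    return rm.server_names[np.argsort(-sim)][:n].tolist()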


if __name__ == "__main__":
    rm = ReccModel(data_dir="data/")
    rm.top_tags().write_ipc("data/scratch/tag_svd.feather")
    rm.top_servers().write_ipc("data/scratch/server_svd.feather")

    # Export tag positions and server factors for the recommender front end.
    u, s, v = rm.svd(k=50, norm_axis=1)
    pos_m = v.T @ np.diag(s)  # tag embeddings, one row per tag
    server_matrix = u  # server factors, one row per server
    with open("recommender/data/positions.json", "w") as f:
        f.write(json.dumps(pos_m.tolist()))
    with open("recommender/data/server_matrix.json", "w") as f:
        f.write(json.dumps(server_matrix.tolist()))
    with open("recommender/data/server_names.json", "w") as f:
        f.write(json.dumps(rm.server_names.tolist()))
    with open("recommender/data/tag_names.json", "w") as f:
        f.write(json.dumps(rm.tag_names.tolist()))

    # For each server, keep its top five tags by tf-idf, and only those
    # with tf_idf above 4.
    top_server_tags_df = (
        rm.tfidf.sort(pl.col("tf_idf"), descending=True)
        .with_columns(pl.lit(1).alias("counter"))
        .with_columns(pl.col("counter").cum_sum().over("host").alias("running_count"))
        .filter(pl.col("running_count") <= 5)
        .drop("counter", "running_count")
        .select(["host", "tags", "idf", "tf_idf"])
        .filter(pl.col("tf_idf") > 4)
    )
    top_server_tags = {}
    for row in top_server_tags_df.iter_rows(named=True):
        top_server_tags.setdefault(row["host"], []).append(row["tags"])
    with open("recommender/data/server_top_tags.json", "w") as f:
        f.write(json.dumps(top_server_tags))

    # Alternative: group by host and add a descending rank, e.g.
    # rm.tfidf.sort(pl.col("tf_idf"), descending=True).group_by("host").with_row_index()

    # Scratch: top ten most similar servers for one tag (index 779):
    # rm.server_names[np.argsort(-cosine_similarity(pos_m[779].reshape(1, -1), server_matrix))].tolist()[0][0:10]