from federated_design import *

import json

import numpy as np
import polars as pl
from scipy.sparse.linalg import svds
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import normalize


class ReccModel:
    """SVD-based recommender over a server x tag tf-idf matrix.

    Loads the joinmastodon server list, builds a sparse (tag x server)
    tf-idf matrix via ``federated_design.TagData`` / ``built_tfidf_matrix``,
    filters out tags used on fewer than two servers and servers left with
    no surviving tags, and keeps an L2-row-normalized (server x tag)
    matrix (``self.m_selected``) for truncated SVD.
    """

    def __init__(self):
        jm = pl.read_json("data/joinmastodon-2023-08-25.json")
        jm_servers = set(jm["domain"].unique().to_list())
        self.td = TagData(jm_servers, 256, min_server_accounts=2)
        # tf-idf table and its sparse (tag x server) matrix.
        self.tfidf = self.td.bm(n_server_accounts=0, n_servers=2, n_accounts=5)
        self.mat = built_tfidf_matrix(
            self.tfidf, self.td.tag_to_index, self.td.host_to_index
        )
        # Number of distinct servers each tag appears on (non-zeros per row).
        self.tag_use_counts = np.array(
            [self.mat.getrow(i).getnnz() for i in range(self.mat.shape[0])]
        )
        # Keep only tags seen on at least two servers.
        self.has_info = (self.tag_use_counts >= 2).tolist()
        self.tag_names = np.array(list(self.td.tag_to_index.keys()))[self.has_info]
        # Keep only servers that still use at least one surviving tag.
        self.server_has_info = (np.sum(self.mat[self.has_info], axis=0) > 0).tolist()[0]
        self.server_names = np.array(list(self.td.host_to_index.keys()))[
            self.server_has_info
        ]
        # L2-normalized (server x tag) matrix consumed by svd().
        self.m_selected = normalize(
            self.mat.T.tocsr()[:, self.has_info][self.server_has_info],
            norm="l2",
            axis=1,
        )

    def svd(self, k=50, norm_axis=None):
        """Truncated SVD of the selected server x tag matrix.

        Args:
            k: number of singular values/vectors to keep.
            norm_axis: if not None, re-normalize (l2) along this axis
                before factorizing.

        Returns:
            (u, s, v) from scipy.sparse.linalg.svds; note svds returns
            singular values in ascending order.
        """
        m = self.m_selected
        if norm_axis is not None:
            m = normalize(m, norm="l2", axis=norm_axis)
        # BUG FIX: previously factorized self.m_selected here, silently
        # discarding the norm_axis re-normalization computed above.
        u, s, v = svds(m, k=k, which="LM")
        return u, s, v

    def top_tags(self):
        """Return a 2-D PCA layout of tags in SVD space with usage stats."""
        u, s, v = self.svd(k=25)
        # (k x n_tags): tag coordinates scaled by their singular values.
        tag_stuff = np.diag(s) @ v
        tag_pca = PCA(n_components=2).fit_transform(tag_stuff.T)
        return pl.DataFrame({
            "tag": self.tag_names,
            "x": tag_pca[:, 0],
            "y": tag_pca[:, 1],
            "variance": np.var(tag_stuff, axis=0),
            "count": self.tag_use_counts[self.has_info].tolist(),
            "index": np.arange(len(self.tag_names)),
        })

    def top_servers(self):
        """Return a server layout on the two largest singular directions."""
        u, s, v = self.svd(k=25)
        server_stuff = normalize((u @ np.diag(s)).T, norm="l2")
        # svds orders singular values ascending, so rows -1 and -2 are the
        # two strongest components.
        return pl.DataFrame({
            "server": self.server_names,
            "x": server_stuff[-1],
            "y": server_stuff[-2],
            "index": np.arange(len(self.server_names)),
        })


# This one seem pretty good!
def sim_from_tag_index(rm: ReccModel, index=1000):
    """Cosine similarity between one tag's SVD position and every server.

    Args:
        rm: fitted ReccModel.
        index: row index of the tag in the SVD tag-coordinate matrix.

    Returns:
        (1 x n_servers) array of cosine similarities.
    """
    u, s, v = rm.svd(k=25, norm_axis=0)
    m = (np.diag(s) @ v).T  # (n_tags x k) tag coordinates
    pos = m[index]
    server_matrix = u @ np.diag(s)
    server_sim = cosine_similarity(pos.reshape(1, -1), server_matrix)
    return server_sim


if __name__ == "__main__":
    rm = ReccModel()
    rm.top_tags().write_ipc("data/scratch/tag_svd.feather")
    rm.top_servers().write_ipc("data/scratch/server_svd.feather")

    # Export raw embedding matrices for the recommender front-end.
    u, s, v = rm.svd(k=50, norm_axis=None)
    pos_m = v.T @ np.diag(s)  # tag positions, scaled by singular values
    server_matrix = u  # server embedding (left singular vectors)
    with open("recommender/data/positions.json", "w") as f:
        f.write(json.dumps(pos_m.tolist()))
    with open("recommender/data/server_matrix.json", "w") as f:
        f.write(json.dumps(server_matrix.tolist()))
    with open("recommender/data/server_names.json", "w") as f:
        f.write(json.dumps(rm.server_names.tolist()))
    with open("recommender/data/tag_names.json", "w") as f:
        f.write(json.dumps(rm.tag_names.tolist()))

    # Top five tags per server by tf-idf, keeping only strong (> 4) scores.
    top_server_tags_df = (
        rm.tfidf.sort(pl.col("tf_idf"), descending=True)
        .with_columns(pl.lit(1).alias("counter"))
        .with_columns(
            pl.col("counter").cum_sum().over("host").alias("running_count")
        )
        .filter(pl.col("running_count") <= 5)
        .drop("counter", "running_count")
        .select(["host", "tags", "idf", "tf_idf"])
        .filter(pl.col("tf_idf") > 4)
    )
    top_server_tags = {}
    for row in top_server_tags_df.iter_rows(named=True):
        top_server_tags.setdefault(row["host"], []).append(row["tags"])
    with open("recommender/data/server_top_tags.json", "w") as f:
        f.write(json.dumps(top_server_tags))