# Exploratory analysis of Mastodon server and hashtag similarity.
#
# Part 1 builds a pairwise server-similarity edge list for the servers listed
# in the joinmastodon export.  Part 2 repeats the matrix construction for every
# host found in the scraped tag posts, then clusters hashtags with affinity
# propagation on a tag-by-tag cosine-similarity matrix.
#
# NOTE(review): `pl`, `np`, `scipy`, `TagData`, `built_tfidf_matrix` and
# `cosine_similarity` are expected to arrive via the star import below --
# confirm against federated_design.
from federated_design import *
from sklearn.cluster import AffinityPropagation


def _build_tag_matrices(servers):
    """Build the tag data, weighted tag matrix and its normalised transpose.

    Args:
        servers: collection of host names handed to ``TagData``.

    Returns:
        A ``(tag_data, weights, mat, m)`` tuple where ``weights`` is the output
        of ``TagData.bm``, ``mat`` is the matrix built from it, and ``m`` is
        ``mat.T`` with every column divided by its L2 norm.
    """
    # NOTE(review): the meaning of 256 and of the bm() thresholds is defined in
    # federated_design -- confirm there before changing them.
    tag_data = TagData(servers, 256, min_server_accounts=2)
    weights = tag_data.bm(n_server_accounts=0, n_servers=2, n_accounts=10)
    mat = built_tfidf_matrix(weights, tag_data.tag_to_index, tag_data.host_to_index)
    # The code below treats rows of `mat` as tags and columns as hosts, so
    # `mat.T` is host-by-tag.  The small constant keeps all-zero columns from
    # dividing by zero.
    m = (mat.T / (scipy.sparse.linalg.norm(mat.T, ord=2, axis=0) + 0.0001))
    return tag_data, weights, mat, m


def _pairwise_similarity_frame(hosts, similarity):
    """Return a Source/Target/Similarity frame for host pairs i < j.

    Only the strict upper triangle of ``similarity`` is emitted, and pairs
    whose similarity is not strictly positive are dropped.

    Args:
        hosts: host names, positionally aligned with the rows of ``similarity``
            (presumably the insertion order of ``host_to_index`` -- verify).
        similarity: square, indexable similarity matrix.
    """
    frames = [
        pl.DataFrame({
            "Source": hosts[i],
            "Target": hosts[i + 1:],
            "Similarity": similarity[i][i + 1:],
        })
        for i in range(np.shape(similarity)[0] - 1)
    ]
    return pl.concat(frames).filter(pl.col("Similarity") > 0.0)


if __name__ == '__main__':
    # ---- Part 1: server-to-server similarity for joinmastodon servers ----
    jm = pl.read_json("data/joinmastodon-2023-08-25.json")
    jm_servers = set(jm["domain"].unique().to_list())
    jm_td, jm_tfidf, mat, m = _build_tag_matrices(jm_servers)
    server_similarity = cosine_similarity(m.tocsr())
    # Build the host list once instead of twice per row of the matrix.
    similarity_df = _pairwise_similarity_frame(
        list(jm_td.host_to_index.keys()), server_similarity
    )

    # ---- Part 2: hashtag clustering over every host with tag posts ----
    # fraction=1.0 keeps every host; lower it to work on a random subsample.
    server_samples = set(
        pl.scan_ipc("data/scratch/all_tag_posts.feather")
        .select("host")
        .unique()
        .collect()
        .sample(fraction=1.0)["host"]
        .to_list()
    )
    # The jm_* names are rebound here to the sample-based data.
    jm_td, jm_tfidf, mat, m = _build_tag_matrices(server_samples)
    # Not used further below; kept so it can be inspected interactively.
    server_similarity = cosine_similarity(m.tocsr())

    # Number of hosts with a positive weight for each tag, flattened to 1-D.
    tag_use_counts = np.asarray(np.sum(mat > 0, axis=1)).ravel()
    # Keep only tags that appear on at least three hosts.
    has_info = tag_use_counts >= 3
    tag_names = np.array(list(jm_td.tag_to_index.keys()))[has_info]
    tag_servers = tag_use_counts[has_info]
    m_selected = m.tocsr()[:, has_info]
    tag_sm = cosine_similarity(m_selected.T)

    ap = AffinityPropagation(affinity="precomputed", random_state=0).fit(tag_sm)
    clusters = pl.DataFrame(
        {"tag": tag_names, "cluster": ap.labels_, "servers": tag_servers}
    )
    # Most widely used tag of each cluster.  keep="first" is required for the
    # preceding sort to decide which row survives; the default keeps any row.
    top_tag_per_cluster = clusters.sort("servers", descending=True).unique(
        "cluster", keep="first", maintain_order=True
    )

    # Second pass: drop tags whose similarity column does not sum to a positive
    # value, then cluster again.
    tag_index_included = (np.sum(tag_sm, axis=0) > 0)
    # The mask is aligned with the filtered tag set (`tag_names`), not with the
    # full tag vocabulary, so it must index the filtered arrays.
    included_tag_strings = tag_names[tag_index_included]
    tag_sm_matrix = tag_sm[np.ix_(tag_index_included, tag_index_included)]
    ap = AffinityPropagation(affinity="precomputed", random_state=0).fit(tag_sm_matrix)
    clusters = pl.DataFrame({
        "tag": included_tag_strings,
        "cluster": ap.labels_,
        # Carried along so the "servers" filter below has a column to read.
        "servers": tag_servers[tag_index_included],
    })

    # One randomly chosen tag per cluster, ordered by cluster id.
    random_tag_per_cluster = (
        clusters.group_by("cluster")
        .agg([pl.col("tag").shuffle().first().alias("tag")])
        .sort("cluster")["tag"]
        .to_list()
    )
    # Cluster sizes, largest first.
    cluster_sizes = (
        clusters.group_by("cluster")
        .agg([pl.col("tag").len().alias("count")])
        .sort("count", descending=True)
    )
    # Tags that appear on at least ten hosts.
    widely_used_tags = clusters.filter(pl.col("servers") >= 10)