import polars as pl from scipy.sparse import lil_matrix from sklearn.metrics.pairwise import cosine_similarity import numpy as np import textdistance from scipy.stats import kendalltau import rbo def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_matrix: #tag_to_index = {tag: i for i, tag in enumerate(tfidf["tags"].unique().sort().to_list())} n_tags = len(tag_to_index) #host_to_index = {host: i for i, host in enumerate(tfidf["host"].unique().sort().to_list())} n_hosts = len(host_to_index) m = lil_matrix((n_tags, n_hosts), dtype=float) for row in df.iter_rows(named=True): m[tag_to_index[row["tags"]], host_to_index[row["host"]]] = row["tf_idf"] return m class TagData: def __init__(self, servers: set[str], n_tags: int, min_server_accounts: int = 1): self.servers = servers self.n_tags = n_tags all_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather").filter( #all_tag_posts = read_tag_posts.filter( pl.col("created_at") >= pl.date(2023, 2, 1) ).filter(pl.col("created_at") < pl.date(2023, 8, 1)).filter( pl.col("host").is_in(servers) ) all_tag_posts_topn = all_tag_posts.explode("tags").unique(["host", "acct", "tags"]).group_by(["host", "tags"]).agg([ pl.col("id").len().alias("accounts"), # How many accounts on the server are using this tag? ]).sort("accounts", descending=True).with_columns(pl.lit(1).alias("counter")).with_columns( pl.col("counter").cumsum().over("host").alias("running_count") ).filter(pl.col("running_count") <= n_tags).drop("counter", "running_count").filter(pl.col("accounts") >= min_server_accounts) self._all_tag_posts_topn = all_tag_posts_topn self._server_accounts = all_tag_posts_topn.group_by("host").agg([ pl.sum("accounts").alias("accounts_sum"), # The total number of account-tag pairs ])#.filter(pl.col("server_accounts") >= 10) #self._server_accounts = all_tag_posts.unique(["host", "acct"]).group_by("host").agg([ # pl.col("acct").len().alias("accounts_sum"), # The total number of accounts on the server #]) self._most_seen_tags = self._all_tag_posts_topn.group_by("tags").agg([ pl.sum("accounts").alias("total_accounts"), # account sum, how many accounts are using this tag excluding those on servers where they are the only ones pl.col("accounts").len().alias("server_count") # server count, how many servers are using this tag? ]).sort("server_count", descending=True)#.filter(pl.col("server_count") >= 3).filter(pl.col("total_accounts") >= 10) self.tag_to_index = {tag: i for i, tag in enumerate(self._all_tag_posts_topn["tags"].unique().sort().to_list())} self.host_to_index = {host: i for i, host in enumerate(self._all_tag_posts_topn["host"].unique().sort().to_list())} def server_accounts(self, n=10): return self._server_accounts.filter(pl.col("accounts_sum") >= n) def most_seen_tags(self, n_servers=3, n_accounts=10): return self._most_seen_tags.filter(pl.col("server_count") >= n_servers).filter(pl.col("total_accounts") >= n_accounts) def tfidf(self, n_server_accounts=5, n_servers=3, n_accounts=10): most_seen_tags = self.most_seen_tags(n_servers, n_accounts) server_accounts = self.server_accounts(n_server_accounts) tf = self._all_tag_posts_topn.join( most_seen_tags, on="tags", how="inner" ).join( server_accounts, on="host", how="inner" ).with_columns( (pl.col("accounts") / pl.col("accounts_sum")).alias("tf") ) n_servers = len(self._all_tag_posts_topn.unique("host")) idf = most_seen_tags.with_columns((n_servers/pl.col("server_count")).alias("idf")) tfidf = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True) return tfidf # Constraint: What if we only consider the _top_ 100 tags from each server? # Server clusters work quite well! # Tag clusters? #tag_simiarlity = cosine_similarity(full_mat.tocsr()) #tag_simiarlity[td.tag_to_index["ai"]] #np.array(list(td.tag_to_index.keys()))[np.argsort(-tag_simiarlity[td.tag_to_index["ai"]])][0:10] #np.array(list(td.tag_to_index.keys()))[np.argsort(-tag_simiarlity[td.tag_to_index["mastoart"]])][0:10] #baseline = np.argsort(-host_simiarlity[host_to_index["hci.social"]]) def sampler(host_list, n_servers, n_tags, baseline, baseline_td: TagData): baseline_keys = set(baseline_td.host_to_index.keys()) server_samples = set(host_list.filter( pl.col("host").is_in(baseline_keys) ).sample(n=n_servers-1)["host"].to_list()) server_is = [baseline_td.host_to_index[i] for i in server_samples] sampled_server_indices = np.array(server_is) tagdata = TagData(server_samples, n_tags, min_server_accounts=5) tfidf = tagdata.tfidf(n_server_accounts=5, n_servers=3, n_accounts=10)#n_server_accounts=0, n_servers=2, n_accounts=1) m = built_tfidf_matrix(tfidf, baseline_td.tag_to_index, baseline_td.host_to_index) host_sim = cosine_similarity(m.tocsr().T) rs = [] for serv in server_samples: comp_server_index = baseline_td.host_to_index[serv] bl = np.argsort(-baseline[comp_server_index][sampled_server_indices]) comparison = np.argsort(-host_sim[comp_server_index][sampled_server_indices]) reference_ranks = {x: i for i, x in enumerate(bl)} current_ranks = [reference_ranks[x] for x in comparison] r = rbo.RankingSimilarity(list(range(len(current_ranks)))[1:], current_ranks[1:]).rbo(p=0.80, k=16, ext=True) rs.append(r) return rs def run_simulations(): #read_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather") server_samples = set(pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction = 1.0)["host"].to_list()) td = TagData(server_samples, 1_000_000, min_server_accounts=5) tfidf = td.tfidf() baseline_host_to_index = td.host_to_index full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index) baseline_similarlity = cosine_similarity(full_mat.tocsr().T) #np.array(list(td.host_to_index.keys()))[np.argsort(-baseline_similarlity[td.host_to_index["hci.social"]])][0:10] #np.array(list(td.host_to_index.keys()))[np.argsort(-baseline_similarlity[td.host_to_index["urbanists.social"]])][0:10] host_list = pl.scan_ipc( "data/scratch/all_tag_posts.feather" ).select("host").unique().collect() runs = [] for server_sizes in [256, 128, 64, 32]: # for tag_counts in [4096, 2048, 1028, 512, 256, 128, 64, 32, 16, 8, 4]: for run in range(128): print(server_sizes, tag_counts, run) s = sampler(host_list, server_sizes, tag_counts, baseline_similarlity, td) runs.append(pl.DataFrame({"servers": server_sizes, "tags": tag_counts, "run": run, "rbo": s})) print(np.mean(s)) all_runs = pl.concat(runs) all_runs.write_ipc("data/scratch/simulation_rbo.feather") jm = pl.read_json("data/joinmastodon-2023-08-25.json") jm_servers = set(jm["domain"].unique().to_list()) jm_td = td = TagData(jm_servers, 32, min_server_accounts=5) jm_tfidf = jm_td.tfidf(n_server_accounts=5, n_servers=3, n_accounts=10) mat = built_tfidf_matrix(jm_tfidf, jm_td.tag_to_index, jm_td.host_to_index) similarlity = cosine_similarity(mat.tocsr().T) tag_sm = cosine_similarity(mat.tocsr()) tag_index_included = (np.sum(tag_sm, axis=0) > 0) included_tag_strings = np.array(list(jm_td.tag_to_index.keys()))[tag_index_included] tag_sm_matrix = tag_sm[np.ix_(tag_index_included, tag_index_included)] # import Affinity Prop from sklearn.cluster import AffinityPropagation ap = AffinityPropagation(affinity="precomputed", random_state=0).fit(tag_sm_matrix) clusters = pl.DataFrame({"tag": included_tag_strings, "cluster": ap.labels_}) # select a random element from each cluster clusters.group_by("cluster").agg([pl.col("tag").shuffle().first().alias("tag")]).sort("cluster")["tag"].to_list() example_topics = ["tech", "linux", "hacking", "gamedev"] example_indices = [s in example_topics for s in included_tag_strings] similar_servers = cosine_similarity(np.array(example_indices).reshape(-1,1).T, mat[np.ix_(tag_index_included)].T) np.array(list(jm_td.host_to_index.keys()))[np.argsort(-similar_servers[0])][0:10] #np.array(list(jm_td.host_to_index.keys()))[np.argsort(-similarlity[jm_td.host_to_index["historians.social"]])][0:10] #np.array(list(jm_td.host_to_index.keys()))[np.where(np.sum(mat, axis=0) < 0.01)[1]]