import polars as pl
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# import textdistance
# from scipy.stats import kendalltau
import rbo
import scipy


def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_matrix:
    """Build a sparse (tag x host) matrix of tf_idf scores from a long-format frame."""
    n_tags = len(tag_to_index)
    n_hosts = len(host_to_index)
    m = lil_matrix((n_tags, n_hosts), dtype=float)
    for row in df.iter_rows(named=True):
        m[tag_to_index[row["tags"]], host_to_index[row["host"]]] = row["tf_idf"]
    return m

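# Usage sketch (mirroring the calls in run_simulations() below; `td` stands in
# for any TagData instance):
#   scores = td.bm()  # one row per (host, tags) pair with a "tf_idf" column
#   mat = built_tfidf_matrix(scores, td.tag_to_index, td.host_to_index)
#   mat.shape == (len(td.tag_to_index), len(td.host_to_index))
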
class TagData:
    def __init__(self, servers: set[str], n_tags: int, min_server_accounts: int = 1):
        # TODO: minimum tags from server to be included?
        self.servers = servers
        self.n_tags = n_tags
        all_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather").filter(
            pl.col("created_at") >= pl.date(2023, 5, 1)
        ).filter(
            pl.col("created_at") < pl.date(2023, 8, 1)
        ).filter(
            pl.col("host").is_in(servers)
        )
        # Keep, per server, the n_tags tags used by the most accounts (ties broken
        # by tag name), dropping tags used by fewer than min_server_accounts accounts.
        all_tag_posts_topn = all_tag_posts.explode("tags").unique(
            ["host", "acct", "tags"]
        ).group_by(["host", "tags"]).agg([
            pl.col("id").len().alias("accounts"),  # how many accounts on the server use this tag
        ]).sort(["accounts", "tags"], descending=True).with_columns(
            pl.lit(1).alias("counter")
        ).with_columns(
            pl.col("counter").cumsum().over("host").alias("running_count")
        ).filter(
            pl.col("running_count") <= n_tags
        ).drop("counter", "running_count").filter(
            pl.col("accounts") >= min_server_accounts
        )
        self._all_tag_posts_topn = all_tag_posts_topn
        self._server_accounts = all_tag_posts_topn.group_by("host").agg([
            pl.col("tags").len().alias("server_tag_count"),  # total number of tags on the server
            pl.sum("accounts").alias("accounts_sum"),  # total number of account-tag pairs
        ])  # .filter(pl.col("server_accounts") >= 10)
        # Alternative: count distinct accounts per server instead of account-tag pairs.
        # self._server_accounts = all_tag_posts.unique(["host", "acct"]).group_by("host").agg([
        #     pl.col("acct").len().alias("accounts_sum"),  # total number of accounts on the server
        # ])
        self._most_seen_tags = self._all_tag_posts_topn.group_by("tags").agg([
            pl.sum("accounts").alias("total_accounts"),  # how many accounts use this tag, counting only servers that pass the min_server_accounts filter
            pl.col("accounts").len().alias("server_count"),  # how many servers use this tag
        ]).sort("server_count", descending=True)  # .filter(pl.col("server_count") >= 3).filter(pl.col("total_accounts") >= 10)
        self.tag_to_index = {tag: i for i, tag in enumerate(self._all_tag_posts_topn["tags"].unique().sort().to_list())}
        self.host_to_index = {host: i for i, host in enumerate(self._all_tag_posts_topn["host"].unique().sort().to_list())}

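    # For reference, the frames built above have the following shapes:
    #   _all_tag_posts_topn: one row per (host, tags) pair with "accounts"
    #   _server_accounts:    one row per host with "server_tag_count", "accounts_sum"
    #   _most_seen_tags:     one row per tag with "total_accounts", "server_count"
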
    def server_accounts(self, n=10):
        # Servers with at least n account-tag pairs.
        return self._server_accounts.filter(pl.col("accounts_sum") >= n)

    def most_seen_tags(self, n_servers=3, n_accounts=10):
        # Tags used on at least n_servers servers by at least n_accounts accounts.
        return self._most_seen_tags.filter(
            pl.col("server_count") >= n_servers
        ).filter(pl.col("total_accounts") >= n_accounts)

    def tfidf(self, n_server_accounts=5, n_servers=3, n_accounts=10):
        most_seen_tags = self.most_seen_tags(n_servers, n_accounts)
        server_accounts = self.server_accounts(n_server_accounts)
        # Term frequency: the share of a server's account-tag pairs that use this tag.
        tf = self._all_tag_posts_topn.join(
            most_seen_tags, on="tags", how="inner"
        ).join(
            server_accounts, on="host", how="inner"
        ).with_columns(
            (pl.col("accounts") / pl.col("accounts_sum")).alias("tf")
        )
        num_servers = len(self._all_tag_posts_topn.unique("host"))
        # Inverse document frequency, treating servers as documents.
        idf = most_seen_tags.with_columns(
            ((1 + num_servers) / (1 + pl.col("server_count"))).log().alias("idf")
        )
        tfidf = tf.join(idf, on="tags", how="inner").with_columns(
            (pl.col("tf") * pl.col("idf")).alias("tf_idf")
        ).sort("tf_idf", descending=True)
        return tfidf

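    # As computed above, for tag t on server s, with N servers and df(t) the
    # number of servers using t:
    #   tf(t, s) = accounts(t, s) / accounts_sum(s)
    #   idf(t)   = ln((1 + N) / (1 + df(t)))
    #   tf_idf   = tf(t, s) * idf(t)
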
    def bm(self, n_server_accounts=5, n_servers=3, n_accounts=10):
        # BM25-style weighting with servers as documents and account counts as term counts.
        k = 1.2
        b = 0.75
        most_seen_tags = self.most_seen_tags(n_servers, n_accounts)
        server_accounts = self.server_accounts(n_server_accounts)
        num_servers = len(self._all_tag_posts_topn.unique("host"))
        # "Document length" per server, normalized by the mean length.
        D = server_accounts.rename({"accounts_sum": "D"}).with_columns(
            (pl.col("D") / pl.col("D").mean()).alias("nd")
        )
        tf = self._all_tag_posts_topn.join(D, on="host", how="inner").with_columns(
            ((pl.col("accounts") * (k + 1)) / (pl.col("accounts") + k * (1 - b + b * pl.col("nd")))).alias("tf")
        )
        idf = most_seen_tags.with_columns(
            (1 + (num_servers - pl.col("server_count") + 0.5) / (pl.col("server_count") + 0.5)).log().alias("idf")
        )
        bm = tf.join(idf, on="tags", how="inner").with_columns(
            (pl.col("tf") * pl.col("idf")).alias("tf_idf")
        ).sort("tf_idf", descending=True)
        return bm

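# For reference, bm() above mirrors BM25 (k = 1.2, b = 0.75) with servers as
# documents, account counts a(t, s) as term frequencies, and accounts_sum as
# the document length |s|:
#   tf(t, s) = a(t, s) * (k + 1) / (a(t, s) + k * (1 - b + b * |s| / avg|s|))
#   idf(t)   = ln(1 + (N - df(t) + 0.5) / (df(t) + 0.5))
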
# Constraint: What if we only consider the _top_ 100 tags from each server?

# Server clusters work quite well!

# Tag clusters?
# tag_simiarlity = cosine_similarity(full_mat.tocsr())
# tag_simiarlity[td.tag_to_index["ai"]]
# np.array(list(td.tag_to_index.keys()))[np.argsort(-tag_simiarlity[td.tag_to_index["ai"]])][0:10]
# np.array(list(td.tag_to_index.keys()))[np.argsort(-tag_simiarlity[td.tag_to_index["mastoart"]])][0:10]
# baseline = np.argsort(-host_simiarlity[host_to_index["hci.social"]])


def sampler(host_list, n_servers, n_tags, baseline, baseline_td: TagData):
    # Sample n_servers servers from the baseline set, rebuild the host similarity
    # matrix from only their top n_tags tags, and compare the resulting rankings
    # against the baseline rankings with rank-biased overlap (RBO).
    baseline_keys = set(baseline_td.host_to_index.keys())
    server_samples = set(host_list.filter(
        pl.col("host").is_in(baseline_keys)
    ).sample(n=n_servers - 1)["host"].to_list())
    server_is = [baseline_td.host_to_index[i] for i in server_samples]
    sampled_server_indices = np.array(server_is)
    tagdata = TagData(server_samples, n_tags, min_server_accounts=2)
    tfidf = tagdata.bm(n_server_accounts=5, n_servers=3, n_accounts=10)  # n_server_accounts=0, n_servers=2, n_accounts=1
    full_mat = built_tfidf_matrix(tfidf, baseline_td.tag_to_index, baseline_td.host_to_index).T
    m = (full_mat / scipy.sparse.linalg.norm(full_mat, ord=2, axis=0))  # good one
    host_sim = cosine_similarity(m)
    rs = []
    for serv in server_samples:
        comp_server_index = baseline_td.host_to_index[serv]
        bl = np.argsort(-baseline[comp_server_index][sampled_server_indices])
        comparison = np.argsort(-host_sim[comp_server_index][sampled_server_indices])
        reference_ranks = {x: i for i, x in enumerate(bl)}
        current_ranks = [reference_ranks[x] for x in comparison]
        # Skip the top-ranked element (usually the server itself) before scoring.
        r = rbo.RankingSimilarity(list(range(len(current_ranks)))[1:], current_ranks[1:]).rbo(p=0.80, k=16, ext=True)
        rs.append(r)
    return rs


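# sampler() returns one RBO score per sampled server; RBO lies in [0, 1], with 1
# meaning the sampled ranking reproduces the baseline ranking exactly.
# Example call (a sketch; host_list, baseline_similarlity, and td are the local
# names used in run_simulations() below):
#   scores = sampler(host_list, 32, 16, baseline_similarlity, td)
#   np.mean(scores)
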
def run_simulations():
    # read_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather")
    server_samples = set(
        pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction=1.0)["host"].to_list()
    )
    # td = TagData(server_samples, 1_000_000, min_server_accounts=2)
    # tfidf = td.bm(n_server_accounts=5, n_servers=3, n_accounts=10)
    # Baseline: BM25 scores over the top 256 tags from every server.
    td = TagData(server_samples, 256, min_server_accounts=2)
    tfidf = td.bm(n_server_accounts=0, n_servers=2, n_accounts=10)
    baseline_host_to_index = td.host_to_index
    full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index).T
    m = (full_mat / scipy.sparse.linalg.norm(full_mat, ord=2, axis=0))  # good one
    baseline_similarlity = cosine_similarity(m)
    # np.array(list(td.host_to_index.keys()))[np.argsort(-baseline_similarlity[td.host_to_index["hci.social"]])][0:10]
    # np.array(list(td.host_to_index.keys()))[np.argsort(-baseline_similarlity[td.host_to_index["urbanists.social"]])][0:10]
    host_list = pl.scan_ipc(
        "data/scratch/all_tag_posts.feather"
    ).select("host").unique().collect()
    runs = []
    # Sweep the number of sampled servers and the per-server tag budget, 128 runs each.
    for server_sizes in [256, 128, 64, 32]:
        for tag_counts in [256, 128, 64, 32, 16, 8]:
            for run in range(128):
                print(server_sizes, tag_counts, run)
                s = sampler(host_list, server_sizes, tag_counts, baseline_similarlity, td)
                runs.append(pl.DataFrame({"servers": server_sizes, "tags": tag_counts, "run": run, "rbo": s}))
                print(np.mean(s))
    all_runs = pl.concat(runs)
    all_runs.write_ipc("data/scratch/simulation_rbo.feather")
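

# Entry point (an assumption; the original may instead be driven from a notebook
# or another script):
if __name__ == "__main__":
    run_simulations()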