# junior-sheer/FediRecommender/TagData.py

import polars as pl
from scipy.sparse import lil_matrix

def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_matrix:
  """
  Fill a sparse (tags x hosts) matrix with the scores found in `df`.

  Each row of `df` supplies a `tags` value, a `host` value and a `tf_idf`
  score. `tag_to_index` maps the tag to a matrix row and `host_to_index`
  maps the host to a matrix column. When the same (tag, host) pair occurs
  more than once, the last row seen wins. Cells never mentioned stay zero.
  """
  shape = (len(tag_to_index), len(host_to_index))
  matrix = lil_matrix(shape, dtype=float)
  for record in df.iter_rows(named=True):
    score = record["tf_idf"]
    tag_row = tag_to_index[record["tags"]]
    host_col = host_to_index[record["host"]]
    matrix[tag_row, host_col] = score
  return matrix

def get_all_tag_posts(servers, data_dir="data/"):
  """
  Read `{data_dir}/scratch/all_tag_posts.feather` and keep only the posts
  from the given servers that fall inside the fixed date window.

  A post is kept when its `created_at` is on or after 2023-05-01 and
  strictly before 2023-08-01, and its `host` is one of `servers`.
  """
  posts = pl.read_ipc(f"{data_dir}/scratch/all_tag_posts.feather")
  created_at = pl.col("created_at")
  # Half-open window: start inclusive, end exclusive.
  in_window = (created_at >= pl.date(2023, 5, 1)) & (created_at < pl.date(2023, 8, 1))
  on_requested_host = pl.col("host").is_in(servers)
  return posts.filter(in_window).filter(on_requested_host)

class TagData:
  """
  Per-server hashtag usage statistics with TF-IDF and BM25 style weightings.

  The data comes from `{data_dir}/scratch/all_tag_posts.feather`. In the
  weighting methods each server (`host`) plays the role of a document and
  each tag plays the role of a term.
  """

  @staticmethod
  def get_all_tag_posts(servers, data_dir="data"):
    """
    Load the tagged posts of the given servers for the fixed date window.

    Keeps rows whose `created_at` is on or after 2023-05-01 and before
    2023-08-01 and whose `host` is one of `servers`.

    This is a static method so that it can be called through the class
    or through an instance; it takes no `self` argument.
    """
    return pl.read_ipc(f"{data_dir}/scratch/all_tag_posts.feather").filter(
        pl.col("created_at") >= pl.date(2023, 5, 1)
      ).filter(pl.col("created_at") < pl.date(2023, 8, 1)).filter(
        pl.col("host").is_in(servers)
      )

  def __init__(self, servers: set[str], n_tags: int, min_server_accounts: int = 1, data_dir: str = "data"):
    """
    Load the posts and precompute the per-server and per-tag aggregates.

    Args:
      servers: Hosts whose posts are included.
      n_tags: Maximum number of tags kept per host, most-used first.
      min_server_accounts: Minimum number of accounts on a host that must
        use a tag for that (host, tag) pair to be kept. This filter is
        applied after the per-host top-`n_tags` cut.
      data_dir: Directory that contains `scratch/all_tag_posts.feather`.
    """
    self.data_dir = data_dir
    # TODO: minimum tags from server to be included?
    self.servers = servers
    self.n_tags = n_tags
    all_tag_posts = TagData.get_all_tag_posts(servers, self.data_dir)
    # One row per (host, tag): how many accounts on the host use the tag.
    # Each (host, acct, tag) combination is counted once.
    tag_accounts = all_tag_posts.explode("tags").unique(["host", "acct", "tags"]).group_by(["host", "tags"]).agg([
      pl.col("id").len().alias("accounts"),
    ])
    # Rank tags within each host by (accounts, tags) descending using a
    # running count, then keep the first n_tags of each host.
    # NOTE(review): `Expr.cumsum` is spelled `cum_sum` in later polars
    # releases; confirm against the pinned polars version before upgrading.
    all_tag_posts_topn = tag_accounts.sort(["accounts", "tags"], descending=True).with_columns(
      pl.lit(1).alias("counter")
    ).with_columns(
      pl.col("counter").cumsum().over("host").alias("running_count")
    ).filter(pl.col("running_count") <= n_tags).drop("counter", "running_count").filter(
      pl.col("accounts") >= min_server_accounts
    )
    self._all_tag_posts_topn = all_tag_posts_topn
    # Per-host totals over the retained tags. `accounts_sum` counts
    # account-tag pairs, so an account using several tags is counted once
    # per tag; it is not the number of distinct accounts on the host.
    self._server_accounts = all_tag_posts_topn.group_by("host").agg([
      pl.col("tags").len().alias("server_tag_count"), # Number of retained tags on the server
      pl.sum("accounts").alias("accounts_sum"), # Total number of account-tag pairs
    ])
    # Per-tag totals over the retained (host, tag) pairs, most widespread first.
    self._most_seen_tags = self._all_tag_posts_topn.group_by("tags").agg([
      pl.sum("accounts").alias("total_accounts"), # Accounts using the tag, summed over retained servers
      pl.col("accounts").len().alias("server_count"), # Number of retained servers using the tag
    ]).sort("server_count", descending=True)
    # Stable positions (sorted order) for building matrices.
    self.tag_to_index = {tag: i for i, tag in enumerate(self._all_tag_posts_topn["tags"].unique().sort().to_list())}
    self.host_to_index = {host: i for i, host in enumerate(self._all_tag_posts_topn["host"].unique().sort().to_list())}

  def server_accounts(self, n=10):
    """
    Return the per-host totals for hosts whose `accounts_sum` is at least `n`.
    """
    return self._server_accounts.filter(pl.col("accounts_sum") >= n)

  def most_seen_tags(self, n_servers=3, n_accounts=10):
    """
    Return the per-tag totals for tags seen on at least `n_servers` servers
    and used by at least `n_accounts` accounts in total.
    """
    return self._most_seen_tags.filter(pl.col("server_count") >= n_servers).filter(pl.col("total_accounts") >= n_accounts)

  def tfidf(self, n_server_accounts=5, n_servers=3, n_accounts=10):
    """
    TF-IDF algorithm.

    tf is `accounts / accounts_sum` for the (host, tag) pair and idf is
    `log((1 + N) / (1 + server_count))`, where N is the number of hosts in
    the retained top-n table (before the `n_server_accounts` filter).

    Args:
      n_server_accounts: Minimum `accounts_sum` for a host to be included.
      n_servers: Minimum number of servers a tag must appear on.
      n_accounts: Minimum total number of accounts using a tag.

    Returns:
      A DataFrame with `tf`, `idf` and `tf_idf` columns, sorted by
      `tf_idf` descending.
    """
    most_seen_tags = self.most_seen_tags(n_servers, n_accounts)
    server_accounts = self.server_accounts(n_server_accounts)
    tf = self._all_tag_posts_topn.join(
      most_seen_tags, on="tags", how="inner"
    ).join(
      server_accounts, on="host", how="inner"
    ).with_columns(
      (pl.col("accounts") / pl.col("accounts_sum")).alias("tf")
    )
    num_servers = len(self._all_tag_posts_topn.unique("host"))
    idf = most_seen_tags.with_columns(((1 + num_servers)/(1 + pl.col("server_count"))).log().alias("idf"))
    tfidf = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True)
    return tfidf

  def bm(self, n_server_accounts=5, n_servers=3, n_accounts=10, k=1.2, b=0.75):
    """
    BM25 algorithm.

    The term-frequency part is
    `accounts * (k + 1) / (accounts + k * (1 - b + b * nd))`, where `nd`
    is the host's `accounts_sum` divided by the mean `accounts_sum` of the
    included hosts. The idf part is
    `log(1 + (N - server_count + 0.5) / (server_count + 0.5))`, where N is
    the number of hosts in the retained top-n table.

    Args:
      n_server_accounts: Minimum `accounts_sum` for a host to be included.
      n_servers: Minimum number of servers a tag must appear on.
      n_accounts: Minimum total number of accounts using a tag.
      k: Term-frequency saturation constant.
      b: Strength of the length normalisation applied through `nd`.

    Returns:
      A DataFrame sorted by score descending. The score column is named
      `tf_idf`, the same name `tfidf` uses, so both results share a layout.
    """
    most_seen_tags = self.most_seen_tags(n_servers, n_accounts)
    server_accounts = self.server_accounts(n_server_accounts)
    num_servers = len(self._all_tag_posts_topn.unique("host"))
    # "D" is the host's size (account-tag pairs); "nd" is D relative to the mean.
    doc_lengths = server_accounts.rename({"accounts_sum": "D"}).with_columns((pl.col("D") / pl.col("D").mean()).alias("nd"))
    tf = self._all_tag_posts_topn.join(doc_lengths, on="host", how="inner").with_columns(
      ((pl.col("accounts") * (k + 1))/(pl.col("accounts") + k*(1-b+b*pl.col("nd")))).alias("tf")
    )
    idf = most_seen_tags.with_columns(
      (1 + (num_servers - pl.col("server_count") + 0.5)/((pl.col("server_count") + 0.5))).log().alias("idf")
    )
    # The inner join on tags also drops tags that failed the most_seen_tags thresholds.
    scored = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True)
    return scored