Refactor scripts and code.
Split manuscripts into their own directories / projects.
This commit is contained in: parent ecb8a55253, commit 60023b07d1
.gitignore (vendored): 1 line changed
@@ -44,3 +44,4 @@ _targets.yaml
__pycache__/
*.pyc

_freeze-old/
FediRecommender/README.md (new file): 3 lines
@@ -0,0 +1,3 @@
# FediRecommender

Python package for building and running the recommendation system.
FediRecommender/TagData.py (new file): 93 lines
@@ -0,0 +1,93 @@
import polars as pl
from scipy.sparse import lil_matrix


def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_matrix:
    """
    Helper function to build a sparse matrix from the tf-idf dataframe.
    """
    n_tags = len(tag_to_index)
    n_hosts = len(host_to_index)
    m = lil_matrix((n_tags, n_hosts), dtype=float)
    for row in df.iter_rows(named=True):
        m[tag_to_index[row["tags"]], host_to_index[row["host"]]] = row["tf_idf"]
    return m


def get_all_tag_posts(servers, data_dir="data/"):
    return pl.read_ipc(f"{data_dir}/scratch/all_tag_posts.feather").filter(
        pl.col("created_at") >= pl.date(2023, 5, 1)
    ).filter(pl.col("created_at") < pl.date(2023, 8, 1)).filter(
        pl.col("host").is_in(servers)
    )


class TagData:
    @staticmethod
    def get_all_tag_posts(servers, data_dir="data"):
        return pl.read_ipc(f"{data_dir}/scratch/all_tag_posts.feather").filter(
            pl.col("created_at") >= pl.date(2023, 5, 1)
        ).filter(pl.col("created_at") < pl.date(2023, 8, 1)).filter(
            pl.col("host").is_in(servers)
        )

    def __init__(self, servers: set[str], n_tags: int, min_server_accounts: int = 1, data_dir: str = "data"):
        self.data_dir = data_dir
        # TODO: minimum tags from server to be included?
        self.servers = servers
        self.n_tags = n_tags
        all_tag_posts = TagData.get_all_tag_posts(servers, self.data_dir)
        all_tag_posts_topn = all_tag_posts.explode("tags").unique(["host", "acct", "tags"]).group_by(["host", "tags"]).agg([
            pl.col("id").len().alias("accounts"),  # How many accounts on the server are using this tag?
        ]).sort(["accounts", "tags"], descending=True).with_columns(pl.lit(1).alias("counter")).with_columns(
            pl.col("counter").cumsum().over("host").alias("running_count")
        ).filter(pl.col("running_count") <= n_tags).drop("counter", "running_count").filter(pl.col("accounts") >= min_server_accounts)
        self._all_tag_posts_topn = all_tag_posts_topn
        self._server_accounts = all_tag_posts_topn.group_by("host").agg([
            pl.col("tags").len().alias("server_tag_count"),  # The total number of tags on the server
            pl.sum("accounts").alias("accounts_sum"),  # The total number of account-tag pairs
        ])  # .filter(pl.col("server_accounts") >= 10)
        # self._server_accounts = all_tag_posts.unique(["host", "acct"]).group_by("host").agg([
        #     pl.col("acct").len().alias("accounts_sum"),  # The total number of accounts on the server
        # ])
        self._most_seen_tags = self._all_tag_posts_topn.group_by("tags").agg([
            pl.sum("accounts").alias("total_accounts"),  # account sum: how many accounts are using this tag, excluding those on servers where they are the only ones
            pl.col("accounts").len().alias("server_count")  # server count: how many servers are using this tag?
        ]).sort("server_count", descending=True)  # .filter(pl.col("server_count") >= 3).filter(pl.col("total_accounts") >= 10)
        self.tag_to_index = {tag: i for i, tag in enumerate(self._all_tag_posts_topn["tags"].unique().sort().to_list())}
        self.host_to_index = {host: i for i, host in enumerate(self._all_tag_posts_topn["host"].unique().sort().to_list())}

    def server_accounts(self, n=10):
        return self._server_accounts.filter(pl.col("accounts_sum") >= n)

    def most_seen_tags(self, n_servers=3, n_accounts=10):
        return self._most_seen_tags.filter(pl.col("server_count") >= n_servers).filter(pl.col("total_accounts") >= n_accounts)

    def tfidf(self, n_server_accounts=5, n_servers=3, n_accounts=10):
        """
        TF-IDF algorithm.
        """
        most_seen_tags = self.most_seen_tags(n_servers, n_accounts)
        server_accounts = self.server_accounts(n_server_accounts)
        tf = self._all_tag_posts_topn.join(
            most_seen_tags, on="tags", how="inner"
        ).join(
            server_accounts, on="host", how="inner"
        ).with_columns(
            (pl.col("accounts") / pl.col("accounts_sum")).alias("tf")
        )
        num_servers = len(self._all_tag_posts_topn.unique("host"))
        idf = most_seen_tags.with_columns(((1 + num_servers) / (1 + pl.col("server_count"))).log().alias("idf"))
        tfidf = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True)
        return tfidf

    def bm(self, n_server_accounts=5, n_servers=3, n_accounts=10):
        """
        BM25 algorithm.
        """
        k = 1.2
        b = 0.75
        most_seen_tags = self.most_seen_tags(n_servers, n_accounts)
        server_accounts = self.server_accounts(n_server_accounts)
        num_servers = len(self._all_tag_posts_topn.unique("host"))
        D = server_accounts.rename({"accounts_sum": "D"}).with_columns((pl.col("D") / pl.col("D").mean()).alias("nd"))
        tf = self._all_tag_posts_topn.join(D, on="host", how="inner").with_columns(
            ((pl.col("accounts") * (k + 1)) / (pl.col("accounts") + k * (1 - b + b * pl.col("nd")))).alias("tf")
        )
        idf = most_seen_tags.with_columns(
            (1 + (num_servers - pl.col("server_count") + 0.5) / (pl.col("server_count") + 0.5)).log().alias("idf")
        )
        bm = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True)
        return bm
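A minimal usage sketch for this module (not part of the commit): it assumes the package is importable, that `data/scratch/all_tag_posts.feather` exists, and that the server names below are placeholders.

```python
# Hypothetical usage of TagData and built_tfidf_matrix; a sketch, not the project's own script.
from FediRecommender.TagData import TagData, built_tfidf_matrix

servers = {"mastodon.social", "hci.social"}  # assumed example servers
td = TagData(servers, n_tags=256, min_server_accounts=2, data_dir="data")
scores = td.bm(n_server_accounts=0, n_servers=2, n_accounts=5)  # BM25 tag-server scores
mat = built_tfidf_matrix(scores, td.tag_to_index, td.host_to_index)  # tags x hosts sparse matrix
print(mat.shape)
```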
FediRecommender/__init__.py (new file): 2 lines
@@ -0,0 +1,2 @@
from .federated_design import built_tfidf_matrix
from .TagData import TagData
@@ -1,4 +1,5 @@
from federated_design import *
#from federated_design import built_tfidf_matrix, TagData
import federated_design
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
import polars as pl
@@ -6,35 +7,37 @@ from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.decomposition import PCA
import json
import numpy as np

class ReccModel:
    def __init__(self):
        jm = pl.read_json("data/joinmastodon-2023-08-25.json")
    def __init__(self, data_dir="data/"):
        self.data_dir = data_dir
        jm = pl.read_json(f"{self.data_dir}joinmastodon-2023-08-25.json")
        jm_servers = set(jm["domain"].unique().to_list())
        self.td = TagData(jm_servers, 256, min_server_accounts=2)
        self.td = federated_design.TagData(self.data_dir, jm_servers, 256, min_server_accounts=2)
        # Build the tfidf matrix
        self.tfidf = self.td.bm(n_server_accounts=0, n_servers=2, n_accounts=5)
        self.mat = built_tfidf_matrix(self.tfidf, self.td.tag_to_index, self.td.host_to_index)
        self.mat = federated_design.built_tfidf_matrix(self.tfidf, self.td.tag_to_index, self.td.host_to_index)
        #self.tag_use_counts = np.sum(self.mat > 0, axis=1).T
        self.tag_use_counts = np.array([self.mat.getrow(i).getnnz() for i in range(self.mat.shape[0])])
        self.has_info = (self.tag_use_counts >= 2).tolist()
        self.tag_names = np.array(list(self.td.tag_to_index.keys()))[self.has_info]
        self.server_has_info = (np.sum(self.mat[self.has_info], axis=0) > 0).tolist()[0]
        self.server_names = np.array(list(self.td.host_to_index.keys()))[self.server_has_info]
        self.m_selected = normalize(self.mat.T.tocsr()[:, self.has_info][self.server_has_info], norm="l2", axis=1)
        m = self.mat.T.tocsr()[:, self.has_info][self.server_has_info]
        self.m_selected = m  #normalize(m, norm="l2", axis=1)
        #self.svd = TruncatedSVD(n_components=50, n_iter=25, random_state=42).fit(self.m_selected)

    def svd(self, k=50, norm_axis=None):
        m = self.m_selected
        if norm_axis is not None:
            m = normalize(m, norm="l2", axis=norm_axis)
        u, s, v = svds(self.m_selected, k=k, which="LM")
        u, s, v = svds(m, k=k, which="LM")
        return u, s, v

    def top_tags(self):
        u, s, v = self.svd(k=25)
        _, s, v = self.svd(k=25, norm_axis=1)
        tag_stuff = np.diag(s) @ v
        pca = PCA(n_components=2)
        tag_pca = pca.fit_transform(tag_stuff.T)
        print(tag_pca[:, 0])
        return pl.DataFrame({
            "tag": self.tag_names,
            "x": tag_pca[:, 0],
@@ -63,10 +66,10 @@ def sim_from_tag_index(rm: ReccModel, index=1000):
    return server_sim

if __name__ == "__main__":
    rm = ReccModel()
    rm = ReccModel(data_dir="data/")
    rm.top_tags().write_ipc("data/scratch/tag_svd.feather")
    rm.top_servers().write_ipc("data/scratch/server_svd.feather")
    u, s, v = rm.svd(k=50, norm_axis=None)
    u, s, v = rm.svd(k=50, norm_axis=1)
    #pos_m = v.T#(v.T @ np.diag(s))#v.T#
    pos_m = v.T @ np.diag(s)
    server_matrix = u#u @ np.diag(s)#u#
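A sketch of how these SVD factors can be queried for similar servers (illustrative only, not part of the commit; `rm` is the model built above and `hci.social` is the example server used elsewhere in the project):

```python
# Sketch: find the servers most similar to a given server in the SVD space.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

u, s, v = rm.svd(k=50, norm_axis=1)
server_matrix = u                                  # one row per server
sims = cosine_similarity(server_matrix)            # server-by-server similarity
i = rm.server_names.tolist().index("hci.social")   # example query server
top = np.argsort(-sims[i])[1:6]                    # five nearest neighbors, skipping itself
print(rm.server_names[top])
```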
@@ -2,12 +2,13 @@ import polars as pl
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
#import textdistance
#from scipy.stats import kendalltau
import rbo
import scipy

def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_matrix:
    """
    Helper function to build a sparse matrix from the tf-idf dataframe.
    """
    #tag_to_index = {tag: i for i, tag in enumerate(tfidf["tags"].unique().sort().to_list())}
    n_tags = len(tag_to_index)
    #host_to_index = {host: i for i, host in enumerate(tfidf["host"].unique().sort().to_list())}
@@ -18,16 +19,18 @@ def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_mat
    return m

class TagData:
    def __init__(self, servers: set[str], n_tags: int, min_server_accounts: int = 1):
    def get_all_tag_posts(servers, data_dir="data/"):
        return pl.read_ipc(f"{data_dir}/scratch/all_tag_posts.feather").filter(
            pl.col("created_at") >= pl.date(2023, 5, 1)
        ).filter(pl.col("created_at") < pl.date(2023, 8, 1)).filter(
            pl.col("host").is_in(servers)
        )

    def __init__(self, data_dir: str, servers: set[str], n_tags: int, min_server_accounts: int = 1):
        self.data_dir = data_dir
        # TODO: minimum tags from server to be included?
        self.servers = servers
        self.n_tags = n_tags
        all_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather").filter(
        #all_tag_posts = read_tag_posts.filter(
            pl.col("created_at") >= pl.date(2023, 5, 1)
        ).filter(pl.col("created_at") < pl.date(2023, 8, 1)).filter(
            pl.col("host").is_in(servers)
        )
        all_tag_posts = TagData.get_all_tag_posts(servers, self.data_dir)
        all_tag_posts_topn = all_tag_posts.explode("tags").unique(["host", "acct", "tags"]).group_by(["host", "tags"]).agg([
            pl.col("id").len().alias("accounts"),  # How many accounts on the server are using this tag?
        ]).sort(["accounts", "tags"], descending=True).with_columns(pl.lit(1).alias("counter")).with_columns(
@@ -81,18 +84,6 @@ class TagData:
        bm = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True)
        return bm

# Constraint: What if we only consider the _top_ 100 tags from each server?

# Server clusters work quite well!

# Tag clusters?
#tag_simiarlity = cosine_similarity(full_mat.tocsr())
#tag_simiarlity[td.tag_to_index["ai"]]
#np.array(list(td.tag_to_index.keys()))[np.argsort(-tag_simiarlity[td.tag_to_index["ai"]])][0:10]
#np.array(list(td.tag_to_index.keys()))[np.argsort(-tag_simiarlity[td.tag_to_index["mastoart"]])][0:10]
#baseline = np.argsort(-host_simiarlity[host_to_index["hci.social"]])

def sampler(host_list, n_servers, n_tags, baseline, baseline_td: TagData):
    baseline_keys = set(baseline_td.host_to_index.keys())
    server_samples = set(host_list.filter(
@@ -100,7 +91,7 @@ def sampler(host_list, n_servers, n_tags, baseline, baseline_td: TagData):
    ).sample(n=n_servers-1)["host"].to_list())
    server_is = [baseline_td.host_to_index[i] for i in server_samples]
    sampled_server_indices = np.array(server_is)
    tagdata = TagData(server_samples, n_tags, min_server_accounts=2)
    tagdata = TagData("data/", server_samples, n_tags, min_server_accounts=2)
    tfidf = tagdata.bm(n_server_accounts=5, n_servers=3, n_accounts=10)  #n_server_accounts=0, n_servers=2, n_accounts=1)
    full_mat = built_tfidf_matrix(tfidf, baseline_td.tag_to_index, baseline_td.host_to_index).T
    m = (full_mat / scipy.sparse.linalg.norm(full_mat, ord=2, axis=0))  # good one
@@ -121,7 +112,7 @@ def run_simulations():
    server_samples = set(pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction = 1.0)["host"].to_list())
    #td = TagData(server_samples, 1_000_000, min_server_accounts=2)
    #tfidf = td.bm(n_server_accounts=5, n_servers=3, n_accounts=10)
    td = TagData(server_samples, 256, min_server_accounts=2)
    td = TagData("data/", server_samples, 256, min_server_accounts=2)
    tfidf = td.bm(n_server_accounts=0, n_servers=2, n_accounts=10)
    baseline_host_to_index = td.host_to_index
    full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index).T
Makefile: 7 lines changed
@@ -1,3 +1,10 @@
scripts:
	python3 -m scripts.similar_servers
	python -m scripts.all_tag_posts

manuscripts:
	quarto render manuscripts/ic2s2-2024

acm:
	quarto render --profile acm

@@ -14,6 +14,8 @@

### Environment Setup

I created this project with R version 4.4.1 and Python version 3.9.22.

This project uses [renv](https://rstudio.github.io/renv/) to manage R package dependencies. To set up the environment, run the following command in R:

```r
@@ -3,8 +3,6 @@ project:
  output-dir: "_out"
  preview:
    port: 2059
    #host: 0.0.0.0
    #browser: false
    watch-inputs: true
format:
  acm-html:
@@ -14,11 +12,11 @@ format:
    include-in-header:
      - text: |
          \usepackage{siunitx}
    docx: default
    #docx: default
profile:
  default: acm
manuscript:
  article: acm.qmd
  article: index.qmd
  code-links:
    - text: Preprocessing
      href: code/preprocess.py
@@ -1,2 +0,0 @@
from .build_suggestion import ReccModel, sim_from_tag_index
from .federated_design import TagData
@@ -5,7 +5,8 @@ library(ggsurvfit)
library(coxme)
library(jsonlite)

source(here("code/helpers.R"))
source(here("codebase/R/helpers.R"))

options(arrow.skip_nul = TRUE)

active_period <- 91
@@ -52,7 +53,7 @@ general_servers <- c(
  "ohai.social"
) # > 100 cohort size + strength > .75

server_centrality <- arrow::read_ipc_file("data/scratch/server_centrality.feather") %>%
server_centrality <- arrow::read_ipc_file(here("data/scratch/server_centrality.feather")) %>%
  rename(generality = strength)

sel_a <- a %>%
codebase/__init__.py (new file): 1 line
@@ -0,0 +1 @@

codebase/evaluation.ipynb (new file): 337 lines
@@ -0,0 +1,337 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation of the Recommender System"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "import polars as pl\n",
    "sys.path.append('recc/')\n",
    "from recc import build_suggestion\n",
    "import numpy as np\n",
    "from sklearn.metrics.pairwise import cosine_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rm = build_suggestion.ReccModel(\"../data/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rm.tfidf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tags = rm.tfidf.select(pl.col(\"tags\")).unique()\n",
    "hosts = rm.tfidf.select(pl.col(\"host\")).unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data = pl.scan_ipc(\"../data/scratch/all_tag_posts.feather\").filter(\n",
    "    pl.col(\"created_at\") >= pl.date(2023, 8, 1)\n",
    ").filter(\n",
    "    pl.col(\"created_at\") <= pl.date(2023, 8, 14)\n",
    ").explode(\"tags\").filter(\n",
    "    pl.col(\"tags\").is_in(tags)\n",
    ").filter(\n",
    "    pl.col(\"host\").is_in(hosts)\n",
    ").unique(\n",
    "    [\"host\", \"acct\", \"tags\"]\n",
    ").select(\n",
    "    [\"host\", \"acct\", \"tags\"]\n",
    ").group_by(\n",
    "    [\"host\", \"acct\"]\n",
    ").agg([\n",
    "    pl.col(\"tags\")#.list()\n",
    "]).with_columns(\n",
    "    pl.col(\"tags\").list.len().alias(\"n_tags\")\n",
    ").collect().filter(pl.col(\"n_tags\") >= 2).filter(\n",
    "    pl.col(\"n_tags\") <= 25\n",
    ").sort(\"n_tags\", descending=True)\n",
    "test_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ReccSVD50:\n",
    "    def __init__(self, rm):\n",
    "        self.rm = rm\n",
    "        self.u, self.s, self.v = rm.svd(k=50, norm_axis=1)\n",
    "        self.pos_m = self.v.T @ np.diag(self.s)\n",
    "\n",
    "    def recommend(self, tags: list[str]):\n",
    "        tag_indices = list(map(lambda x: self.rm.tag_names.tolist().index(x), tags))\n",
    "        cs = cosine_similarity(self.pos_m[tag_indices], self.u)\n",
    "        cs_ind = np.argsort(-np.sum(cs, axis=0))\n",
    "        return self.rm.server_names[cs_ind]\n",
    "\n",
    "    def score(self, test_data):\n",
    "        ranks = []\n",
    "        for row in test_data.iter_rows(named=True):\n",
    "            tags = row[\"tags\"]\n",
    "            recc = self.recommend(tags)\n",
    "            rank = np.where(recc == row[\"host\"])[0][0]\n",
    "            ranks.append(rank)\n",
    "        return ranks\n",
    "\n",
    "class ReccRandom(ReccSVD50):\n",
    "    def __init__(self, rm):\n",
    "        self.rm = rm\n",
    "\n",
    "    def recommend(self, tags: list[str]):\n",
    "        return np.random.permutation(self.rm.server_names)\n",
    "\n",
    "class ReccAlternate(ReccSVD50):\n",
    "    def recommend(self, tags: list[str]):\n",
    "        tag_indices = list(map(lambda x: self.rm.tag_names.tolist().index(x), tags))\n",
    "        cs = cosine_similarity(self.pos_m[tag_indices], self.u)\n",
    "        cs_ind = np.argsort(-np.sum(cs, axis=0))\n",
    "        orders = [cs_ind]\n",
    "        for i in range(len(tags)):\n",
    "            orders.append(np.argsort(-cs[i, :]))\n",
    "            #print(np.argsort(-np.sum(cs[i, :], axis=1)))\n",
    "        output = []\n",
    "        for i in range(len(cs_ind)):\n",
    "            for j in range(len(orders)):\n",
    "                if orders[j][i] not in output:\n",
    "                    output.append(orders[j][i])\n",
    "        return self.rm.server_names[output]\n",
    "\n",
    "score_rand = ReccRandom(rm).score(test_data)\n",
    "scores_svd50 = ReccSVD50(rm).score(test_data)\n",
    "scores_alt = ReccAlternate(rm).score(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ReccIDF(ReccSVD50):\n",
    "    def __init__(self, rm):\n",
    "        super().__init__(rm)\n",
    "        self.idf = {}\n",
    "        for row in rm.tfidf.select([\"tags\", \"idf\"]).unique().iter_rows(named=True):\n",
    "            self.idf[row[\"tags\"]] = row[\"idf\"]\n",
    "\n",
    "    def recommend(self, tags: list[str]):\n",
    "        idf_vec = np.array([self.idf[tag] for tag in tags])\n",
    "        tag_indices = list(map(lambda x: self.rm.tag_names.tolist().index(x), tags))\n",
    "        cs = cosine_similarity(self.pos_m[tag_indices], self.u)\n",
    "        cs_ind = np.argsort(-np.sum(np.multiply(cs, idf_vec[:, np.newaxis]), axis=0))\n",
    "        return self.rm.server_names[cs_ind]\n",
    "\n",
    "scores_idf = ReccIDF(rm).score(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(scores_idf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(scores_svd50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tdf = test_data.with_columns(\n",
    "    rand = pl.Series(score_rand),\n",
    "    svd = pl.Series(scores_svd50),\n",
    "    alt = pl.Series(scores_alt),\n",
    "    idf = pl.Series(scores_idf)\n",
    ")\n",
    "tdf.write_ipc(\"../data/scratch/svd50_eval.feather\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Moved Accounts\n",
    "\n",
    "Can we predict moved accounts?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "moved_accounts = pl.read_ipc(\"../data/scratch/individual_moved_accounts.feather\")\n",
    "maccount_tag_posts = moved_accounts.join(\n",
    "    pl.scan_ipc(\"../data/scratch/all_tag_posts.feather\").rename({\n",
    "        \"acct\": \"account\",\n",
    "        \"host\": \"server\"\n",
    "    }).collect(),\n",
    "    on=[\"account\", \"server\"],\n",
    "    how='inner'\n",
    ").explode(\"tags\").filter(\n",
    "    pl.col(\"tags\").is_in(tags)\n",
    ").filter(\n",
    "    pl.col(\"moved_server\").is_in(hosts)\n",
    ").unique(\n",
    "    [\"moved_server\", \"moved_acct\", \"tags\"]\n",
    ").select(\n",
    "    [\"moved_server\", \"moved_acct\", \"tags\"]\n",
    ").group_by(\n",
    "    [\"moved_server\", \"moved_acct\"]\n",
    ").agg([\n",
    "    pl.col(\"tags\")#.list()\n",
    "]).with_columns(\n",
    "    pl.col(\"tags\").list.len().alias(\"n_tags\")\n",
    ").filter(pl.col(\"n_tags\") >= 2).filter(\n",
    "    pl.col(\"n_tags\") <= 1000\n",
    ").sort(\"n_tags\", descending=True).rename({\n",
    "    \"moved_server\": \"host\"\n",
    "})\n",
    "maccount_tag_posts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_rand_moved = ReccRandom(rm).score(maccount_tag_posts)\n",
    "scores_svd50_moved = ReccSVD50(rm).score(maccount_tag_posts)\n",
    "scores_alt_moved = ReccAlternate(rm).score(maccount_tag_posts)\n",
    "scores_idf_moved = ReccIDF(rm).score(maccount_tag_posts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(score_rand_moved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(scores_svd50_moved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.median(scores_svd50_moved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.unique(scores_svd50_moved, return_counts=True)  # numpy equivalent of value_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.unique(scores_svd50_moved, return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(scores_svd50_moved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.sum(np.array(scores_svd50_moved) < 10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "renv-python-3.9",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.22"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
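The `score` methods above reduce to a simple rank metric: the position of the account's actual server in the ordered recommendation list, averaged over accounts. A standalone restatement (a sketch; `recommend` stands in for any of the models above and the names are illustrative):

```python
# Sketch of the rank-based evaluation used in the notebook above.
import numpy as np

def mean_rank(recommend, test_rows):
    """Average position of the true host in each recommendation list (0 = best)."""
    ranks = []
    for row in test_rows:                      # each row: {"host": ..., "tags": [...]}
        recc = recommend(row["tags"])          # ordered array of server names
        ranks.append(int(np.where(recc == row["host"])[0][0]))
    return float(np.mean(ranks))
```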
@@ -144,7 +144,7 @@ def read_tags_file(file: str, accounts: set[str]) -> pl.DataFrame:



from code.load_accounts import read_metadata_file
from codebase.load_accounts import read_metadata_file
metadata = read_metadata_file("data/metadata-2024-01-31.feather").select(pl.col(["server", "user_count"])).filter(pl.col("user_count") >= 100)
accounts = pl.scan_ipc("data/scratch/all_accounts.feather").select(
    pl.col(["server", "acct", "bot", "noindex", "followers_count", "suspended"])
@@ -100,14 +100,24 @@ code-block-bg: false
```{r}
#| label: setup

profile <- Sys.getenv("QUARTO_PROFILE", unset="acm")
if (profile == "acm") {
my_profile <- Sys.getenv("QUARTO_PROFILE", unset="acm")
if (my_profile == "acm") {
  class_wide <- ".column-body"
} else {
  class_wide <- ".column-page"
}

library(here)

get_here <- function(file) {
  here::here(file)
}

envs <- Sys.getenv()

library(modelsummary)
# Revert to old modelsummary system for now.
options(modelsummary_factory_default = 'kableExtra')
```

# Introduction
@@ -128,11 +138,11 @@ In response to these findings, we propose a potential way to create server and t

The Fediverse is a set of decentralized online social networks which interoperate using shared protocols like ActivityPub. Mastodon is a software program used by many Fediverse servers and offers a user experience similar to the Tweetdeck client for Twitter. It was first created in late 2016 and saw a surge in interest in 2022 during and after Elon Musk's Twitter acquisition.

Mastodon features three kinds of timelines. The primary timelines is a "home" timeline which shows all posts from accounts followed by the user. Mastodon also supports a "local" timeline which shows all public posts from the local server and a "federated" timeline which includes all posts from users followed by other users on their server. The local timeline is unique to each server and can be used to discover new accounts and posts from the local community. On larger servers, this timeline can be unwieldy; however, on smaller servers, this presents the opportunity to discover new posts and users of potential interest.
Mastodon features three kinds of timelines. The primary timeline is a "home" timeline which shows all posts from accounts followed by the user. Mastodon also supports a "local" timeline which shows all public posts from the local server and a "federated" timeline which includes all posts from users followed by other users on their server. The local timeline is unique to each server and can be used to discover new accounts and posts from the local community. On larger servers, this timeline can be unwieldy; however, on smaller servers, this presents the opportunity to discover new posts and users of potential interest.

Discovery has been challenging on Masotodon. Text search, for instance, was impossible on most servers until support for this feature was added on an optional, opt-in basis using Elasticsearch in late 2023 [@rochkoMastodon2023]. Recommendation systems are currently a somewhat novel problem in the context of decentralized online social networks. @trienesRecommendingUsersWhom2018 developed a recommendation system for finding new accounts to follow on the Fediverse which used collaborative filtering based on BM25 in an early example of a content discovery system on Mastodon.
Discovery has been challenging on Mastodon. Text search, for instance, was impossible on most servers until support for this feature was added on an optional, opt-in basis using Elasticsearch in late 2023 [@rochkoMastodon2023]. Recommendation systems are currently a somewhat novel problem in the context of decentralized online social networks. @trienesRecommendingUsersWhom2018 developed a recommendation system for finding new accounts to follow on the Fediverse which used collaborative filtering based on BM25 in an early example of a content discovery system on Mastodon.

Individual Mastodon servers can have an effect on the end experience of users. For example, some servers may choose to federate with some servers but not others, altering the topology of the Fediverse network for their users. At the same time, accounts need to be locked into one specific server. Because of Mastodon's data portability, users can move their accounts freely between servers while retaining their followers, though their post history remains with their original account.
Individual Mastodon servers can have an effect on the end experience of users. For example, some servers may choose to federate with some servers but not others, altering the topology of the Fediverse network for their users. At the same time, accounts can only map to one specific server. Because of Mastodon's data portability, users can move their accounts freely between servers while retaining their followers, though their post history remains with their original account.

## The Mastodon Migrations
@@ -150,13 +160,13 @@

## Recommendation Systems and Collaborative Filtering

Recommender systems help people filter information to find resources releveant to some need [@ricciRecommenderSystemsHandbook2022]. The development of these systems as an area of formal study harkens back to information retrieval (e.g. @saltonIntroductionModernInformation1987) and foundational works imagining the role of computing in human decision-making (e.g. @bushWeMayThink1945). Early work on these systems produced more effective ways of filtering and sorting documents in searches such as the probabilistic models that motivated the creation of the okapi (BM25) relevance function [@robertsonProbabilisticRelevanceFramework2009]. Many contemporary recommendation systems use collaborative filtering, a technique which produces new recommendations for items based on the preferences of a collection of similar users [@korenAdvancesCollaborativeFiltering2022].
Recommender systems help people filter information to find resources relevant to some need [@ricciRecommenderSystemsHandbook2022]. The development of these systems as an area of formal study harkens back to information retrieval (e.g. @saltonIntroductionModernInformation1987) and foundational works imagining the role of computing in human decision-making (e.g. @bushWeMayThink1945). Early work on these systems produced more effective ways of filtering and sorting documents in searches such as the probabilistic models that motivated the creation of the okapi (BM25) relevance function [@robertsonProbabilisticRelevanceFramework2009]. Many contemporary recommendation systems use collaborative filtering, a technique which produces new recommendations for items based on the preferences of a collection of similar users [@korenAdvancesCollaborativeFiltering2022].

Collaborative filtering systems build on top of a user-item-rating ($U-I-r$) model where there is a set of users who each provide ratings for a set of items. The system then uses the ratings from other users to predict the ratings of a user for an item they have not yet rated and uses these predictions to create a ordered list of the best recommendations for the user's needs [@ekstrandCollaborativeFilteringRecommender2011 pp. 86-87]. Collaborative filtering recommender systems typically produce better results as the number of users and items in the system increases; however, they must must also deal with the "cold start" problem, where limited data makes recommendations unviable [@lamAddressingColdstartProblem2008]. The cold start problem has three possible facets: boostrapping new communities, dealing with new items, and handling new users [@schaferCollaborativeFilteringRecommender2007 pp. 311-312]. In each case, limited data on the entity makes it impossible to find similar entities without some way of building a profile. Further, uncorrected collaborative filtering techniques often also produce a bias where more broadly popular items receive more recommendations than more obscure but possibly more relevant items [@zhuPopularityOpportunityBiasCollaborative2021]. Research on collaborative filtering has also shown that the quality of recommendations can be improved by using a combination of user-based and item-based collaborative filtering [@sarwarItembasedCollaborativeFiltering2001]. <!-- TODO: check this -->
Collaborative filtering systems build on top of a user-item-rating ($U-I-r$) model where there is a set of users who each provide ratings for a set of items. The system then uses the ratings from other users to predict the ratings of a user for an item they have not yet rated and uses these predictions to create an ordered list of the best recommendations for the user's needs [@ekstrandCollaborativeFilteringRecommender2011 pp. 86-87]. Collaborative filtering recommender systems typically produce better results as the number of users and items in the system increases; however, they must also deal with the "cold start" problem, where limited data makes recommendations unviable [@lamAddressingColdstartProblem2008]. The cold start problem has three possible facets: bootstrapping new communities, dealing with new items, and handling new users [@schaferCollaborativeFilteringRecommender2007 pp. 311-312]. In each case, limited data on the entity makes it impossible to find similar entities without some way of building a profile. Further, uncorrected collaborative filtering techniques often also produce a bias where more broadly popular items receive more recommendations than more obscure but possibly more relevant items [@zhuPopularityOpportunityBiasCollaborative2021]. Research on collaborative filtering has also shown that the quality of recommendations can be improved by using a combination of user-based and item-based collaborative filtering [@sarwarItembasedCollaborativeFiltering2001]. <!-- TODO: check this -->

Although all forms of collaborative filtering use some combination of users and items, there are two main approaches to collaborative filtering: memory-based and model-based. Memory-based approaches use the entire user-item matrix to make recommendations, while model-based approaches use a reduced form of the matrix to make recommendations. This is particularly useful because the matrix of items and users tends to be extremely sparse, e.g. in a movie recommender system, most people have not seen most of the movies in the database. Singular value decomposition (SVD) is one such dimension reduction technique which transforms a $m \times n$ matrix $M$ into the form $M = U \Sigma V^{T}$ [@paterekImprovingRegularizedSingular2007]. SVD is particularly useful for recommendation systems because it can be used to find the latent factors which underlie the user-item matrix and use these factors to make recommendations.

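To make the dimension-reduction step concrete, here is a small illustrative sketch with toy data (the shapes and values are made up, not from the study):

```python
# Illustrative sketch of truncated SVD on a sparse user-item style matrix.
import numpy as np
from scipy.sparse import random as sparse_random
from scipy.sparse.linalg import svds

M = sparse_random(500, 200, density=0.01, random_state=42)  # e.g. 500 tags x 200 servers
u, s, vt = svds(M, k=50)           # rank-50 factors: M is approximated by u @ diag(s) @ vt
M_hat = u @ np.diag(s) @ vt        # low-rank approximation used for recommendations
print(u.shape, s.shape, vt.shape)  # (500, 50) (50,) (50, 200)
```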
While researchers in the recommendation system space often focus on ways to design the system to produce good results mathematically, human-computer interaction researchers also consider various human factors which contribute to the overall system. Crucially, McNee et al. argued “being accurate is not enough”: user-centric evaluations, which consider multiple aspects of the user experience, are necessary to evaluate the full system. HCI researchers have also contributed pioneering recommender systems in practice. For example, GroupLens researchers @resnickGrouplensOpenArchitecture1994 craeted a collaborative filtering systems for Usenet and later produced advancements toward system evalulation and explaination of movie recommendations [@herlockerEvaluatingCollaborativeFiltering2004; @herlockerExplainingCollaborativeFiltering2000]. @cosleySuggestBotUsingIntelligent2007 created a system to match people with tasks on Wikipedia to encourage more editing. This prior work shows that recommender systems can be used to help users find relevant information in a variety of contexts.
While researchers in the recommendation system space often focus on ways to design the system to produce good results mathematically, human-computer interaction researchers also consider various human factors which contribute to the overall system. Crucially, McNee et al. argued “being accurate is not enough”: user-centric evaluations, which consider multiple aspects of the user experience, are necessary to evaluate the full system. HCI researchers have also contributed pioneering recommender systems in practice. For example, GroupLens researchers @resnickGrouplensOpenArchitecture1994 created a collaborative filtering system for Usenet and later produced advancements toward system evaluation and explanation of movie recommendations [@herlockerEvaluatingCollaborativeFiltering2004; @herlockerExplainingCollaborativeFiltering2000]. @cosleySuggestBotUsingIntelligent2007 created a system to match people with tasks on Wikipedia to encourage more editing. This prior work shows that recommender systems can be used to help users find relevant information in a variety of contexts.


## Evaluation of Recommendation Systems
@@ -322,8 +332,6 @@ We again find that accounts on the largest general instances are less likely to
First, we calculate a continuous measure for the generality of the server based on the item-item similarity between servers. We then use this measure to predict whether an account will remain active after 91 days using a logistic regression model.

```{r}
library(modelsummary)

modelsummary(logit)
```

@@ -345,15 +353,16 @@ Mastodon users can move their accounts to another server while retaining their c
library(here)
library(modelsummary)
library(kableExtra)
library(tinytable)
library(purrr)
library(stringr)
load(file = here("data/scratch/ergm-model-early.rda"))
load(file = here("data/scratch/ergm-model-late.rda"))

if (knitr::is_latex_output()) {
  format <- "latex_tabular"
  my_format <- "latex_tabular"
} else {
  format <- "html"
  my_format <- "html"
}

x <- modelsummary(
@@ -371,8 +380,8 @@ x <- modelsummary(
  ),
  align="lrrrr",
  stars = c('*' = .05, '**' = 0.01, '***' = .001),
  output = format
) %>% add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2))
  output = my_format
) |> group_tt(j = list("Model A" = 2:3, "Model B" = 4:5))
```

:::: {#tbl-ergm-table `r class_wide`}
@@ -399,7 +408,7 @@ Based on these findings, we suggest a need for better ways for newcomers to find

The decentralized web presents unique challenges for recommendation systems. Centralized recommendation systems can collect data from all users and use this data to make recommendations. However, this is less desirable on the decentralized web, where data is spread across many servers and users may not want to share their data with a central authority. Instead, I propose a system where servers can report the top hashtags by the number of unique accounts on the server using them during the last three months. Such a system would be opt-in and require few additional server resources since tags already have their own database table. Because each server only reports aggregated counts of publicly posted hashtags, this also reduces the risk of privacy violations.

In the Mastodon context, the cold start problem has two possible facets: there is no information on new servers and there is also no information on new users. New servers are thus likely prone to falling for popularity bias: there is simply more data on larger servers. A common strategy to deal with new users is to ask for some intitial preferences to create an initial workable user profile. In the case of this system, we ask the user to provide a set of tags which they are interested in. We then use these tags to find the top servers which match these tags.
In the Mastodon context, the cold start problem has two possible facets: there is no information on new servers and there is also no information on new users. New servers are thus likely prone to falling for popularity bias: there is simply more data on larger servers. A common strategy to deal with new users is to ask for some initial preferences to create an initial workable user profile. In the case of this system, we ask the user to provide a set of tags which they are interested in. We then use these tags to find the top servers which match these tags.

I plan to evaluate the system in part using the accounts which moved between servers. Based on their posting history (e.g. hashtags), can the recommendation system predict where they will move to?
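To illustrate this tag-based cold-start flow, here is a sketch under assumed names, not the system's actual code: the user's tags select rows of the tag-by-server matrix, and column sums score each server.

```python
# Sketch: recommend servers from a handful of user-supplied tags.
# `mat` (tags x servers), `tag_to_index`, and `server_names` are assumed to
# come from the TagData/built_tfidf_matrix code shown earlier in this commit.
import numpy as np

def recommend_servers(user_tags, mat, tag_to_index, server_names, top_n=5):
    rows = [tag_to_index[t] for t in user_tags if t in tag_to_index]
    scores = np.asarray(mat.tocsr()[rows].sum(axis=0)).ravel()  # sum tag scores per server
    return [server_names[i] for i in np.argsort(-scores)[:top_n]]

# e.g. recommend_servers(["research", "academia", "technology"],
#                        mat, td.tag_to_index, list(td.host_to_index.keys()))
```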
@@ -429,7 +438,7 @@ $$
where $N$ is the total number of servers and $n$ is the number of servers where the tag appears as one of the top tags. We then apply L2 normalization:

$$
tfidf = \frac{tf \cdot idf}{\| tf \cdot idf \|_2}
tf \cdot idf = \frac{tf \cdot idf}{\| tf \cdot idf \|_2}
$$

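As a quick worked instance of the inverse document frequency used here (with made-up numbers), a tag that is a top tag on $n = 10$ of $N = 100$ servers gets

$$
idf = \ln\left(\frac{1 + N}{1 + n}\right) = \ln\left(\frac{101}{11}\right) \approx 2.22
$$

which matches the `((1 + num_servers) / (1 + server_count)).log()` expression in the TF-IDF code above.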
@@ -483,6 +492,8 @@ Top five servers most similar to hci.social
We also calculate the similarity between tags using the same method. This can be used to suggest related tags to users based on their interests.

```{r}
#| eval: false

#| fig-cap: "100 popular hashtags visualized in two dimensions using a principal component analysis (PCA) on the transformed singular value decomposition (SVD) matrix."
library(tidyverse)
library(arrow)
@@ -518,15 +529,29 @@ Given a set of popular tags and a list of servers, we build a recommendation sys

For evaluation, we plan to use data from posts on accounts during a different time period from the one we used to train the recommender system. The goal of the system is to suggest the best servers for these accounts.

```{r}
library(tidyverse)
library(arrow)

# Create a histogram
recc_evals <- arrow::read_ipc_file("data/scratch/svd50_eval.feather")
recc_evals %>%
  ggplot(aes(x=svd)) +
  geom_histogram(binwidth = 5) +
  labs(title = "Distribution of SVD Ranks for Server Recommendations", x = "Rank", y = "Count")
```

The SVD system predicts the server with a median rank of `r median(recc_evals$svd)` and a mean rank of `r round(mean(recc_evals$svd))`.

#### Movement-based

In parallel with the analysis of server survival, we also take an interest in users who moved servers since we can assume that these users found a server they liked better than their original server. We can use the recommender system to predict where these users will move to and use these predictions to evaluate the sytem.
In parallel with the analysis of server survival, we also take an interest in users who moved servers since we can assume that these users found a server they liked better than their original server. We can use the recommender system to predict where these users will move to and use these predictions to evaluate the system.

### Online Evaluation

_I have also given some thought to online evaluation. Could we use an aleternative version of the front-end to produce recomemndations for interesting servers from existing accounts?_
_I have also given some thought to online evaluation. Could we use an alternative version of the front-end to produce recommendations for interesting servers from existing accounts?_

### Rubustness to Limited Data
### Robustness to Limited Data

```{r}
#| label: fig-simulations-rbo
@@ -617,8 +642,8 @@ library(ggrepel)

We also illustrate the potential value of such a system with three user stories:

**User Story 1**: Juan is a human-computer interaction researcher looking for a server to connect with colleagues and also share about his projects. He is interested in finding a server with a focus on research and technology. Juan inputs the tags "research", "academia", and "technology" into the system and receives a list of servers which match his interests: `sciences.social`, `mathstodon.xyz`, `mas.to`, `synapse.cafe`.
**User Story 1**: Juan is a human-computer interaction researcher looking for a server to connect with colleagues and also share about his projects. He is interested in finding a server with a focus on research and technology. Juan inputs the tags "research", "academia", and "technology" into the system and receives a list of servers which match his interests: `synapse.cafe`, `sciences.social`, `mathstodon.xyz`, `mastodon.social`, `mastodon.education`.

**User Story 2** (Arthur) just wants to connect with friends and family. For some reason, Arthur clicks every single major category and gets the suggestions: `mas.to`, `mstdn.social`, `mastodon.world`, `mastodon.social`.
**User Story 2** (Arthur) just wants to connect with friends and family. For some reason, Arthur clicks every single major category and gets the suggestions: `mas.to`, `library.love`, `mastodon.world`, `mstdn.social`.

**User Story 3** (Tracy) has run a niche fandom blog on Tumblr for the last eight years and is curious about migrating to the Fediverse. She inputs the tags "doctorwho", "fanart", and "fanfiction" and gets the suggestions: `blorbo.social`, `sakurajima.moe`, `toot.kif.rocks`, `socel.net`.
**User Story 3** (Tracy) has run a niche fandom blog on Tumblr for the last eight years and is curious about migrating to the Fediverse. She inputs the tags "doctorwho", "fanart", and "fanfiction" and gets the suggestions: `blorbo.social`, `mastodon.nz`, `sakurajima.moe`, `toot.kif.rocks`, `mastodon.scot`.
@@ -11,3 +11,5 @@ Encoding: UTF-8
RnwWeave: knitr
LaTeX: pdfLaTeX

BuildType: Makefile
manuscripts/deweb/.gitignore (vendored, new file): 1 line
@@ -0,0 +1 @@
/.quarto/
manuscripts/deweb/README.md (new file): 3 lines
@@ -0,0 +1,3 @@
# DeWeb Workshop Submission

This manuscript was submitted and accepted to the [1st International Workshop On Decentralizing The Web](https://deweb-workshop.github.io/) at ICWSM 2024.
manuscripts/deweb/_quarto.yml (new file): 14 lines
@@ -0,0 +1,14 @@
project:
  type: manuscript
  #render:
  #- article.qmd
manuscript:
  article: index.qmd
  #environment: renv.lock
execute:
  echo: false
  error: false
  warning: false
  message: false
  freeze: auto
  #cache: true
manuscripts/deweb/index.qmd (new file): 320 lines
@ -0,0 +1,320 @@
|
||||
---
|
||||
title: "Do Servers Matter on Mastodon? Data-driven Design for Decentralized Social Media"
|
||||
short-title: Mastodon Recommendations
|
||||
authors:
|
||||
- name: Carl Colglazier
|
||||
affiliation:
|
||||
name: Northwestern University
|
||||
city: Evanston
|
||||
state: Illinois
|
||||
country: United States
|
||||
corresponding: true
|
||||
bibliography: ../../references.bib
|
||||
pdf-engine: pdflatex
|
||||
format:
|
||||
pdf:
|
||||
output-file: mastodon-recommendations-icwsm.pdf
|
||||
fig-pos: 'ht!bp'
|
||||
cite-method: natbib
|
||||
template: template.tex
|
||||
keep-md: true
|
||||
link-citations: false
|
||||
abstract: |
|
||||
When trying to join Mastodon, a decentralized collection of interoperable social networking servers, new users face the dilemma of choosing a home server. Using trace data from millions of new Mastodon accounts, we show that new accounts are less likely to remain active on the network's largest general instances compared to others. Additionally, we observe a trend of users migrating from larger to smaller servers. Addressing the challenge of onboarding and server selection, the paper proposes a decentralized recommendation system for server using hashtags and the Okapi BM25 algorithm. This system leverages servers' top hashtags and their frequency to create a recommendation mechanism that respects Mastodon's decentralized ethos. Simulations demonstrate that such a tool can be effective even with limited data on each local server.
|
||||
execute:
|
||||
echo: false
|
||||
error: false
|
||||
warning: false
|
||||
message: false
|
||||
freeze: true
|
||||
cache: false
|
||||
fig-width: 6.75
|
||||
knitr:
|
||||
opts_knit:
|
||||
verbose: true
|
||||
---
|
||||
|
||||
```{r}
|
||||
#| label: setup
|
||||
|
||||
profile <- Sys.getenv("QUARTO_PROFILE", unset="acm")
|
||||
if (profile == "acm") {
|
||||
class_wide <- ".column-body"
|
||||
} else {
|
||||
class_wide <- ".column-page"
|
||||
}
|
||||
|
||||
envs <- Sys.getenv()
|
||||
```
|
||||
|
||||
# Introduction
|
||||
|
||||
Following Twitter's 2022 acquisition, Mastodon---an open-source, decentralized social network and microblogging community---saw an increase in activity and attention as a potential Twitter alternative [@heFlockingMastodonTracking2023; @cavaDriversSocialInfluence2023]. While millions of people set up new accounts and significantly increased the size of the network, many of these newcomers and potential newcomers found the process confusing and many accounts did not remain active. Unlike centralized social media platforms, Mastodon is a network of independent servers with their own rules and norms [@nicholsonMastodonRulesCharacterizing2023]. Each server can communicate with each other using the shared ActivityPub protocols and accounts can move between Mastodon servers, but the local experience can vary widely from server to server.
|
||||
|
||||
Although attracting and retaining newcomers is a key challenge for online communities [@krautBuildingSuccessfulOnline2011 p. 182], Mastodon's onboarding process has not always been straightforward. Variation among servers can also present a challenge for newcomers who may not even be aware of the specific rules, norms, or general topics of interest on the server they are joining [@diazUsingMastodonWay2022]. Further, many Mastodon servers have specific norms which people coming from Twitter may find confusing, such as local norms around content warnings [@nicholsonMastodonRulesCharacterizing2023]. Various guides and resources for people trying to join Mastodon offered mixed advice on choosing a server. Some suggest that the most important thing is to simply join any server and work from there [@krasnoffMastodon101How2022; @silberlingBeginnerGuideMastodon2023], while others have created tools and guides to help people find potential servers of interest by size and location[@thekinrarMastodonInstances; @kingMastodonMe2024].
|
||||
|
||||
Mastodon's decentralized design has long been in tension with the disproportionate popularity of a small set of large, general-topic servers within the system [@ramanChallengesDecentralisedWeb2019a]. Analysing the activity of new accounts that join the network, we find that users who sign up on such servers are less likely to remain active after 91 days. We also find that many users who move accounts tend to gravitate toward smaller, more niche servers over time, suggesting that established users may also find additional utility from such servers.
|
||||
|
||||
In response to these findings, we propose a potential way to create server and tag recommendations on Mastodon. This recommendation system could both help newcomers find servers that match their interests and help established accounts discover "neighborhoods" of related servers.
|
||||
|
||||
# Background
|
||||
|
||||
## Empirical Setting
|
||||
|
||||
The Fediverse is a set of decentralized online social networks which interoperate using shared protocols like ActivityPub. Mastodon is a software program used by many Fediverse servers and offers a user experience similar to the Tweetdeck client for Twitter. It was first created in late 2016 and saw a surge in interest in 2022 during and after Elon Musk's Twitter acquisition.
|
||||
|
||||
Mastodon features three kinds of timelines. The primary timelines is a "home" timeline which shows all posts from accounts followed by the user. Mastodon also supports a "local" timeline which shows all public posts from the local server and a "federated" timeline which includes all posts from users followed by other users on their server. The local timeline is unique to each server and can be used to discover new accounts and posts from the local community. On larger servers, this timeline can be unwieldy; however, on smaller servers, this presents the opportunity to discover new posts and users of potential interest.
|
||||
|
||||
Discovery has been challenging on Masotodon. Text search, for instance, was impossible on most servers until support for this feature was added on an optional, opt-in basis using Elasticsearch in late 2023 [@rochkoMastodon2023]. Recommendation systems are currently a somewhat novel problem in the context of decentralized online social networks. @trienesRecommendingUsersWhom2018 developed a recommendation system for finding new accounts to follow on the Fediverse which used collaborative filtering based on BM25 in an early example of a content discovery system on Mastodon.
|
||||
|
||||
Individual Mastodon servers can have an effect on the end experience of users. For example, some servers may choose to federate with some servers but not others, altering the topology of the Fediverse network for their users. At the same time, accounts need to be locked into one specific server. Because of Mastodon's data portability, users can move their accounts freely between servers while retaining their followers, though their post history remains with their original account.

## The Mastodon Migrations

Mastodon saw a surge in interest in 2022 and 2023, particularly after Elon Musk's Twitter acquisition. Four events of interest drove measurable increases in new users to the network: the announcement of the acquisition (April 14, 2022), the closing of the acquisition (October 27, 2022), a day when Twitter suspended a number of prominent journalists (December 15, 2022), and a day when Twitter experienced an outage and started rate limiting accounts (July 1, 2023). Many Twitter accounts announced they were setting up Mastodon accounts and shared links to their new accounts with their followers, often using tags like #TwitterMigration [@heFlockingMastodonTracking2023] and driving interest in Mastodon in a process @lacavaDriversSocialInfluence2023 found consistent with social influence theory.

Some media outlets have framed reports on Mastodon [@hooverMastodonBumpNow2023] through what @zulliRethinkingSocialSocial2020 calls the "Killer Hype Cycle", whereby the media finds a new alternative social media platform, declares it a potential killer of some established platform, and later calls it a failure if it does not displace the existing platform. Such framing fails to take systems like the Fediverse seriously on their own merits: completely replacing existing commercial systems is not the only way to measure success, nor does it account for the real value the Fediverse provides for its millions of active users.

Mastodon's approach to onboarding has also changed over time. In much of 2020 and early 2021, the Mastodon developers closed sign-ups to their flagship server and linked to an alternative server, which saw increased sign-ups during this period. They also linked to a list of servers on the "Join Mastodon" webpage [@mastodonggmbhServers], where all servers are pre-approved and follow the Mastodon Server Covenant, which guarantees certain content moderation standards and data protections. Starting in 2023, the Mastodon developers shifted toward making the flagship server the default when people sign up on the official Mastodon Android and iOS apps [@rochkoNewOnboardingExperience2023; @rothItGettingEasier2023].

## Newcomers in Online Communities

Onboarding newcomers is an important part of the life cycle of online communities. Any community can expect a certain amount of turnover, and so it is important for the long-term health and longevity of the community to be able to bring in new members [@krautBuildingSuccessfulOnline2011 p. 182]. However, the process of onboarding newcomers is not always straightforward.

The series of migrations of new users into Mastodon in many ways reflects folk stories of "Eternal Septembers" on previous communication networks, where a large influx of newcomers challenged existing norms [@driscollWeMisrememberEternal2023; @kieneSurvivingEternalSeptember2016]. Many Mastodon servers do have specific norms which people coming from Twitter may find confusing, such as local norms around content warnings [@nicholsonMastodonRulesCharacterizing2023]. Variation among servers can also present a challenge for newcomers, who may not even be aware of the specific rules, norms, or general topics of interest on the server they are joining [@diazUsingMastodonWay2022]. Mastodon servers open to new accounts must thus be accommodating to newcomers while at the same time ensuring the propagation of their norms and culture, whether through social norms or through technical means.

# Data

```{r}
#| label: fig-account-timeline
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panel shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the announcement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day when Twitter suspended a number of prominent journalists, and a day when Twitter experienced an outage and started rate limiting accounts."
#| fig-height: 2.75
#| fig-width: 6.75
#| fig-env: figure*
## #| fig-pos: tb!

library(here)
source(here("codebase/R/helpers.R"))
account_timeline_plot()
```

Mastodon has an extensive API which allows for the collection of public posts and account information. We collected data from the public timelines of Mastodon servers using the Mastodon API with a crawler which runs once per day. We also collected account information from the opt-in public profile directories on these servers.
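
For orientation, public timelines can be paged through with plain HTTP calls against the documented `/api/v1/timelines/public` endpoint. The sketch below (in Python; the server name is a placeholder and the paging depth is arbitrary) illustrates the kind of request such a crawler makes; it is not our production crawler.

```python
# Minimal sketch of paging a server's public timeline via the Mastodon API.
# "mastodon.example" is a placeholder; error handling is kept minimal.
import requests

def fetch_public_statuses(host: str, pages: int = 3) -> list[dict]:
    statuses, max_id = [], None
    for _ in range(pages):
        params = {"local": "true", "limit": 40}  # 40 is the API maximum
        if max_id is not None:
            params["max_id"] = max_id            # page backward in time
        r = requests.get(f"https://{host}/api/v1/timelines/public",
                         params=params, timeout=30)
        r.raise_for_status()
        batch = r.json()
        if not batch:
            break
        statuses.extend(batch)
        max_id = batch[-1]["id"]                 # oldest status seen so far
    return statuses

# statuses = fetch_public_statuses("mastodon.example")
```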

```{r}
#| label: data-counts
#| cache: true

library(arrow)
library(tidyverse)
library(here)
source(here("codebase/R/helpers.R"))

accounts <- load_accounts(filt = FALSE) %>%
  filter(created_at >= "2020-08-14") %>%
  filter(created_at < "2024-01-01")

tag_posts <- "data/scratch/all_tag_posts.feather" %>%
  here::here() %>%
  arrow::read_ipc_file(col_select = c("host", "acct", "created_at")) %>%
  filter(created_at >= as.Date("2023-05-01")) %>%
  filter(created_at < as.Date("2023-08-01"))

text_format <- function(df) {
  return(format(nrow(df), big.mark = ","))
}

num_tag_posts <- tag_posts %>% text_format()
num_tag_accounts <- tag_posts %>% distinct(host, acct) %>% text_format()
num_tag_servers <- tag_posts %>% distinct(host) %>% text_format()

num_accounts_unfilt <- accounts %>% text_format()
num_account_bots <- accounts %>% filter(bot) %>% text_format()
num_account_nostatuses <- accounts %>% filter(is.na(last_status_at)) %>% text_format()
num_account_suspended <- accounts %>% mutate(suspended = replace_na(suspended, FALSE)) %>% filter(suspended) %>% text_format()
num_accounts_moved <- accounts %>% filter(has_moved) %>% text_format()
num_account_limited <- accounts %>% filter(limited) %>% text_format()
num_account_samedaystatus <- accounts %>% filter(last_status_at <= created_at) %>% text_format()
num_account_filt <- load_accounts(filt = TRUE) %>% text_format()
```

**Mastodon Profiles**: We collected accounts using data previously collected from posts on public Mastodon timelines from October 2020 to August 2023. We then queried for up-to-date information on those accounts, including their most recent status and whether the account had moved, as of February 2024. Through this process, we discovered a total of `r num_accounts_unfilt` accounts created between August 14, 2020 and January 1, 2024. We then filtered out accounts which were bots (`r num_account_bots` accounts), had been suspended (`r num_account_suspended` accounts), had been marked as moved to another account (`r num_accounts_moved` accounts), had been limited by their local server (`r num_account_limited` accounts), had no statuses (`r num_account_nostatuses` accounts), or had posted their last status on the same day as their account creation (`r num_account_samedaystatus` accounts). This gave us a total of `r num_account_filt` accounts which met all the filtering criteria. Note that because we got updated information on each account, we include only accounts on servers which still existed at the time of our profile queries and which returned records for the account.

**Tags**: Mastodon supports hashtags, which are user-generated metadata tags that can be added to posts. Clicking the link for a tag shows a stream of posts with that tag from the federated timeline, which includes accounts on the same server and posts from accounts followed by accounts on the local server. We collected `r num_tag_posts` statuses posted by `r num_tag_accounts` accounts on `r num_tag_servers` unique servers between May and July 2023 which contained at least one hashtag.

# Analysis and Results

## Survival Model

*Are accounts on suggested general servers less likely to remain active than accounts on other servers?*

```{r, cache.extra = tools::md5sum("codebase/R/survival.R")}
#| cache: true
#| label: fig-survival
#| fig-env: figure
#| fig-cap: "Survival probabilities for accounts created during May 2023."
#| fig-width: 3.375
#| fig-height: 2.5
#| fig-pos: h!

library(here)
source(here("codebase/R/survival.R"))
plot_km
```

```{r}
#| label: table-coxme
library(ehahelper)
library(broom)

cxme_table <- tidy(cxme) %>%
  mutate(conf.low = exp(conf.low), conf.high = exp(conf.high)) %>%
  mutate(term = case_when(
    term == "factor(group)1" ~ "Join Mastodon",
    term == "factor(group)2" ~ "General Servers",
    term == "small_serverTRUE" ~ "Small Server",
    TRUE ~ term
  )) %>%
  mutate(exp.coef = paste("(", round(conf.low, 2), ", ", round(conf.high, 2), ")", sep = "")) %>%
  select(term, estimate, exp.coef, p.value)
```

Using `r text_format(sel_a)` accounts created from May 1 to June 30, 2023, we create a Kaplan–Meier estimator for the probability that an account will remain active, based on whether the account is on one of the largest general instances[^1] featured at the top of the Join Mastodon webpage or on another server in the Join Mastodon list. Accounts are considered active if they made at least one post after the censoring period, `r active_period` days after account creation.

[^1]: `r paste(general_servers, collapse=", ")`
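
For readers unfamiliar with the method, the toy sketch below shows the shape of such an estimate using the Python `lifelines` package; the data frame and its columns are illustrative stand-ins, and our actual estimator is implemented in R in `codebase/R/survival.R`.

```python
# Toy Kaplan-Meier sketch with lifelines; data and column names are
# illustrative, not our study data.
import pandas as pd
from lifelines import KaplanMeierFitter

toy = pd.DataFrame({
    "days_to_last_post": [3, 45, 91, 14, 91, 60, 91, 7],
    "went_inactive":     [1, 1, 0, 1, 0, 1, 0, 1],  # 0 = censored (still active)
    "general_server":    [True, True, False, False, True, False, False, True],
})

kmf = KaplanMeierFitter()
for is_general, grp in toy.groupby("general_server"):
    kmf.fit(grp["days_to_last_post"], event_observed=grp["went_inactive"],
            label=f"general_server={is_general}")
    # Estimated probability of still being active at the end of the window.
    print(kmf.survival_function_.tail(1))
```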

## Moved Accounts

*Do accounts tend to move to larger or smaller servers?*

Mastodon users can move their accounts to another server while retaining their connections (but not their posts) to other Mastodon accounts. This feature, built into the Mastodon software, offers data portability and helps avoid lock-in.

```{r}
#| label: tbl-ergm-table
#| echo: false
#| warning: false
#| message: false
#| error: false
#| tbl-cap: "Exponential family random graph models for account movement between Mastodon servers. Accounts in Model A were created in May 2022 and moved to another account at some later point. Accounts in Model B were created at some earlier point and moved after October 2023."
#| eval: false

library(here)
library(modelsummary)
library(kableExtra)
library(purrr)
library(stringr)
load(file = here("data/scratch/ergm-model-early.rda"))
load(file = here("data/scratch/ergm-model-late.rda"))

if (knitr::is_latex_output()) {
  format <- "latex_tabular"
} else {
  format <- "html"
}

modelsummary(
  list("Coef." = model.early, "Std.Error" = model.early, "Coef." = model.late, "Std.Error" = model.late),
  estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"),
  statistic = NULL,
  gof_omit = ".*",
  coef_rename = c(
    "sum" = "Sum",
    "nonzero" = "Nonzero",
    "diff.sum0.h-t.accounts" = "Smaller server",
    "nodeocov.sum.accounts" = "Server size\n(outgoing)",
    "nodeifactor.sum.registrations.TRUE" = "Open registrations\n(incoming)",
    "nodematch.sum.language" = "Languages match"
  ),
  align = "lrrrr",
  stars = c('*' = .05, '**' = 0.01, '***' = .001),
  output = format # use the output format selected above instead of hard-coded "latex"
) #%>% add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2))
```

To corroborate our findings, we also use data from thousands of accounts which moved between Mastodon servers, taking advantage of the data portability of the platform. Conceiving of these moved accounts as edges within a weighted, directed network in which nodes represent servers, edges represent accounts, and weights represent the number of accounts that moved between servers, we construct an exponential family random graph model (ERGM) with terms for server size, open registrations, and language match between servers. We find that accounts are more likely to move from larger servers to smaller servers.

# Proposed Recommendation System

*How can we build an opt-in, low-resource recommendation system for finding Fediverse servers?*

Based on these findings, we suggest a need for better ways for newcomers to find servers and propose a viable way to create server and tag recommendations on Mastodon. This system could both help newcomers find servers that match their interests and help established accounts discover "neighborhoods" of related servers.

One challenge in building such a system is the decentralized nature of the network: a single, central actor which collects data from servers and then distributes recommendations would be antithetical to the decentralized nature of Mastodon. Instead, we propose a system where servers report their top hashtags, ranked by the number of unique local accounts using each tag during the last three months, as sketched below. Such a system would be opt-in and would require few additional server resources since tags already have their own database table.
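
The report each server would publish amounts to a single grouped query over its own posts. A minimal sketch in Python with polars, assuming an illustrative `posts` frame with one row per (account, tag, date):

```python
# Sketch of the opt-in report: top tags by unique local accounts over a
# three-month window. The frame and its column names are illustrative.
import polars as pl
from datetime import date

posts = pl.DataFrame({
    "acct":       ["u1", "u1", "u2", "u3", "u3"],
    "tag":        ["art", "art", "art", "rust", "art"],
    "created_at": [date(2023, 5, 2), date(2023, 6, 9), date(2023, 6, 1),
                   date(2023, 7, 4), date(2023, 7, 30)],
})

report = (
    posts
    .filter(pl.col("created_at") >= date(2023, 5, 1))
    .filter(pl.col("created_at") < date(2023, 8, 1))
    .unique(["acct", "tag"])           # each account counts once per tag
    .group_by("tag")
    .agg(pl.len().alias("accounts"))   # unique accounts using the tag
    .sort("accounts", descending=True)
    .head(100)                         # publish only the top tags
)
print(report)
```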

## Recommendation System Design

We use Okapi BM25 to construct a term frequency-inverse document frequency (TF-IDF) model to associate the top tags with each server, using counts of tag-account pairs from each server for the term frequency and the number of servers that use each tag for the inverse document frequency. We then L2 normalize the vectors for each tag and calculate the cosine similarity between the tag vectors for each server.

$$
tf = \frac{f_{t,s} \cdot (k_1 + 1)}{f_{t,s} + k_1 \cdot (1 - b + b \cdot \frac{|s|}{avgstl})}
$$

where $f_{t,s}$ is the number of accounts using the tag $t$ on server $s$, $k_1$ and $b$ are tuning parameters, $|s|$ is the total number of account-tag pairs on server $s$, and $avgstl$ is the average of this total across servers. For the inverse document frequency, we use the following formula:

$$
idf = \log \frac{N - n + 0.5}{n + 0.5}
$$

where $N$ is the total number of servers and $n$ is the number of servers where the tag appears as one of the top tags. We then apply L2 normalization:

$$
tfidf = \frac{tf \cdot idf}{\| tf \cdot idf \|_2}
$$
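
Putting the three formulas together, the sketch below computes the weighted matrix with numpy over a toy tag-by-server count matrix; the counts and the $k_1$, $b$ values are illustrative rather than fitted, and the zero-norm guard exists only to keep the toy example safe.

```python
# Sketch of the BM25-weighted TF-IDF matrix from the formulas above.
# Rows are tags, columns are servers; counts and parameters are illustrative.
import numpy as np

f = np.array([[5, 0, 1, 0, 0, 0],   # accounts using each tag on each server
              [1, 3, 0, 0, 0, 0],
              [0, 0, 2, 4, 0, 0],
              [0, 0, 0, 0, 3, 1],
              [0, 0, 0, 0, 0, 2]], dtype=float)
k1, b = 1.2, 0.75

s_len = f.sum(axis=0)                      # |s|: account-tag pairs per server
avgstl = s_len.mean()                      # average |s| across servers
tf = f * (k1 + 1) / (f + k1 * (1 - b + b * s_len / avgstl))

N = f.shape[1]                             # total number of servers
n = (f > 0).sum(axis=1)                    # servers where each tag appears
idf = np.log((N - n + 0.5) / (n + 0.5))    # negative for near-ubiquitous tags

tfidf = tf * idf[:, None]
# Here each server's column is normalized so server-server cosine similarity
# becomes a dot product; tag rows can be normalized analogously.
norms = np.linalg.norm(tfidf, axis=0)
tfidf = tfidf / np.where(norms > 0, norms, 1.0)
```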

## Applications

```{r}
#| eval: false
library(tidyverse)
library(igraph)
library(arrow)

sim_servers <- "data/scratch/server_similarity.feather" %>%
  here::here() %>%
  arrow::read_ipc_file() %>%
  rename("weight" = "Similarity")
#sim_net <- as.network(sim_servers)
g <- graph_from_data_frame(sim_servers, directed = FALSE)

g_strength <- log(sort(strength(g)))
normalized_strength <- (g_strength - min(g_strength)) / (max(g_strength) - min(g_strength))

server_centrality <- enframe(normalized_strength, name = "server", value = "strength")
server_centrality %>% arrow::write_ipc_file("data/scratch/server_centrality.feather")
```

### Server Similarity Neighborhoods

Mastodon provides two feeds in addition to a user's home timeline populated by accounts they follow: a local timeline with all public posts from their local server and a federated timeline which includes all posts from users followed by other users on their server. We suggest a third kind of timeline, a *neighborhood timeline*, which filters the federated timeline by topic.

We calculate the pairwise similarity between two servers with TF-IDF vectors $A$ and $B$ using cosine similarity:

$$
\text{similarity}(A, B) = \frac{A \cdot B}{\|A\| \|B\|}
$$
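
Because the server columns are already L2-normalized in the sketch above, every pairwise cosine similarity reduces to a single matrix product:

```python
# Continuing the numpy sketch: pairwise cosine similarity between servers.
import numpy as np

similarity = tfidf.T @ tfidf               # shape (n_servers, n_servers)

# The "neighborhood" of server 0: other servers, most similar first.
order = np.argsort(similarity[0])[::-1]
neighborhood = [int(j) for j in order if j != 0]
```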

### Server Discovery

Given a set of popular tags and a list of servers, we build a recommendation system where users select tags from a list of popular tags and receive server suggestions. The system first creates a subset of vectors from the TF-IDF matrix representing the top clusters of topics. After a user selects the tags of greatest interest to them, it suggests servers which match their preferences, as in the sketch below.
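
A sketch of the selection step, continuing from the `tfidf` matrix above; the tag names, index mapping, and server names are all illustrative:

```python
# Suggest servers by averaging the weight vectors of the selected tags.
# The tag vocabulary and server names are illustrative placeholders.
import numpy as np

tag_to_index = {"art": 0, "painting": 1, "photography": 2, "linux": 3, "rust": 4}
servers = [f"server{i}.example" for i in range(tfidf.shape[1])]

def suggest_servers(selected_tags: list[str], k: int = 3) -> list[str]:
    rows = [tag_to_index[t] for t in selected_tags]
    scores = tfidf[rows].mean(axis=0)      # mean weight per server
    top = np.argsort(scores)[::-1][:k]     # highest-scoring servers first
    return [servers[i] for i in top]

print(suggest_servers(["art", "photography"]))
```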

### Tag Similarity

We also calculate the similarity between tags using the same method. This can be used to suggest related tags to users based on their interests.
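
Concretely, this only swaps the axis that gets normalized: rows (tags) instead of columns (servers). A sketch, continuing from the same matrix:

```python
# Tag-tag cosine similarity: L2-normalize rows of the weighted matrix.
import numpy as np

row_norms = np.linalg.norm(tfidf, axis=1, keepdims=True)
tag_vecs = tfidf / np.where(row_norms > 0, row_norms, 1.0)
tag_similarity = tag_vecs @ tag_vecs.T     # shape (n_tags, n_tags)
```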

# Discussion

The analysis could be improved by additionally focusing on the factors that lead to accounts remaining active or dropping out, with a particular focus on the actual activity of accounts over time. For instance, do accounts that interact with other users more remain active longer? Are there particular markers of activity that are more predictive of account retention? Future work could use these to provide suggestions for ways to help newcomers during the onboarding process.

The observational nature of the data limits some of the causal claims we can make. It is unclear, for instance, if accounts on general servers are less likely to remain active because of the server itself or because of the type of users who join such servers. For example, it is conceivable that the kind of person who spends more time researching which server to join is more invested in their Mastodon experience than one who simply joins the first server they find.

Future work is necessary to determine how well the recommendation system helps users find servers that match their interests. This may involve user studies and interviews to determine how well the system works in practice.

While the work presented here is based on observed posts on the public timelines, simulations may be helpful in determining the robustness of the system to targeted attacks. Due to the decentralized nature of the system, it is possible that a bad actor could set up zombie accounts on servers to manipulate the recommendation system. Simulations could help determine how well the system can resist such attacks and identify ways to mitigate this risk.

# Conclusion

Based on analysis of trace data from millions of new Fediverse accounts, we find evidence suggesting that servers matter and that users tend to move from larger servers to smaller servers. We then propose a recommendation system that can help new Fediverse users find servers with a high probability of being a good match based on their interests. Based on simulations, we demonstrate that such a tool can be effectively deployed in a federated manner, even with limited data on each local server.

# References {#references}

719
manuscripts/deweb/references.bib
Normal file
@ -0,0 +1,719 @@
@book{abbateInventingInternet2000,
  title = {Inventing the {{Internet}}},
  author = {Abbate, Janet},
  year = {2000},
  series = {Inside Technology},
  edition = {3rd printing},
  publisher = {MIT Press},
  address = {Cambridge, Mass.},
  isbn = {978-0-262-51115-5},
  langid = {english}
}

@misc{AmericansWidelyDistrust2021,
  title = {Americans Widely Distrust {{Facebook}}, {{TikTok}} and {{Instagram}} with Their Data, Poll Finds},
  year = {2021},
  month = dec,
  journal = {Washington Post},
  urldate = {2024-03-09},
  abstract = {Pulled between not trusting some tech companies and still wanting to use their products, people look to government regulation.},
  chapter = {Technology},
  howpublished = {https://www.washingtonpost.com/technology/2021/12/22/tech-trust-survey/},
  langid = {english}
}

@article{baranDistributedCommunicationsNetworks1964,
  title = {On {{Distributed Communications Networks}}},
  author = {Baran, P.},
  year = {1964},
  month = mar,
  journal = {IEEE Transactions on Communications Systems},
  volume = {12},
  number = {1},
  pages = {1--9},
  issn = {1558-2647},
  doi = {10.1109/TCOM.1964.1088883},
  abstract = {This paper briefly reviews the distributed communication network concept in which each station is connected to all adjacent stations rather than to a few switching points, as in a centralized system. The payoff for a distributed configuration in terms of survivability in the cases of enemy attack directed against nodes, links or combinations of nodes and links is demonstrated. A comparison is made between diversity of assignment and perfect switching in distributed networks, and the feasibility of using low-cost unreliable communication links, even links so unreliable as to be unusable in present type networks, to form highly reliable networks is discussed. The requirements for a future all-digital data distributed network which provides common user service for a wide range of users having different requirements is considered. The use of a standard format message block permits building relatively simple switching mechanisms using an adaptive store-and-forward routing policy to handle all forms of digital data including digital voice. This network rapidly responds to changes in the network status. Recent history of measured network traffic is used to modify path selection. Simulation results are shown to indicate that highly efficient routing can be performed by local control without the necessity for any central, and therefore vulnerable, control point.},
  keywords = {Buildings,Centralized control,Communication networks,Communication switching,Communication system control,History,Information systems,Network synthesis,Routing,Telecommunication network reliability}
}

@inproceedings{burkeFeedMeMotivating2009,
  title = {Feed {{Me}}: {{Motivating Newcomer Contribution}} in {{Social Network Sites}}},
  shorttitle = {Feed {{Me}}},
  booktitle = {Proceedings of the {{SIGCHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
  author = {Burke, Moira and Marlow, Cameron and Lento, Thomas},
  year = {2009},
  series = {{{CHI}} '09},
  pages = {945--954},
  publisher = {ACM},
  address = {New York, NY, USA},
  doi = {10.1145/1518701.1518847},
  urldate = {2017-08-02},
  abstract = {Social networking sites (SNS) are only as good as the content their users share. Therefore, designers of SNS seek to improve the overall user experience by encouraging members to contribute more content. However, user motivations for contribution in SNS are not well understood. This is particularly true for newcomers, who may not recognize the value of contribution. Using server log data from approximately 140,000 newcomers in Facebook, we predict long-term sharing based on the experiences the newcomers have in their first two weeks. We test four mechanisms: social learning, singling out, feedback, and distribution. In particular, we find support for social learning: newcomers who see their friends contributing go on to share more content themselves. For newcomers who are initially inclined to contribute, receiving feedback and having a wide audience are also predictors of increased sharing. On the other hand, singling out appears to affect only those newcomers who are not initially inclined to share. The paper concludes with design implications for motivating newcomer sharing in online communities.},
  isbn = {978-1-60558-246-7}
}

@article{bushWeMayThink1945,
  title = {As {{We May Think}}},
  author = {Bush, Vannevar},
  year = {1945},
  month = jul,
  journal = {The Atlantic},
  volume = {176},
  number = {1},
  pages = {101--108},
  urldate = {2020-03-04},
  abstract = {``Consider a future device {\dots}~~in which an individual stores all his books, records, and communications, and which is mechanized so that it may be consulted with exceeding speed and flexibility. It is an enlarged intimate supplement to his memory.''},
  langid = {american}
}

@article{colglazierEffectsGroupSanctions2024,
  title = {The {{Effects}} of {{Group Sanctions}} on {{Participation}} and {{Toxicity}}: {{Quasi-experimental Evidence}} from the {{Fediverse}}},
  shorttitle = {The {{Effects}} of {{Group Sanctions}} on {{Participation}} and {{Toxicity}}},
  author = {Colglazier, Carl and TeBlunthuis, Nathan and Shaw, Aaron},
  year = {2024},
  month = may,
  journal = {Proceedings of the International AAAI Conference on Web and Social Media},
  volume = {18},
  pages = {315--328},
  issn = {2334-0770},
  doi = {10.1609/icwsm.v18i1.31316},
  urldate = {2024-06-02},
  abstract = {Online communities often overlap and coexist, despite incongruent norms and approaches to content moderation. When communities diverge, decentralized and federated communities may pursue group-level sanctions, including defederation (disconnection) to block communication between members of specific communities. We investigate the effects of defederation in the context of the Fediverse, a set of decentralized, interconnected social networks with independent governance. Mastodon and Pleroma, the most popular software powering the Fediverse, allow administrators on one server to defederate from another. We use a difference-in-differences approach and matched controls to estimate the effects of defederation events on participation and message toxicity among affected members of the blocked and blocking servers. We find that defederation causes a drop in activity for accounts on the blocked servers, but not on the blocking servers. Also, we find no evidence of an effect of defederation on message toxicity.},
  copyright = {Copyright (c) 2024 Association for the Advancement of Artificial Intelligence},
  langid = {english}
}

@inproceedings{cosleySuggestBotUsingIntelligent2007,
  title = {{{SuggestBot}}: {{Using Intelligent Task Routing}} to {{Help People Find Work}} in {{Wikipedia}}},
  shorttitle = {{{SuggestBot}}},
  booktitle = {Proceedings of the 12th {{International Conference}} on {{Intelligent User Interfaces}}},
  author = {Cosley, Dan and Frankowski, Dan and Terveen, Loren and Riedl, John},
  year = {2007},
  series = {{{IUI}} '07},
  pages = {32--41},
  publisher = {ACM},
  address = {New York, NY, USA},
  doi = {10.1145/1216295.1216309},
  urldate = {2016-05-23},
  abstract = {Member-maintained communities ask their users to perform tasks the community needs. From Slashdot, to IMDb, to Wikipedia, groups with diverse interests create community-maintained artifacts of lasting value (CALV) that support the group's main purpose and provide value to others. Said communities don't help members find work to do, or do so without regard to individual preferences, such as Slashdot assigning meta-moderation randomly. Yet social science theory suggests that reducing the cost and increasing the personal value of contribution would motivate members to participate more.We present SuggestBot, software that performs intelligent task routing (matching people with tasks) in Wikipedia. SuggestBot uses broadly applicable strategies of text analysis, collaborative filtering, and hyperlink following to recommend tasks. SuggestBot's intelligent task routing increases the number of edits by roughly four times compared to suggesting random articles. Our contributions are: 1) demonstrating the value of intelligent task routing in a real deployment; 2) showing how to do intelligent task routing; and 3) sharing our experience of deploying a tool in Wikipedia, which offered both challenges and opportunities for research.},
  isbn = {978-1-59593-481-9}
}

@misc{diazUsingMastodonWay2022,
  title = {Using {{Mastodon}} Is Way Too Complicated to Ever Topple {{Twitter}}},
  author = {Diaz, Jesus},
  year = {2022},
  month = nov,
  journal = {Fast Company},
  urldate = {2024-02-22},
  abstract = {Great idea in theory, a total pain in practice.},
  howpublished = {https://www.fastcompany.com/90808984/using-mastodon-is-way-too-complicated-to-ever-topple-twitter},
  langid = {english}
}

@book{driscollModemWorldPrehistory2022,
  title = {The Modem World: {{A}} Prehistory of Social Media},
  shorttitle = {The Modem World},
  author = {Driscoll, Kevin},
  year = {2022},
  month = apr,
  publisher = {Yale University Press},
  abstract = {The untold story about how the internet became social, and why this matters for its future``Whether you're reading this for a nostalgic romp or to understand the dawn of the internet, The Modem World will delight you with tales of BBS culture and shed light on how the decisions of the past shape our current networked world.''---danah boyd, author of It's Complicated: The Social Lives of Networked TeensFifteen years before the commercialization of the internet, millions of amateurs across North America created more than 100,000 small-scale computer networks. The people who built and maintained these dial-up bulletin board systems (BBSs) in the 1980s laid the groundwork for millions of others who would bring their lives online in the 1990s and beyond. From ham radio operators to HIV/AIDS activists, these modem enthusiasts developed novel forms of community moderation, governance, and commercialization. The Modem World tells an alternative origin story for social media, centered not in the office parks of Silicon Valley or the meeting rooms of military contractors, but rather on the online communities of hobbyists, activists, and entrepreneurs. Over time, countless social media platforms have appropriated the social and technical innovations of the BBS community. How can these untold stories from the internet's past inspire more inclusive visions of its future?},
  isbn = {978-0-300-26512-5},
  langid = {english},
  keywords = {Computers / History,Computers / Internet / General,History / Modern / 20th Century / General}
}

@misc{driscollWeMisrememberEternal2023,
  title = {Do We Misremember {{Eternal September}}?},
  shorttitle = {Do We Misremember {{Eternal September}}?},
  author = {Driscoll, Kevin},
  year = {2023},
  month = apr,
  journal = {FLOW},
  urldate = {2024-02-22},
  langid = {american}
}

@article{ekstrandCollaborativeFilteringRecommender2011,
  title = {Collaborative {{Filtering Recommender Systems}}},
  author = {Ekstrand, Michael D. and Riedl, John T. and Konstan, Joseph A.},
  year = {2011},
  month = may,
  journal = {Foundations and Trends{\textregistered} in Human--Computer Interaction},
  volume = {4},
  number = {2},
  pages = {81--173},
  publisher = {Now Publishers, Inc.},
  issn = {1551-3955, 1551-3963},
  doi = {10.1561/1100000009},
  urldate = {2024-05-21},
  abstract = {Collaborative Filtering Recommender Systems},
  langid = {english}
}

@article{fieslerMovingLandsOnline2020,
  title = {Moving across Lands: Online Platform Migration in Fandom Communities},
  shorttitle = {Moving across Lands},
  author = {Fiesler, Casey and Dym, Brianna},
  year = {2020},
  month = may,
  journal = {Proc. ACM Hum.-Comput. Interact},
  volume = {4},
  number = {CSCW1},
  pages = {042:1--042:25},
  doi = {10.1145/3392847},
  urldate = {2020-06-27},
  abstract = {When online platforms rise and fall, sometimes communities fade away, and sometimes they pack their bags and relocate to a new home. To explore the causes and effects of online community migration, we examine transformative fandom, a longstanding, technology-agnostic community surrounding the creation, sharing, and discussion of creative works based on existing media. For over three decades, community members have left and joined many different online spaces, from Usenet to Tumblr to platforms of their own design. Through analysis of 28 in-depth interviews and 1,886 survey responses from fandom participants, we traced these migrations, the reasons behind them, and their impact on the community. Our findings highlight catalysts for migration that provide insights into factors that contribute to success and failure of platforms, including issues surrounding policy, design, and community. Further insights into the disruptive consequences of migrations (such as social fragmentation and lost content) suggest ways that platforms might both support commitment and better support migration when it occurs.}
}

@article{freelonComputationalResearchPostAPI2018,
  title = {Computational {{Research}} in the {{Post-API Age}}},
  author = {Freelon, Deen},
  year = {2018},
  month = oct,
  journal = {Political Communication},
  volume = {35},
  number = {4},
  pages = {665--668},
  publisher = {Routledge},
  issn = {1058-4609},
  doi = {10.1080/10584609.2018.1477506},
  urldate = {2022-04-21},
  keywords = {API,computational,Facebook,social media,Twitter}
}

@article{gehlDigitalCovenantNoncentralized2023,
  title = {The Digital Covenant: Non-Centralized Platform Governance on the Mastodon Social Network},
  shorttitle = {The Digital Covenant},
  author = {Gehl, Robert W. and Zulli, Diana},
  year = {2023},
  month = dec,
  journal = {Information, Communication \& Society},
  volume = {26},
  number = {16},
  pages = {3275--3291},
  publisher = {Routledge},
  issn = {1369-118X},
  doi = {10.1080/1369118X.2022.2147400},
  urldate = {2024-05-31},
  keywords = {Alternative social media,federalist political theory,mastodon,platform governance,social media}
}

@article{gillespieContentModerationAI2020,
  title = {Content Moderation, {{AI}}, and the Question of Scale},
  author = {Gillespie, Tarleton},
  year = {2020},
  month = jul,
  journal = {Big Data \& Society},
  volume = {7},
  number = {2},
  pages = {2053951720943234},
  publisher = {SAGE Publications Ltd},
  issn = {2053-9517},
  doi = {10.1177/2053951720943234},
  urldate = {2021-09-28},
  abstract = {AI seems like the perfect response to the growing challenges of content moderation on social media platforms: the immense scale of the data, the relentlessness of the violations, and the need for human judgments without wanting humans to have to make them. The push toward automated content moderation is often justified as a necessary response to the scale: the enormity of social media platforms like Facebook and YouTube stands as the reason why AI approaches are desirable, even inevitable. But even if we could effectively automate content moderation, it is not clear that we should.},
  langid = {english},
  keywords = {Artificial intelligence,bias,content moderation,digital platform,platforms,scale,social media}
}

@inproceedings{heFlockingMastodonTracking2023,
  title = {Flocking to {{Mastodon}}: {{Tracking}} the {{Great Twitter Migration}}},
  shorttitle = {Flocking to {{Mastodon}}},
  booktitle = {Proceedings of the 2023 {{ACM}} on {{Internet Measurement Conference}}},
  author = {He, Jiahui and Zia, Haris Bin and Castro, Ignacio and Raman, Aravindh and Sastry, Nishanth and Tyson, Gareth},
  year = {2023},
  month = oct,
  series = {{{IMC}} '23},
  pages = {111--123},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/3618257.3624819},
  urldate = {2024-02-22},
  abstract = {The acquisition of Twitter by Elon Musk has spurred controversy and uncertainty among Twitter users. The move raised both praise and concerns, particularly regarding Musk's views on free speech. As a result, a large number of Twitter users have looked for alternatives to Twitter. Mastodon, a decentralized micro-blogging social network, has attracted the attention of many users and the general media. In this paper, we analyze the migration of 136,009 users from Twitter to Mastodon. We inspect the impact that this has on the wider Mastodon ecosystem, particularly in terms of user-driven pressure towards centralization. We further explore factors that influence users to migrate, highlighting the effect of users' social networks. Finally, we inspect the behavior of individual users, showing how they utilize both Twitter and Mastodon in parallel. We find a clear difference in the topics discussed on the two platforms. This leads us to build classifiers to explore if migration is predictable. Through feature analysis, we find that the content of tweets as well as the number of URLs, the number of likes, and the length of tweets are effective metrics for the prediction of user migration.},
  isbn = {9798400703829},
  keywords = {machine learning,mastodon,topic modeling,twitter,user migration}
}

@article{herlockerEvaluatingCollaborativeFiltering2004,
  title = {Evaluating Collaborative Filtering Recommender Systems},
  author = {Herlocker, Jonathan L. and Konstan, Joseph A. and Terveen, Loren G. and Riedl, John T.},
  year = {2004},
  month = jan,
  journal = {ACM Transactions on Information Systems},
  volume = {22},
  number = {1},
  pages = {5--53},
  issn = {1046-8188},
  doi = {10.1145/963770.963772},
  urldate = {2020-08-06},
  abstract = {Recommender systems have been evaluated in many, often incomparable, ways. In this article, we review the key decisions in evaluating collaborative filtering recommender systems: the user tasks being evaluated, the types of analysis and datasets being used, the ways in which prediction quality is measured, the evaluation of prediction attributes other than quality, and the user-based evaluation of the system as a whole. In addition to reviewing the evaluation strategies used by prior researchers, we present empirical results from the analysis of various accuracy metrics on one content domain where all the tested metrics collapsed roughly into three equivalence classes. Metrics within each equivalency class were strongly correlated, while metrics from different equivalency classes were uncorrelated.},
  keywords = {Collaborative filtering,evaluation,metrics,recommender systems}
}

@inproceedings{herlockerExplainingCollaborativeFiltering2000,
  title = {Explaining Collaborative Filtering Recommendations},
  booktitle = {Proceedings of the 2000 {{ACM}} Conference on {{Computer}} Supported Cooperative Work},
  author = {Herlocker, Jonathan L. and Konstan, Joseph A. and Riedl, John},
  year = {2000},
  month = dec,
  series = {{{CSCW}} '00},
  pages = {241--250},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/358916.358995},
  urldate = {2020-08-05},
  abstract = {Automated collaborative filtering (ACF) systems predict a person's affinity for items or information by connecting that person's recorded interests with the recorded interests of a community of people and sharing ratings between like-minded persons. However, current recommender systems are black boxes, providing no transparency into the working of the recommendation. Explanations provide that transparency, exposing the reasoning and data behind a recommendation. In this paper, we address explanation interfaces for ACF systems - how they should be implemented and why they should be implemented. To explore how, we present a model for explanations based on the user's conceptual model of the recommendation process. We then present experimental results demonstrating what components of an explanation are the most compelling. To address why, we present experimental evidence that shows that providing explanations can improve the acceptance of ACF systems. We also describe some initial explorations into measuring how explanations can improve the filtering performance of users.},
  isbn = {978-1-58113-222-9},
  keywords = {collaborative filtering,explanations,GroupLens,MoviesLens,recommender systems}
}

@article{hooverMastodonBumpNow2023,
  title = {The {{Mastodon Bump Is Now}} a {{Slump}}},
  author = {Hoover, Amanda},
  year = {2023},
  month = feb,
  journal = {Wired},
  issn = {1059-1028},
  urldate = {2023-10-21},
  abstract = {Active users have fallen by more than 1 million since the exodus from Elon Musk's Twitter, suggesting the decentralized platform is not a direct replacement.},
  chapter = {tags},
  langid = {american},
  keywords = {communities,content moderation,elon musk,mastodon,platforms,social,social media,twitter}
}

@inproceedings{kieneSurvivingEternalSeptember2016,
  title = {Surviving an ``{{Eternal September}}'': {{How}} an Online Community Managed a Surge of Newcomers},
  shorttitle = {Surviving an "{{Eternal September}}"},
  booktitle = {Proceedings of the 2016 {{CHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
  author = {Kiene, Charles and {Monroy-Hern{\'a}ndez}, Andr{\'e}s and Hill, Benjamin Mako},
  year = {2016},
  pages = {1152--1156},
  publisher = {ACM},
  address = {New York, NY},
  doi = {10.1145/2858036.2858356},
  urldate = {2016-07-05},
  abstract = {We present a qualitative analysis of interviews with participants in the NoSleep community within Reddit where millions of fans and writers of horror fiction congregate. We explore how the community handled a massive, sudden, and sustained increase in new members. Although existing theory and stories like Usenet's infamous "Eternal September" suggest that large influxes of newcomers can hurt online communities, our interviews suggest that NoSleep survived without major incident. We propose that three features of NoSleep allowed it to manage the rapid influx of newcomers gracefully: (1) an active and well-coordinated group of administrators, (2) a shared sense of community which facilitated community moderation, and (3) technological systems that mitigated norm violations. We also point to several important trade-offs and limitations.},
  isbn = {978-1-4503-3362-7},
  keywords = {newcomers,norms and governance,online communities,peer production,qualitative methods}
}

@misc{kingMastodonMe2024,
  title = {Mastodon {{Near Me}}},
  author = {King, Jaz-Michael},
  year = {2024},
  journal = {jaz-michael king},
  urldate = {2024-03-04},
  abstract = {A map and data directory showcasing ActivityPub service providers, each specifically catering to a certain locality or offering support in a notable language.},
  langid = {english}
}

@incollection{korenAdvancesCollaborativeFiltering2022,
  title = {Advances in {{Collaborative Filtering}}},
  booktitle = {Recommender Systems Handbook},
  author = {Koren, Yehuda and Rendle, Steffen and Bell, Robert},
  editor = {Ricci, Francesco and Ro{\d k}a{\d h}, Liʾor and Shapira, Bracha},
  year = {2022},
  edition = {Third edition},
  pages = {91--142},
  publisher = {Springer},
  address = {New York, NY},
  isbn = {978-1-07-162196-7 978-1-07-162199-8},
  langid = {english}
}

@misc{krasnoffMastodon101How2022,
  title = {Mastodon 101: How to Follow (and Unfollow) Other Accounts},
  shorttitle = {Mastodon 101},
  author = {Krasnoff, Barbara},
  year = {2022},
  month = dec,
  journal = {The Verge},
  urldate = {2024-03-04},
  abstract = {How to get started in Mastodon by following other people},
  howpublished = {https://www.theverge.com/23519279/mastodon-instance-follow-friend},
  langid = {english}
}

@book{krautBuildingSuccessfulOnline2011,
  ids = {kraut_building_2011,kraut_building_2011-1,kraut_building_2011-3},
  title = {Building {{Successful Online Communities}}: {{Evidence-Based Social Design}}},
  shorttitle = {Building {{Successful Online Communities}}},
  author = {Kraut, Robert E. and Resnick, Paul and Kiesler, Sara},
  year = {2011},
  publisher = {MIT Press},
  address = {Cambridge, Mass},
  isbn = {978-0-262-01657-5},
  lccn = {HM742 .K73 2011},
  keywords = {Computer networks,internet,Online social networks,Planning,Social aspects,Social aspects Planning,Social psychology}
}

@article{lacavaDriversSocialInfluence2023,
  title = {Drivers of Social Influence in the {{Twitter}} Migration to {{Mastodon}}},
  author = {La Cava, Lucio and Aiello, Luca Maria and Tagarelli, Andrea},
  year = {2023},
  month = dec,
  journal = {Scientific Reports},
  volume = {13},
  number = {1},
  pages = {21626},
  issn = {2045-2322},
  doi = {10.1038/s41598-023-48200-7},
  urldate = {2024-02-02},
  abstract = {The migration of Twitter users to Mastodon following Elon Musk's acquisition presents a unique opportunity to study collective behavior and gain insights into the drivers of coordinated behavior in online media. We analyzed the social network and the public conversations of about 75,000 migrated users and observed that the temporal trace of their migrations is compatible with a phenomenon of social influence, as described by a compartmental epidemic model of information diffusion. Drawing from prior research on behavioral change, we delved into the factors that account for variations of the effectiveness of the influence process across different Twitter communities. Communities in which the influence process unfolded more rapidly exhibit lower density of social connections, higher levels of signaled commitment to migrating, and more emphasis on shared identity and exchange of factual knowledge in the community discussion. These factors account collectively for 57\% of the variance in the observed data. Our results highlight the joint importance of network structure, commitment, and psycho-linguistic aspects of social interactions in characterizing grassroots collective action, and contribute to deepen our understanding of the mechanisms that drive processes of behavior change of online groups.},
  langid = {english}
}

@inproceedings{lamAddressingColdstartProblem2008,
  title = {Addressing Cold-Start Problem in Recommendation Systems},
  booktitle = {Proceedings of the 2nd International Conference on {{Ubiquitous}} Information Management and Communication},
  author = {Lam, Xuan Nhat and Vu, Thuc and Le, Trong Duc and Duong, Anh Duc},
  year = {2008},
  month = jan,
  series = {{{ICUIMC}} '08},
  pages = {208--211},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/1352793.1352837},
  urldate = {2024-05-21},
  abstract = {Recommender systems for automatically suggested items of interest to users have become increasingly essential in fields where mass personalization is highly valued. The popular core techniques of such systems are collaborative filtering, content-based filtering and combinations of these. In this paper, we discuss hybrid approaches, using collaborative and also content data to address cold-start - that is, giving recommendations to novel users who have no preference on any items, or recommending items that no user of the community has seen yet. While there have been lots of studies on solving the item-side problems, solution for user-side problems has not been seen public. So we develop a hybrid model based on the analysis of two probabilistic aspect models using pure collaborative filtering to combine with users' information. The experiments with MovieLen data indicate substantial and consistent improvements of this model in overcoming the cold-start user-side problem.},
  isbn = {978-1-59593-993-7},
  keywords = {aspect model,cold-start,collaborative filtering,information filtering,three-way aspect model,triadic aspect model}
}

@techreport{masnickProtocolsNotPlatforms2019,
  title = {Protocols, {{Not Platforms}}: {{A Technological Approach}} to {{Free Speech}}},
  shorttitle = {Protocols, {{Not Platforms}}},
  author = {Masnick, Mike},
  year = {2019},
  month = aug,
  institution = {Knight First Amendment Institute},
  urldate = {2022-04-21},
  langid = {english}
}

@misc{mastodonggmbhServers,
  title = {Servers},
  author = {{Mastodon gGmbH}},
  journal = {Join Mastodon},
  urldate = {2024-03-04},
  abstract = {Find where to sign up for the decentralized social network Mastodon.},
  howpublished = {https://joinmastodon.org/servers},
  langid = {english}
}

@article{newellUserMigrationOnline2021,
  title = {User {{Migration}} in {{Online Social Networks}}: {{A Case Study}} on {{Reddit During}} a {{Period}} of {{Community Unrest}}},
  author = {Newell, Edward and Jurgens, David and Saleem, Haji Mohammad and Vala, Hardik and Sassine, Jad and Armstrong, Caitrin and Ruths, Derek},
  year = {2021},
  month = aug,
  journal = {Proceedings of the International AAAI Conference on Web and Social Media},
  pages = {279--288},
  doi = {10.1609/icwsm.v10i1.14750},
  abstract = {Platforms like Reddit have attracted large and vibrant communities, but the individuals in those communities are free to migrate to other platforms at any time. History has borne this out with the mass migration from Slashdot to Digg. The underlying motivations of individuals who migrate between platforms, and the conditions that favor migration online are not well-understood. We examine Reddit during a period of community unrest affecting millions of users in the summer of 2015, and analyze large-scale changes in user behavior and migration patterns to Reddit-like alternative platforms. Using self-reported statements from user comments, surveys, and a computational analysis of the activity of users with accounts on multiple platforms, we identify the primary motivations driving user migration. While a notable number of Reddit users left for other platforms, we found that an important pull factor that enabled Reddit to retain users was its long tail of niche content. Other platforms may reach critical mass to support popular or ``mainstream'' topics, but Reddit's large userbase provides a key advantage in supporting niche topics.},
  langid = {english}
}

@inproceedings{nicholsonMastodonRulesCharacterizing2023,
  title = {Mastodon {{Rules}}: {{Characterizing Formal Rules}} on {{Popular Mastodon Instances}}},
  shorttitle = {Mastodon {{Rules}}},
  booktitle = {Companion {{Publication}} of the 2023 {{Conference}} on {{Computer Supported Cooperative Work}} and {{Social Computing}}},
  author = {Nicholson, Matthew N. and Keegan, Brian C and Fiesler, Casey},
  year = {2023},
  month = oct,
  series = {{{CSCW}} '23 {{Companion}}},
  pages = {86--90},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/3584931.3606970},
  urldate = {2024-01-16},
  abstract = {Federated social networking is an increasingly popular alternative to more traditional, centralized forms. Yet, this federated arrangement can lead to dramatically different experiences across the network. Using a sample of the most popular instances on the federated social network Mastodon, we characterize the types of rules present in this emerging space. We then compare these rules to those on Reddit, as an example of a different, less centralized space. Rules on Mastodon often pay particular attention to issues of harassment and hate --- strongly reflecting the spirit of the Mastodon Covenant. We speculate that these rules may have emerged in response to problems of other platforms, and reflect a lack of support for instance maintainers. With this work, we call for the development of additional instance-level governance and technical scaffolding, and raise questions for future work into the development, values, and value tensions present in the broader federated social networking landscape.},
  isbn = {9798400701290},
  keywords = {community rules,Mastodon,online communities}
}

@article{paterekImprovingRegularizedSingular2007,
  title = {Improving Regularized Singular Value Decomposition for Collaborative Filtering},
  author = {Paterek, Arkadiusz},
  year = {2007},
  month = aug,
  journal = {Proceedings of KDD cup and workshop},
  abstract = {A key part of a recommender system is a collaborative filtering algorithm predicting users' preferences for items. In this paper we describe different efficient collaborative filtering techniques and a framework for combining them to obtain a good prediction.},
  langid = {english}
}

@inproceedings{ramanChallengesDecentralisedWeb2019,
  title = {Challenges in the {{Decentralised Web}}: {{The Mastodon Case}}},
  shorttitle = {Challenges in the {{Decentralised Web}}},
  booktitle = {Proceedings of the {{Internet Measurement Conference}}},
  author = {Raman, Aravindh and Joglekar, Sagar and Cristofaro, Emiliano De and Sastry, Nishanth and Tyson, Gareth},
  year = {2019},
  month = oct,
  series = {{{IMC}} '19},
  pages = {217--229},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  doi = {10.1145/3355369.3355572},
  urldate = {2024-03-06},
  abstract = {The Decentralised Web (DW) has recently seen a renewed momentum, with a number of DW platforms like Mastodon, PeerTube, and Hubzilla gaining increasing traction. These offer alternatives to traditional social networks like Twitter, YouTube, and Facebook, by enabling the operation of web infrastructure and services without centralised ownership or control. Although their services differ greatly, modern DW platforms mostly rely on two key innovations: first, their open source software allows anybody to setup independent servers ("instances") that people can sign-up to and use within a local community; and second, they build on top of federation protocols so that instances can mesh together, in a peer-to-peer fashion, to offer a globally integrated platform. In this paper, we present a measurement-driven exploration of these two innovations, using a popular DW microblogging platform (Mastodon) as a case study. We focus on identifying key challenges that might disrupt continuing efforts to decentralise the web, and empirically highlight a number of properties that are creating natural pressures towards re-centralisation. Finally, our measurements shed light on the behaviour of both administrators (i.e., people setting up instances) and regular users who sign-up to the platforms, also discussing a few techniques that may address some of the issues observed.},
  isbn = {978-1-4503-6948-0}
}
|
||||
|
||||
@inproceedings{resnickGrouplensOpenArchitecture1994,
|
||||
title = {Grouplens: An Open Architecture for Collaborative Filtering of Netnews},
|
||||
shorttitle = {Grouplens},
|
||||
booktitle = {Proceedings of the 1994 {{ACM Conference}} on {{Computer Supported Cooperative Work}}},
|
||||
author = {Resnick, Paul and Iacovou, Neophytos and Suchak, Mitesh and Bergstrom, Peter and Riedl, John},
|
||||
year = {1994},
|
||||
series = {{{CSCW}} '94},
|
||||
pages = {175--186},
|
||||
publisher = {ACM},
|
||||
address = {New York, NY, USA},
|
||||
doi = {10.1145/192844.192905},
|
||||
urldate = {2016-07-19},
|
||||
abstract = {Collaborative filters help people make choices based on the opinions of other people. GroupLens is a system for collaborative filtering of netnews, to help people find articles they will like in the huge stream of available articles. News reader clients display predicted scores and make it easy for users to rate articles after they read them. Rating servers, called Better Bit Bureaus, gather and disseminate the ratings. The rating servers predict scores based on the heuristic that people who agreed in the past will probably agree again. Users can protect their privacy by entering ratings under pseudonyms, without reducing the effectiveness of the score prediction. The entire architecture is open: alternative software for news clients and Better Bit Bureaus can be developed independently and can interoperate with the components we have developed.},
isbn = {978-0-89791-689-9}
}

@book{ricciRecommenderSystemsHandbook2022,
title = {Recommender Systems Handbook},
editor = {Ricci, Francesco and Rokach, Lior and Shapira, Bracha},
year = {2022},
edition = {Third edition},
publisher = {Springer},
address = {New York, NY},
abstract = {This third edition handbook describes in detail the classical methods as well as extensions and novel approaches that were more recently introduced within this field. It consists of five parts: general recommendation techniques, special recommendation techniques, value and impact of recommender systems, human computer interaction, and applications. The first part presents the most popular and fundamental techniques currently used for building recommender systems, such as collaborative filtering, semantic-based methods, recommender systems based on implicit feedback, neural networks and context-aware methods. The second part of this handbook introduces more advanced recommendation techniques, such as session-based recommender systems, adversarial machine learning for recommender systems, group recommendation techniques, reciprocal recommenders systems, natural language techniques for recommender systems and cross-domain approaches to recommender systems. The third part covers a wide perspective to the evaluation of recommender systems with papers on methods for evaluating recommender systems, their value and impact, the multi-stakeholder perspective of recommender systems, the analysis of the fairness, novelty and diversity in recommender systems. The fourth part contains a few chapters on the human computer dimension of recommender systems, with research on the role of explanation, the user personality and how to effectively support individual and group decision with recommender systems. The last part focusses on application in several important areas, such as, food, music, fashion and multimedia recommendation. This informative third edition handbook provides a comprehensive, yet concise and convenient reference source to recommender systems for researchers and advanced-level students focused on computer science and data science. Professionals working in data analytics that are using recommendation and personalization techniques will also find this handbook a useful tool},
isbn = {978-1-07-162196-7 978-1-07-162199-8},
langid = {english}
}

@article{robertsonProbabilisticRelevanceFramework2009,
title = {The {{Probabilistic Relevance Framework}}: {{BM25}} and {{Beyond}}},
shorttitle = {The {{Probabilistic Relevance Framework}}},
author = {Robertson, Stephen and Zaragoza, Hugo},
year = {2009},
journal = {Foundations and Trends{\textregistered} in Information Retrieval},
volume = {3},
number = {4},
pages = {333--389},
issn = {1554-0669, 1554-0677},
doi = {10.1561/1500000019},
urldate = {2024-05-20},
abstract = {The Probabilistic Relevance Framework (PRF) is a formal framework for document retrieval, grounded in work done in the 1970--1980s, which led to the development of one of the most successful text-retrieval algorithms, BM25. In recent years, research in the PRF has yielded new retrieval models capable of taking into account document meta-data (especially structure and link-graph information). Again, this has led to one of the most successful Web-search and corporate-search algorithms, BM25F. This work presents the PRF from a conceptual point of view, describing the probabilistic modelling assumptions behind the framework and the different ranking algorithms that result from its application: the binary independence model, relevance feedback models, BM25 and BM25F. It also discusses the relation between the PRF and other statistical models for IR, and covers some related topics, such as the use of non-textual features, and parameter optimisation for models with free parameters.},
langid = {english}
}

@misc{rochkoMastodon2023,
title = {Mastodon 4.2},
author = {Rochko, Eugen},
year = {2023},
month = sep,
journal = {Mastodon Blog},
urldate = {2024-03-06},
abstract = {In this massive update we've added search and removed friction. What's not to love?},
howpublished = {https://blog.joinmastodon.org/2023/09/mastodon-4.2/}
}

@misc{rochkoNewOnboardingExperience2023,
title = {A New Onboarding Experience on {{Mastodon}}},
author = {Rochko, Eugen},
year = {2023},
month = may,
journal = {Mastodon Blog},
urldate = {2024-03-04},
abstract = {Today we're making signing up on Mastodon easier than ever before. We understand that deciding which Mastodon service provider to kick off your experience with can be confusing. We know this is a completely new concept for many people, since traditionally the platform and the service provider are one and the same. This choice is what makes Mastodon different from existing social networks, but it also presents a unique onboarding challenge.},
howpublished = {https://blog.joinmastodon.org/2023/05/a-new-onboarding-experience-on-mastodon/}
}

@misc{rothItGettingEasier2023,
title = {It's Getting Easier to Make an Account on {{Mastodon}}},
author = {Roth, Emma},
year = {2023},
month = may,
journal = {The Verge},
urldate = {2024-03-04},
abstract = {The network lets you sign up for mastodon.social from the start.},
howpublished = {https://www.theverge.com/2023/5/1/23707019/mastodon-account-creation-twitter-alternative},
langid = {english}
}

@misc{rousseauMastodonInstances2017,
title = {Mastodon Instances},
author = {Rousseau, Amaury},
year = {2017},
journal = {instances.social},
urldate = {2024-03-04},
howpublished = {https://instances.social/}
}

@book{saltonIntroductionModernInformation1987,
title = {Introduction to Modern Information Retrieval},
author = {Salton, Gerard and McGill, Michael J.},
year = {1987},
series = {{{McGraw-Hill}} International Editions},
edition = {3rd printing},
publisher = {McGraw-Hill},
address = {New York},
isbn = {978-0-07-054484-0},
langid = {english}
}

@inproceedings{sarwarItembasedCollaborativeFiltering2001,
title = {Item-Based Collaborative Filtering Recommendation Algorithms},
booktitle = {Proceedings of the 10th International Conference on {{World Wide Web}}},
author = {Sarwar, Badrul and Karypis, George and Konstan, Joseph and Riedl, John},
year = {2001},
month = apr,
series = {{{WWW}} '01},
pages = {285--295},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
doi = {10.1145/371920.372071},
urldate = {2024-05-07},
isbn = {978-1-58113-348-6}
}

@incollection{schaferCollaborativeFilteringRecommender2007,
title = {Collaborative Filtering Recommender Systems},
booktitle = {The {{Adaptive Web}}: {{Methods}} and {{Strategies}} of {{Web Personalization}}},
author = {Schafer, J. Ben and Frankowski, Dan and Herlocker, Jon and Sen, Shilad},
editor = {Brusilovsky, Peter and Kobsa, Alfred and Nejdl, Wolfgang},
year = {2007},
series = {Lecture {{Notes}} in {{Computer Science}}},
pages = {291--324},
publisher = {Springer},
address = {Berlin, Heidelberg},
doi = {10.1007/978-3-540-72079-9_9},
urldate = {2020-08-06},
abstract = {One of the potent personalization technologies powering the adaptive web is collaborative filtering. Collaborative filtering (CF) is the process of filtering or evaluating items through the opinions of other people. CF technology brings together the opinions of large interconnected communities on the web, supporting filtering of substantial quantities of data. In this chapter we introduce the core concepts of collaborative filtering, its primary uses for users of the adaptive web, the theory and practice of CF algorithms, and design decisions regarding rating systems and acquisition of ratings. We also discuss how to evaluate CF systems, and the evolution of rich interaction interfaces. We close the chapter with discussions of the challenges of privacy particular to a CF recommendation service and important open research questions in the field.},
isbn = {978-3-540-72079-9},
langid = {english},
keywords = {Association Rule Mining,Collaborative Filter,Explicit Rating,News Article,Recommender System}
}

@misc{silberlingBeginnerGuideMastodon2023,
title = {A Beginner's Guide to {{Mastodon}}, the Open Source {{Twitter}} Alternative},
author = {Silberling, Amanda},
year = {2023},
month = jul,
journal = {TechCrunch},
urldate = {2024-03-04},
howpublished = {https://techcrunch.com/2023/07/24/what-is-mastodon/}
}

@article{suSurveyCollaborativeFiltering2009,
title = {A {{Survey}} of {{Collaborative Filtering Techniques}}},
author = {Su, Xiaoyuan and Khoshgoftaar, Taghi M.},
year = {2009},
month = oct,
journal = {Advances in Artificial Intelligence},
volume = {2009},
pages = {e421425},
publisher = {Hindawi},
issn = {1687-7470},
doi = {10.1155/2009/421425},
urldate = {2024-05-09},
abstract = {As one of the most successful approaches to building recommender systems, collaborative filtering (CF) uses the known preferences of a group of users to make recommendations or predictions of the unknown preferences for other users. In this paper, we first introduce CF tasks and their main challenges, such as data sparsity, scalability, synonymy, gray sheep, shilling attacks, privacy protection, etc., and their possible solutions. We then present three main categories of CF techniques: memory-based, model-based, and hybrid CF algorithms (that combine CF with other recommendation techniques), with examples for representative algorithms of each category, and analysis of their predictive performance and their ability to address the challenges. From basic techniques to the state-of-the-art, we attempt to present a comprehensive survey for CF techniques, which can be served as a roadmap for research and practice in this area.},
langid = {english}
}

@inproceedings{teblunthuisIdentifyingCompetitionMutualism2022,
title = {Identifying Competition and Mutualism between Online Groups},
booktitle = {International {{AAAI Conference}} on {{Web}} and {{Social Media}} ({{ICWSM}} 2022)},
author = {TeBlunthuis, Nathan and Hill, Benjamin Mako},
year = {2022},
month = jun,
volume = {16},
pages = {993--1004},
publisher = {AAAI},
address = {Atlanta, Georgia, USA},
urldate = {2021-07-16},
abstract = {Platforms often host multiple online groups with highly overlapping topics and members. How can researchers and designers understand how interactions between related groups affect measures of group health? Inspired by population ecology, prior social computing research has studied competition and mutualism among related groups by correlating group size with degrees of overlap in content and membership. The resulting body of evidence is puzzling as overlaps seem sometimes to help and other times to hurt. We suggest that this confusion results from aggregating inter-group relationships into an overall environmental effect instead of focusing on networks of competition and mutualism among groups. We propose a theoretical framework based on community ecology and a method for inferring competitive and mutualistic interactions from time series participation data. We compare population and community ecology analyses of online community growth by analyzing clusters of subreddits with high user overlap but varying degrees of competition and mutualism.},
keywords = {Computer Science - Human-Computer Interaction,Computer Science - Social and Information Networks}
}

@misc{trienesRecommendingUsersWhom2018,
title = {Recommending {{Users}}: {{Whom}} to {{Follow}} on {{Federated Social Networks}}},
shorttitle = {Recommending {{Users}}},
author = {Trienes, Jan and Cano, Andr{\'e}s Torres and Hiemstra, Djoerd},
year = {2018},
month = nov,
number = {arXiv:1811.09292},
eprint = {1811.09292},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.1811.09292},
urldate = {2024-03-06},
abstract = {To foster an active and engaged community, social networks employ recommendation algorithms that filter large amounts of contents and provide a user with personalized views of the network. Popular social networks such as Facebook and Twitter generate follow recommendations by listing profiles a user may be interested to connect with. Federated social networks aim to resolve issues associated with the popular social networks - such as large-scale user-surveillance and the miss-use of user data to manipulate elections - by decentralizing authority and promoting privacy. Due to their recent emergence, recommender systems do not exist for federated social networks, yet. To make these networks more attractive and promote community building, we investigate how recommendation algorithms can be applied to decentralized social networks. We present an offline and online evaluation of two recommendation strategies: a collaborative filtering recommender based on BM25 and a topology-based recommender using personalized PageRank. Our experiments on a large unbiased sample of the federated social network Mastodon shows that collaborative filtering approaches outperform a topology-based approach, whereas both approaches significantly outperform a random recommender. A subsequent live user experiment on Mastodon using balanced interleaving shows that the collaborative filtering recommender performs on par with the topology-based recommender.},
archiveprefix = {arXiv},
keywords = {Computer Science - Information Retrieval,Computer Science - Social and Information Networks}
}

@article{webberSimilarityMeasureIndefinite2010,
title = {A Similarity Measure for Indefinite Rankings},
author = {Webber, William and Moffat, Alistair and Zobel, Justin},
year = {2010},
month = nov,
journal = {ACM Transactions on Information Systems},
volume = {28},
number = {4},
pages = {20:1--20:38},
issn = {1046-8188},
doi = {10.1145/1852102.1852106},
urldate = {2024-02-14},
abstract = {Ranked lists are encountered in research and daily life and it is often of interest to compare these lists even when they are incomplete or have only some members in common. An example is document rankings returned for the same query by different search engines. A measure of the similarity between incomplete rankings should handle nonconjointness, weight high ranks more heavily than low, and be monotonic with increasing depth of evaluation; but no measure satisfying all these criteria currently exists. In this article, we propose a new measure having these qualities, namely rank-biased overlap (RBO). The RBO measure is based on a simple probabilistic user model. It provides monotonicity by calculating, at a given depth of evaluation, a base score that is non-decreasing with additional evaluation, and a maximum score that is nonincreasing. An extrapolated score can be calculated between these bounds if a point estimate is required. RBO has a parameter which determines the strength of the weighting to top ranks. We extend RBO to handle tied ranks and rankings of different lengths. Finally, we give examples of the use of the measure in comparing the results produced by public search engines and in assessing retrieval systems in the laboratory.},
keywords = {probabilistic models,Rank correlation,ranking}
}

@article{zangerleEvaluatingRecommenderSystems2022,
title = {Evaluating {{Recommender Systems}}: {{Survey}} and {{Framework}}},
shorttitle = {Evaluating {{Recommender Systems}}},
author = {Zangerle, Eva and Bauer, Christine},
year = {2022},
month = dec,
journal = {ACM Computing Surveys},
volume = {55},
number = {8},
pages = {170:1--170:38},
issn = {0360-0300},
doi = {10.1145/3556536},
urldate = {2024-05-07},
abstract = {The comprehensive evaluation of the performance of a recommender system is a complex endeavor: many facets need to be considered in configuring an adequate and effective evaluation setting. Such facets include, for instance, defining the specific goals of the evaluation, choosing an evaluation method, underlying data, and suitable evaluation metrics. In this article, we consolidate and systematically organize this dispersed knowledge on recommender systems evaluation. We introduce the Framework for Evaluating Recommender systems (FEVR), which we derive from the discourse on recommender systems evaluation. In FEVR, we categorize the evaluation space of recommender systems evaluation. We postulate that the comprehensive evaluation of a recommender system frequently requires considering multiple facets and perspectives in the evaluation. The FEVR framework provides a structured foundation to adopt adequate evaluation configurations that encompass this required multi-facetedness and provides the basis to advance in the field. We outline and discuss the challenges of a comprehensive evaluation of recommender systems and provide an outlook on what we need to embrace and do to move forward as a research community.},
keywords = {FEVR,Framework for EValuating Recommender systems,Survey}
}

@inproceedings{zhuPopularityOpportunityBiasCollaborative2021,
title = {Popularity-{{Opportunity Bias}} in {{Collaborative Filtering}}},
booktitle = {Proceedings of the 14th {{ACM International Conference}} on {{Web Search}} and {{Data Mining}}},
author = {Zhu, Ziwei and He, Yun and Zhao, Xing and Zhang, Yin and Wang, Jianling and Caverlee, James},
year = {2021},
month = mar,
series = {{{WSDM}} '21},
pages = {85--93},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
doi = {10.1145/3437963.3441820},
urldate = {2024-05-21},
abstract = {This paper connects equal opportunity to popularity bias in implicit recommenders to introduce the problem of popularity-opportunity bias. That is, conditioned on user preferences that a user likes both items, the more popular item is more likely to be recommended (or ranked higher) to the user than the less popular one. This type of bias is harmful, exerting negative effects on the engagement of both users and item providers. Thus, we conduct a three-part study: (i) By a comprehensive empirical study, we identify the existence of the popularity-opportunity bias in fundamental matrix factorization models on four datasets; (ii) coupled with this empirical study, our theoretical study shows that matrix factorization models inherently produce the bias; and (iii) we demonstrate the potential of alleviating this bias by both in-processing and post-processing algorithms. Extensive experiments on four datasets show the effective debiasing performance of these proposed methods compared with baselines designed for conventional popularity bias.},
isbn = {978-1-4503-8297-7},
keywords = {equal opportunity,recommendation bias,recommender systems,statistical parity}
}

@article{zulliRethinkingSocialSocial2020,
title = {Rethinking the ``Social'' in ``Social Media'': {{Insights}} into Topology, Abstraction, and Scale on the {{Mastodon}} Social Network},
shorttitle = {Rethinking the ``Social'' in ``Social Media''},
author = {Zulli, Diana and Liu, Miao and Gehl, Robert},
year = {2020},
month = jul,
journal = {New Media \& Society},
volume = {22},
number = {7},
pages = {1188--1205},
publisher = {SAGE Publications},
issn = {1461-4448},
doi = {10.1177/1461444820912533},
urldate = {2022-03-13},
abstract = {Online interactions are often understood through the corporate social media (CSM) model where social interactions are determined through layers of abstraction and centralization that eliminate users from decision-making processes. This study demonstrates how alternative social media (ASM) -- namely Mastodon -- restructure the relationship between the technical structure of social media and the social interactions that follow, offering a particular type of sociality distinct from CSM. Drawing from a variety of qualitative data, this analysis finds that (1) the decentralized structure of Mastodon enables community autonomy, (2) Mastodon's open-source protocol allows the internal and technical development of the site to become a social enterprise in and of itself, and (3) Mastodon's horizontal structure shifts the site's scaling focus from sheer number of users to quality engagement and niche communities. To this end, Mastodon helps us rethink ``the social'' in social media in terms of topology, abstraction, and scale.}
}

167
manuscripts/deweb/template.tex
Normal file
@ -0,0 +1,167 @@
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage[submission]{aaai24} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
\usepackage{helvet} % DO NOT CHANGE THIS
\usepackage{courier} % DO NOT CHANGE THIS
\usepackage[hyphens]{url} % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm} % DO NOT CHANGE THIS
\usepackage{natbib} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{siunitx}

\usepackage{graphicx}
\makeatletter
\newsavebox\pandoc@box
\newcommand*\pandocbounded[1]{% scales image to fit in text height/width
  \sbox\pandoc@box{#1}%
  \Gscale@div\@tempa{\textheight}{\dimexpr\ht\pandoc@box+\dp\pandoc@box\relax}%
  \Gscale@div\@tempb{\linewidth}{\wd\pandoc@box}%
  \ifdim\@tempb\p@<\@tempa\p@\let\@tempa\@tempb\fi% select the smaller of both
  \ifdim\@tempa\p@<\p@\scalebox{\@tempa}{\usebox\pandoc@box}%
  \else\usebox{\pandoc@box}%
  \fi%
}
% Set default figure placement to htbp
\def\fps@figure{htbp}
\makeatother

\usepackage{tabularray}

%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
%\usepackage{algorithm}
%\usepackage{algorithmic}

%
% These are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper.
%\usepackage{newfloat}
%\usepackage{listings}
%\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
%\lstset{%
% basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
% numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
% aboveskip=0pt,belowskip=0pt,%
% showstringspaces=false,tabsize=2,breaklines=true}
%\floatstyle{ruled}
%\newfloat{listing}{tb}{lst}{}
%\floatname{listing}{Listing}
%
% Keep the \pdfinfo as shown here. There's no need
% for you to add the /Title and /Author tags.
\pdfinfo{
/TemplateVersion (2024.1)
}

% DISALLOWED PACKAGES
% \usepackage{authblk} -- This package is specifically forbidden
% \usepackage{balance} -- This package is specifically forbidden
% \usepackage{color} -- This package is specifically forbidden (if used in text)
% \usepackage{CJK} -- This package is specifically forbidden
% \usepackage{float} -- This package is specifically forbidden
% \usepackage{flushend} -- This package is specifically forbidden
% \usepackage{fontenc} -- This package is specifically forbidden
% \usepackage{fullpage} -- This package is specifically forbidden
% \usepackage{geometry} -- This package is specifically forbidden
% \usepackage{grffile} -- This package is specifically forbidden
% \usepackage{hyperref} -- This package is specifically forbidden
% \usepackage{navigator} -- This package is specifically forbidden
% (or any other package that embeds links such as navigator or hyperref)
% \usepackage{indentfirst} -- This package is specifically forbidden
% \usepackage{layout} -- This package is specifically forbidden
% \usepackage{multicol} -- This package is specifically forbidden
% \usepackage{nameref} -- This package is specifically forbidden
% \usepackage{savetrees} -- This package is specifically forbidden
% \usepackage{setspace} -- This package is specifically forbidden
% \usepackage{stfloats} -- This package is specifically forbidden
% \usepackage{tabu} -- This package is specifically forbidden
% \usepackage{titlesec} -- This package is specifically forbidden
% \usepackage{tocbibind} -- This package is specifically forbidden
% \usepackage{ulem} -- This package is specifically forbidden
% \usepackage{wrapfig} -- This package is specifically forbidden
% DISALLOWED COMMANDS
% \nocopyright -- Your paper will not be published if you use this command
% \addtolength -- This command may not be used
% \balance -- This command may not be used
% \baselinestretch -- Your paper will not be published if you use this command
% \clearpage -- No page breaks of any kind may be used for the final version of your paper
% \columnsep -- This command may not be used
% \newpage -- No page breaks of any kind may be used for the final version of your paper
% \pagebreak -- No page breaks of any kind may be used for the final version of your paper
% \pagestyle -- This command may not be used
% \tiny -- This is not an acceptable font size.
% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference

\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numbers are desired.
\newcolumntype{d}{S[
  input-open-uncertainty=,
  input-close-uncertainty=,
  parse-numbers = false,
  table-align-text-pre=false,
  table-align-text-post=false
]}

\def\tightlist{}
\def\phantomsection{}
\newcommand\hypertarget[2]{#2}
\newcommand\texorpdfstring[2]{#1}
\newcommand\bookmarksetup[1]{}
\newcommand\href[2]{#2} % hyperref is disallowed, so keep only the link text and drop the URL

\usepackage{longtable}
%\renewenvironment{longtable}{\begin{center}\begin{tabular}}{\end{tabular}\end{center}}
%\def\endhead{}
%\renewcommand{\toprule}[2]{\hline}
%\renewcommand{\midrule}[2]{\hline}
%\renewcommand{\bottomrule}[2]{\hline}
% long table two column hack
\makeatletter
\let\oldlt\longtable
\let\endoldlt\endlongtable
\def\longtable{\@ifnextchar[\longtable@i \longtable@ii}
\def\longtable@i[#1]{\begin{figure}[htbp]
\begin{minipage}{0.5\textwidth}
\onecolumn
\oldlt[#1]
}
\def\longtable@ii{\begin{figure}[htbp]
\begin{minipage}{0.5\textwidth}
\onecolumn
\oldlt
}
\def\endlongtable{\endoldlt
\end{minipage}
\twocolumn
\end{figure}}
\makeatother

\title{$title$}

\begin{document}

\maketitle

$if(abstract)$
\begin{abstract}
$abstract$
\end{abstract}
$endif$

$body$

\bibliography{$bibliography$}

\end{document}
161
manuscripts/ic2s2-2024/index.qmd
Normal file
@ -0,0 +1,161 @@
---
title: "Do Servers Matter on Mastodon? Data-driven Design for Decentralized Social Media"
author: Carl Colglazier
bibliography: ../../references.bib
format:
  ic2s2-pdf:
    include-in-header:
      - text: |
          \usepackage{tabularray}
execute:
  freeze: true
  echo: false
  error: false
  warning: false
  message: false
  cache: false
knitr:
  opts_knit:
    verbose: true
---

```{r, cache.extra = tools::md5sum("codebase/R/helpers.R")}
#| label: fig-account-timeline
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels show the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel mark the announcement day of the Elon Musk Twitter acquisition, the day the acquisition closed, a day when Twitter suspended a number of prominent journalists, and a day when Twitter experienced an outage and started rate limiting accounts."
#| fig-height: 2.75
#| fig-width: 6.75
#| fig-env: figure*
#| fig-pos: tb!

library(here)
source(here("codebase/R/helpers.R"))
get_here <- here::here
account_timeline_plot()
```

Following Twitter's 2022 acquisition, Mastodon---an open-source, decentralized social network and microblogging community---saw an increase in activity and attention as a potential Twitter alternative [@heFlockingMastodonTracking2023; @lacavaDriversSocialInfluence2023]. While millions of people set up new accounts and significantly increased the size of the network (@fig-account-timeline), many of these newcomers and potential newcomers found the process confusing, and many accounts did not remain active. Unlike centralized social media platforms, Mastodon is a network of independent servers with their own rules and norms [@nicholsonMastodonRulesCharacterizing2023]. Servers can communicate with each other using the shared ActivityPub protocol and accounts can move between Mastodon servers, but the local experience can vary widely from server to server.

Although attracting and retaining newcomers is a key challenge for online communities [@krautBuildingSuccessfulOnline2011, p. 182], Mastodon's onboarding process has not always been straightforward. Variation among servers can also present a challenge for newcomers, who may not even be aware of the specific rules, norms, or general topics of interest on the server they are joining [@diazUsingMastodonWay2022]. Further, many Mastodon servers have specific norms which people coming from Twitter may find confusing, such as local norms around content warnings [@nicholsonMastodonRulesCharacterizing2023]. Various guides and resources for people trying to join Mastodon offered mixed advice on choosing a server. Some suggest that the most important thing is to simply join any server and work from there [@krasnoffMastodon101How2022; @silberlingBeginnerGuideMastodon2023], while others have created tools and guides to help people find potential servers of interest by size and location [@rousseauMastodonInstances2017; @kingMastodonMe2024].

Mastodon's approach to onboarding has also changed over time. In much of 2020 and early 2021, the Mastodon developers closed signups to their flagship server and linked to an alternative server, which saw increased sign-ups during this period. They also linked to a list of servers on the Join Mastodon webpage [@mastodonggmbhServers], where all servers are pre-approved and follow the Mastodon Server Covenant, which guarantees certain content moderation standards and data protections. Starting in 2023, the Mastodon developers shifted toward making the flagship server the default when people sign up on the official Mastodon Android and iOS apps [@rochkoNewOnboardingExperience2023; @rothItGettingEasier2023].

We first ask the question: *Does server choice matter for Mastodon newcomers?* To address it, we used profile data from over a million Mastodon accounts collected from public timelines and profile directories between October 1, 2020 and August 15, 2023. With a subset of these accounts created from May 1 to June 30, 2023, we fit a Kaplan–Meier estimator for account activity in the 91 days after creation (@fig-survival). We find that accounts on the 12 largest general instances featured at the top of the Join Mastodon webpage (which include the flagship server) are less likely to remain active than accounts created on other Join Mastodon servers.
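
For illustration, a survival curve of this kind can be computed with the `lifelines` Python package. This is only a sketch under assumed column names, not the analysis code (which lives in `codebase/R/survival.R`):

```python
# A minimal Kaplan-Meier sketch. Assumes a data frame with hypothetical
# columns `days_active` (days from creation to last observed activity,
# capped at 91) and `went_inactive` (1 if the account went inactive within
# the window, 0 if censored).
import pandas as pd
from lifelines import KaplanMeierFitter

accounts = pd.read_feather("data/scratch/account_survival.feather")  # hypothetical path
kmf = KaplanMeierFitter()
kmf.fit(accounts["days_active"], event_observed=accounts["went_inactive"])
print(kmf.survival_function_)  # estimated P(still active) at each day 0..91
```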

To corroborate this model, we also use data from thousands of accounts which moved between Mastodon servers, taking advantage of the platform's data portability. Conceiving of these moves as edges within a weighted, directed network where nodes represent servers, edges represent account moves, and weights represent the number of accounts that moved between servers, we construct an exponential family random graph model (ERGM) with terms for server size, open registrations, and language match between servers. We find that accounts are more likely to move from larger servers to smaller ones.
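
The ERGM itself is fit separately (the fitted models are loaded in @tbl-ergm-table below); as a sketch of just the network construction, the move graph could be assembled like this, with hypothetical column names:

```python
# Build the weighted, directed move network described above. Assumes a
# table of account moves with hypothetical columns `from_server` and
# `to_server`, one row per moved account.
import networkx as nx
import pandas as pd

moves = pd.DataFrame({
    "from_server": ["mastodon.social", "mastodon.social", "mstdn.social"],
    "to_server": ["hci.social", "hci.social", "fosstodon.org"],
})

G = nx.DiGraph()
for (src, dst), n in moves.value_counts(["from_server", "to_server"]).items():
    G.add_edge(src, dst, weight=int(n))  # weight = accounts that moved src -> dst

print(G["mastodon.social"]["hci.social"]["weight"])  # 2
```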

```{=html}
<!--
We found that users who sign up on large, general topic servers are less likely to remain active than those who sign up on smaller servers. We also found that many users who move their accounts between servers tend to gravitate toward smaller servers over time.
-->
```

Based on these findings, we suggest a need for better ways for potential newcomers to find servers, and we propose a viable way to create server and tag recommendations on Mastodon, which could both help newcomers find servers that match their interests and help established accounts discover "neighborhoods" of related servers. One challenge in building such a system is that a single, central actor which collects data from servers and then distributes recommendations would be antithetical to the decentralized nature of Mastodon. Instead, we propose a system where servers report their top hashtags, ranked by the number of unique accounts on the server that used them during the last three months. Such a system would be opt-in and would require few additional server resources since tags already have their own database table.
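
A report in this scheme could be as small as a list of tag names with unique-account counts over the reporting window; the JSON shape sketched below is hypothetical and not part of Mastodon's current API:

```python
# Hypothetical shape of an opt-in tag report; neither this schema nor any
# such endpoint exists in Mastodon today.
import json

report = {
    "server": "hci.social",
    "window": {"start": "2023-05-01", "end": "2023-08-01"},
    "tags": [
        {"name": "hci", "accounts": 57},
        {"name": "academia", "accounts": 41},
    ],
}
print(json.dumps(report, indent=2))
```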

In our proposal, after collecting these top tags, each server uses Okapi BM25 to construct a term frequency-inverse document frequency (TF-IDF) matrix that associates the top tags with each server in its known network. We suggest first filtering to consider only tags used by a minimum number of accounts on a server and only tags used on a minimum number of servers. The counts of tag-account pairs from each server make up the term frequency, and the number of servers that use each tag makes up the inverse document frequency. The system can then apply L2 normalization to the vectors for each tag and calculate the cosine similarity between the tag vectors for each server. To find the similarity between tags, the system could also calculate the cosine similarity between the server vectors.
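
The following sketch illustrates this weighting and similarity step under stated assumptions: `counts` stands in for the reported tag-account counts, and the BM25 parameters `k1` and `b` are conventional defaults rather than tuned values:

```python
# A sketch of the proposed pipeline, not the deployed implementation.
# `counts[t, s]` is the number of unique accounts on server s using tag t;
# random data stands in for real reports here.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

rng = np.random.default_rng(0)
counts = rng.poisson(1.0, size=(500, 40)).astype(float)  # (n_tags, n_servers)

k1, b = 1.2, 0.75                 # common BM25 defaults (assumed, not tuned)
doc_len = counts.sum(axis=0)      # tag-account pairs per server ("document length")
n_servers = counts.shape[1]
df = (counts > 0).sum(axis=1)     # number of servers using each tag
idf = np.log(1 + (n_servers - df + 0.5) / (df + 0.5))

# Okapi BM25 weighting: saturating term frequency, length-normalized by server size
tf = (counts * (k1 + 1)) / (counts + k1 * (1 - b + b * doc_len / doc_len.mean()))
bm25 = tf * idf[:, None]

server_sim = cosine_similarity(normalize(bm25.T))  # servers compared by their tag vectors
tag_sim = cosine_similarity(normalize(bm25))       # tags compared by their server vectors
```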

To determine the viability of the recommendation system, we simulated various scenarios that limit both the servers that report data and the number of tags they report. We then used rank-biased overlap (RBO) to compare the outputs of these simulations to a baseline with more complete information from all tags on all servers. @fig-simulations-rbo shows how the average agreement with the baseline scales linearly with the logarithm of the tag count.
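
For reference, the truncated base form of RBO can be computed directly from its definition in @webberSimilarityMeasureIndefinite2010; this sketch is illustrative rather than the exact scoring code used in the simulations:

```python
# Truncated rank-biased overlap between two ranked lists. The persistence
# parameter p controls how quickly weight decays down the ranking; p = 0.9
# is an illustrative choice, not necessarily the value used in the paper.
def rbo(list_a: list, list_b: list, p: float = 0.9) -> float:
    depth = min(len(list_a), len(list_b))
    seen_a, seen_b = set(), set()
    overlap_sum = 0.0
    for d in range(1, depth + 1):
        seen_a.add(list_a[d - 1])
        seen_b.add(list_b[d - 1])
        overlap_sum += p ** (d - 1) * len(seen_a & seen_b) / d
    return (1 - p) * overlap_sum

# Two server rankings that agree near the top score close to 1
print(rbo(["a", "b", "c", "d"], ["a", "b", "d", "c"]))
```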

Thus, based on analysis of trace data from millions of new Mastodon accounts, we find evidence that servers matter and that users tend to move from larger servers to smaller ones. We then propose a recommendation system that can help new Mastodon users find servers with a high probability of being a good match for their interests. Based on simulations, we demonstrate that such a tool can be deployed effectively in a federated manner, even with limited data on each local server.

```{r, cache.extra = tools::md5sum("codebase/R/survival.R")}
#| cache: true
#| label: fig-survival
#| fig-cap: "Survival probabilities for accounts created during May and June 2023 on servers featured on Join Mastodon. Groups represent whether the account is on one of the 12 largest and most prominently featured servers or another Join Mastodon server."
library(here)
source(here("codebase/R/survival.R"))
plot_km
```

::: {#tbl-ergm-table}
```{r}
#| label: table-ergm-table
#| echo: false
#| warning: false
#| message: false
#| error: false

library(here)
library(modelsummary)
library(kableExtra)
library(purrr)
library(stringr)
load(file = here("data/scratch/ergm-model-early.rda"))
load(file = here("data/scratch/ergm-model-late.rda"))

x <- modelsummary(
  list("Coef." = model.early, "Std.Error" = model.early, "Coef." = model.late, "Std.Error" = model.late),
  estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"),
  statistic = NULL,
  gof_omit = ".*",
  coef_rename = c(
    "sum" = "Sum",
    "nonzero" = "Nonzero",
    "diff.sum0.h-t.accounts" = "Smaller server",
    "nodeocov.sum.accounts" = "Server size\n(outgoing)",
    "nodeifactor.sum.registrations.TRUE" = "Open registrations\n(incoming)",
    "nodematch.sum.language" = "Languages match"
  ),
  align = "lrrrr",
  stars = c('*' = .05, '**' = 0.01, '***' = .001),
  output = "latex_tabular"
  #output = "markdown",
  #table.envir = 'table*',
  #table.env = "table*"
) #%>% add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2))

x
```

Exponential family random graph models for account movement between Mastodon servers. Accounts in Model A were created in May 2022 and moved to another server at some later point. Accounts in Model B were created at some earlier point and moved after October 2023.
:::

::: {#tbl-sim-servers}
```{r}
#| label: table-sim-servers
library(tidyverse)
library(arrow)

sim_servers <- "data/scratch/server_similarity.feather" %>%
  here::here() %>%
  arrow::read_ipc_file()
server_of_interest <- "hci.social"
server_table <- sim_servers %>%
  arrange(desc(Similarity)) %>%
  filter(Source == server_of_interest | Target == server_of_interest) %>%
  head(5) %>%
  pivot_longer(cols = c(Source, Target)) %>%
  filter(value != server_of_interest) %>%
  select(value, Similarity) %>%
  rename("Server" = "value")

if (knitr::is_latex_output()) {
  server_table %>% knitr::kable(format = "latex", booktabs = TRUE, digits = 3)
} else {
  server_table %>% knitr::kable(digits = 3)
}
```

The top five servers most similar to hci.social, a Mastodon server focused on human-computer interaction research. Each of these servers relates to computer science, academia, or technology.
:::

```{r}
#| label: fig-simulations-rbo
#| fig-env: figure*
#| cache: true
#| fig-width: 6.75
#| fig-height: 3
#| fig-pos: tb
#| fig-cap: "Rank-biased overlap between server similarity rankings from the simulations and the baseline rankings, varied by the number of tags reported by each server and the number of servers that report data. The baseline uses 256 tags."
library(tidyverse)
library(arrow)
simulations <- arrow::read_ipc_file(here::here("data/scratch/simulation_rbo.feather"))

simulations %>%
  group_by(servers, tags, run) %>%
  summarize(rbo = mean(rbo), .groups = "drop") %>%
  mutate(ltags = as.integer(log2(tags))) %>%
  ggplot(aes(x = factor(ltags), y = rbo, fill = factor(ltags))) +
  geom_boxplot() +
  facet_wrap(~servers, nrow = 1) +
  #scale_y_continuous(limits = c(0, 1)) +
  labs(x = "Tags (log2)", y = "RBO", title = "Rank Biased Overlap with Baseline Rankings by Number of Servers") +
  theme_minimal() +
  theme(legend.position = "none")
```

@ -3,7 +3,7 @@ title: Revisions and Response
author: Carl Colglazier
---

Provide background for the recommendation system
**Provide background for the recommendation system**

> - Identify key examples of the kinds of systems/features the one you have proposed/created aligns with.
>
@ -11,7 +11,7 @@ Provide background for the recommendation system

I added two sections to the background section of the text which describe recommender systems/collaborative filtering and trade-offs in different methods of evaluation. This system connects with prior work from HCI researchers, e.g. in the GroupLens lab, to build discovery and recommender systems.

## Elaborate the design rationale for the system in the text.
**Elaborate the design rationale for the system in the text.**

> - Why recommend small/specific servers?
>
@ -19,7 +19,17 @@ I added two sections to the background section of the text which describe recomm

In addition to the previous survival models, I added a logistic regression model based on a continuous measure of server "generality" to support the decision to steer newcomers toward more topic-based and smaller servers. Future work can look at specific users to see if engagement with hashtags and local timelines is indicative of better retention.

TODO: key related work
I added more information to the...

**Spell out some "user stories"/examples and use these to illuminate system performance/constraints**

> + Elaborate the hci.social example in the text
>
> + Identify some hypothetical users/interests and explore system outputs/performance for each
>
> + Identify/elaborate some of the tradeoffs/constraints (e.g., emacs + gardening; privacy/trust/safety of tags; need for opt-in)

I added a description of what the hci.social example illustrates (clusters of servers with related topics of focus). In the [appendix](https://files.carl.cx/junior-sheer/#user-stories), I listed a few starter "user stories" and described their performance. I have elaborated further about tradeoffs and design choices in the "Constraints and Evaluation" subsection.

## Address system evaluation more directly in the paper

@ -27,6 +37,10 @@ TODO: key related work
>
> - Elaborate/justify additional system evaluation plans (e.g., usability; robustness to dropping servers/tags; balancing tradeoffs; navigating privacy/trust/safety concerns)

I have added more background information on evaluation and added further explanations for the current evaluation within the paper, as well as plans for future work and improvements in this area.

## Clearly identify the research/design contributions of this system

> - Both at present and assuming your proposed development plans move forward

I have added more information to the discussion section to clearly identify the research and design contributions of this system.
@ -67,6 +67,23 @@
langid = {american}
}

@article{colglazierEffectsGroupSanctions2024,
title = {The {{Effects}} of {{Group Sanctions}} on {{Participation}} and {{Toxicity}}: {{Quasi-experimental Evidence}} from the {{Fediverse}}},
shorttitle = {The {{Effects}} of {{Group Sanctions}} on {{Participation}} and {{Toxicity}}},
author = {Colglazier, Carl and TeBlunthuis, Nathan and Shaw, Aaron},
year = {2024},
month = may,
journal = {Proceedings of the International AAAI Conference on Web and Social Media},
volume = {18},
pages = {315--328},
issn = {2334-0770},
doi = {10.1609/icwsm.v18i1.31316},
urldate = {2024-06-02},
abstract = {Online communities often overlap and coexist, despite incongruent norms and approaches to content moderation. When communities diverge, decentralized and federated communities may pursue group-level sanctions, including defederation (disconnection) to block communication between members of specific communities. We investigate the effects of defederation in the context of the Fediverse, a set of decentralized, interconnected social networks with independent governance. Mastodon and Pleroma, the most popular software powering the Fediverse, allow administrators on one server to defederate from another. We use a difference-in-differences approach and matched controls to estimate the effects of defederation events on participation and message toxicity among affected members of the blocked and blocking servers. We find that defederation causes a drop in activity for accounts on the blocked servers, but not on the blocking servers. Also, we find no evidence of an effect of defederation on message toxicity.},
copyright = {Copyright (c) 2024 Association for the Advancement of Artificial Intelligence},
langid = {english}
}

@inproceedings{cosleySuggestBotUsingIntelligent2007,
title = {{{SuggestBot}}: {{Using Intelligent Task Routing}} to {{Help People Find Work}} in {{Wikipedia}}},
shorttitle = {{{SuggestBot}}},
@ -167,6 +184,23 @@
keywords = {API,computational,Facebook,social media,Twitter}
}

@article{gehlDigitalCovenantNoncentralized2023,
title = {The Digital Covenant: Non-Centralized Platform Governance on the Mastodon Social Network},
shorttitle = {The Digital Covenant},
author = {Gehl, Robert W. and Zulli, Diana},
year = {2023},
month = dec,
journal = {Information, Communication \& Society},
volume = {26},
number = {16},
pages = {3275--3291},
publisher = {Routledge},
issn = {1369-118X},
doi = {10.1080/1369118X.2022.2147400},
urldate = {2024-05-31},
keywords = {Alternative social media,federalist political theory,mastodon,platform governance,social media}
}

@article{gillespieContentModerationAI2020,
title = {Content Moderation, {{AI}}, and the Question of Scale},
author = {Gillespie, Tarleton},
@ -182,7 +216,7 @@
urldate = {2021-09-28},
abstract = {AI seems like the perfect response to the growing challenges of content moderation on social media platforms: the immense scale of the data, the relentlessness of the violations, and the need for human judgments without wanting humans to have to make them. The push toward automated content moderation is often justified as a necessary response to the scale: the enormity of social media platforms like Facebook and YouTube stands as the reason why AI approaches are desirable, even inevitable. But even if we could effectively automate content moderation, it is not clear that we should.},
langid = {english},
keywords = {Artificial intelligence,bias,content moderation,platforms,scale,social media}
keywords = {Artificial intelligence,bias,content moderation,digital platform,platforms,scale,social media}
}

@inproceedings{heFlockingMastodonTracking2023,
@ -349,12 +383,14 @@
keywords = {aspect model,cold-start,collaborative filtering,information filtering,three-way aspect model,triadic aspect model}
}

@misc{masnickProtocolsNotPlatforms,
@techreport{masnickProtocolsNotPlatforms2019,
title = {Protocols, {{Not Platforms}}: {{A Technological Approach}} to {{Free Speech}}},
shorttitle = {Protocols, {{Not Platforms}}},
author = {Masnick, Mike},
year = {2019},
month = aug,
institution = {Knight First Amendment Institute},
urldate = {2022-04-21},
howpublished = {https://knightcolumbia.org/content/protocols-not-platforms-a-technological-approach-to-free-speech},
langid = {english}
}

@ -408,7 +444,7 @@
langid = {english}
}

@inproceedings{ramanChallengesDecentralisedWeb2019a,
@inproceedings{ramanChallengesDecentralisedWeb2019,
title = {Challenges in the {{Decentralised Web}}: {{The Mastodon Case}}},
shorttitle = {Challenges in the {{Decentralised Web}}},
booktitle = {Proceedings of the {{Internet Measurement Conference}}},
@ -503,6 +539,15 @@
langid = {english}
}

@misc{rousseauMastodonInstances2017,
title = {Mastodon Instances},
author = {Rousseau, Amaury},
year = {2017},
journal = {instances.social},
urldate = {2024-03-04},
howpublished = {https://instances.social/}
}

@book{saltonIntroductionModernInformation1987,
title = {Introduction to Modern Information Retrieval},
author = {Salton, Gerard and McGill, Michael J.},
@ -589,15 +634,6 @@
keywords = {Computer Science - Human-Computer Interaction,Computer Science - Social and Information Networks}
}

@misc{thekinrarMastodonInstances2017,
title = {Mastodon Instances},
author = {TheKinrar},
year = {2017},
journal = {instances.social},
urldate = {2024-03-04},
howpublished = {https://instances.social/}
}

@misc{trienesRecommendingUsersWhom2018,
title = {Recommending {{Users}}: {{Whom}} to {{Follow}} on {{Federated Social Networks}}},
shorttitle = {Recommending {{Users}}},
@ -611,7 +647,7 @@
doi = {10.48550/arXiv.1811.09292},
urldate = {2024-03-06},
abstract = {To foster an active and engaged community, social networks employ recommendation algorithms that filter large amounts of contents and provide a user with personalized views of the network. Popular social networks such as Facebook and Twitter generate follow recommendations by listing profiles a user may be interested to connect with. Federated social networks aim to resolve issues associated with the popular social networks - such as large-scale user-surveillance and the miss-use of user data to manipulate elections - by decentralizing authority and promoting privacy. Due to their recent emergence, recommender systems do not exist for federated social networks, yet. To make these networks more attractive and promote community building, we investigate how recommendation algorithms can be applied to decentralized social networks. We present an offline and online evaluation of two recommendation strategies: a collaborative filtering recommender based on BM25 and a topology-based recommender using personalized PageRank. Our experiments on a large unbiased sample of the federated social network Mastodon shows that collaborative filtering approaches outperform a topology-based approach, whereas both approaches significantly outperform a random recommender. A subsequent live user experiment on Mastodon using balanced interleaving shows that the collaborative filtering recommender performs on par with the topology-based recommender.},
archiveprefix = {arxiv},
archiveprefix = {arXiv},
keywords = {Computer Science - Information Retrieval,Computer Science - Social and Information Networks}
}

@ -2,11 +2,13 @@
|
||||
local({
|
||||
|
||||
# the requested version of renv
|
||||
version <- "1.0.3"
|
||||
version <- "1.0.7"
|
||||
attr(version, "sha") <- NULL
|
||||
|
||||
# the project directory
|
||||
project <- getwd()
|
||||
project <- Sys.getenv("RENV_PROJECT")
|
||||
if (!nzchar(project))
|
||||
project <- getwd()
|
||||
|
||||
# use start-up diagnostics if enabled
|
||||
diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE")
|
||||
@ -31,6 +33,14 @@ local({
|
||||
if (!is.null(override))
|
||||
return(override)
|
||||
|
||||
# if we're being run in a context where R_LIBS is already set,
|
||||
# don't load -- presumably we're being run as a sub-process and
|
||||
# the parent process has already set up library paths for us
|
||||
rcmd <- Sys.getenv("R_CMD", unset = NA)
|
||||
rlibs <- Sys.getenv("R_LIBS", unset = NA)
|
||||
if (!is.na(rlibs) && !is.na(rcmd))
|
||||
return(FALSE)
|
||||
|
||||
# next, check environment variables
|
||||
# TODO: prefer using the configuration one in the future
|
||||
envvars <- c(
|
||||
@ -50,9 +60,22 @@ local({
|
||||
|
||||
})
|
||||
|
||||
if (!enabled)
|
||||
# bail if we're not enabled
|
||||
if (!enabled) {
|
||||
|
||||
# if we're not enabled, we might still need to manually load
|
||||
# the user profile here
|
||||
profile <- Sys.getenv("R_PROFILE_USER", unset = "~/.Rprofile")
|
||||
if (file.exists(profile)) {
|
||||
cfg <- Sys.getenv("RENV_CONFIG_USER_PROFILE", unset = "TRUE")
|
||||
if (tolower(cfg) %in% c("true", "t", "1"))
|
||||
sys.source(profile, envir = globalenv())
|
||||
}
|
||||
|
||||
return(FALSE)
|
||||
|
||||
}
|
||||
|
||||
# avoid recursion
|
||||
if (identical(getOption("renv.autoloader.running"), TRUE)) {
|
||||
warning("ignoring recursive attempt to run renv autoloader")
|
||||
@ -108,6 +131,21 @@ local({
|
||||
|
||||
}
|
||||
|
||||
heredoc <- function(text, leave = 0) {
|
||||
|
||||
# remove leading, trailing whitespace
|
||||
trimmed <- gsub("^\\s*\\n|\\n\\s*$", "", text)
|
||||
|
||||
# split into lines
|
||||
lines <- strsplit(trimmed, "\n", fixed = TRUE)[[1L]]
|
||||
|
||||
# compute common indent
|
||||
indent <- regexpr("[^[:space:]]", lines)
|
||||
common <- min(setdiff(indent, -1L)) - leave
|
||||
paste(substring(lines, common), collapse = "\n")
|
||||
|
||||
}
|
||||
|
||||
startswith <- function(string, prefix) {
|
||||
substring(string, 1, nchar(prefix)) == prefix
|
||||
}
|
||||
@ -610,6 +648,9 @@ local({
|
||||
|
||||
# if the user has requested an automatic prefix, generate it
|
||||
auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA)
|
||||
if (is.na(auto) && getRversion() >= "4.4.0")
|
||||
auto <- "TRUE"
|
||||
|
||||
if (auto %in% c("TRUE", "True", "true", "1"))
|
||||
return(renv_bootstrap_platform_prefix_auto())
|
||||
|
||||
@ -801,24 +842,23 @@ local({

  # the loaded version of renv doesn't match the requested version;
  # give the user instructions on how to proceed
  remote <- if (!is.null(description[["RemoteSha"]])) {
  dev <- identical(description[["RemoteType"]], "github")
  remote <- if (dev)
    paste("rstudio/renv", description[["RemoteSha"]], sep = "@")
  } else {
  else
    paste("renv", description[["Version"]], sep = "@")
  }

  # display both loaded version + sha if available
  friendly <- renv_bootstrap_version_friendly(
    version = description[["Version"]],
    sha = description[["RemoteSha"]]
    sha = if (dev) description[["RemoteSha"]]
  )

  fmt <- paste(
    "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.",
    "- Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.",
    "- Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.",
    sep = "\n"
  )
  fmt <- heredoc("
    renv %1$s was loaded from project library, but this project is configured to use renv %2$s.
    - Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.
    - Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.
  ")
  catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote)

  FALSE
@ -1041,7 +1081,7 @@ local({
  # if jsonlite is loaded, use that instead
  if ("jsonlite" %in% loadedNamespaces()) {

    json <- catch(renv_json_read_jsonlite(file, text))
    json <- tryCatch(renv_json_read_jsonlite(file, text), error = identity)
    if (!inherits(json, "error"))
      return(json)
@ -1050,7 +1090,7 @@ local({
  }

  # otherwise, fall back to the default JSON reader
  json <- catch(renv_json_read_default(file, text))
  json <- tryCatch(renv_json_read_default(file, text), error = identity)
  if (!inherits(json, "error"))
    return(json)
@ -1063,14 +1103,14 @@ local({
  }

  renv_json_read_jsonlite <- function(file = NULL, text = NULL) {
    text <- paste(text %||% read(file), collapse = "\n")
    text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n")
    jsonlite::fromJSON(txt = text, simplifyVector = FALSE)
  }

  renv_json_read_default <- function(file = NULL, text = NULL) {

    # find strings in the JSON
    text <- paste(text %||% read(file), collapse = "\n")
    text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n")
    pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]'
    locs <- gregexpr(pattern, text, perl = TRUE)[[1]]
@ -1118,14 +1158,14 @@ local({
  map <- as.list(map)

  # remap strings in object
  remapped <- renv_json_remap(json, map)
  remapped <- renv_json_read_remap(json, map)

  # evaluate
  eval(remapped, envir = baseenv())

  }

  renv_json_remap <- function(json, map) {
  renv_json_read_remap <- function(json, map) {

    # fix names
    if (!is.null(names(json))) {
@ -1152,7 +1192,7 @@ local({
  # recurse
  if (is.recursive(json)) {
    for (i in seq_along(json)) {
      json[i] <- list(renv_json_remap(json[[i]], map))
      json[i] <- list(renv_json_read_remap(json[[i]], map))
    }
  }
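One note on the JSON hunks above: swapping catch() for base R's tryCatch(..., error = identity) lets the bootstrap script avoid renv's own error helper, and readLines(file, warn = FALSE) likewise replaces the internal read() wrapper. The overall shape — try the preferred parser, fall back to a simpler built-in one — is the same pattern as this hypothetical Python sketch (orjson here stands in for jsonlite; none of this is part of the commit):

import json

def read_json(text: str):
    # prefer the faster third-party parser when it is importable,
    # fall back to the standard library on any failure
    try:
        import orjson
        return orjson.loads(text)
    except Exception:
        return json.loads(text)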
@ -1,5 +1,5 @@
{
  "bioconductor.version": null,
  "bioconductor.version": "3.19",
  "external.libraries": [],
  "ignored.packages": [],
  "package.dependency.fields": [
@ -3,13 +3,17 @@ asttokens==2.4.1
comm==0.2.1
debugpy==1.8.0
decorator==5.1.1
exceptiongroup==1.2.0
executing==2.0.1
gensim==4.3.2
importlib-metadata==7.0.1
ipykernel==6.29.0
ipython==8.20.0
ipython==8.18.1
jedi==0.19.1
joblib==1.3.2
jupyter_client==8.6.0
jupyter_core==5.7.1
Levenshtein==0.25.0
matplotlib-inline==0.1.6
nest-asyncio==1.6.0
numpy==1.26.4
@ -25,13 +29,19 @@ pure-eval==0.2.2
Pygments==2.17.2
python-dateutil==2.8.2
pyzmq==25.1.2
rapidfuzz==3.6.1
rbo==0.1.3
scikit-learn==1.4.0
scikit-surprise==1.1.4
scipy==1.12.0
setuptools==69.0.3
six==1.16.0
smart-open==6.4.0
stack-data==0.6.3
textdistance==4.6.1
threadpoolctl==3.2.0
tornado==6.4
tqdm==4.66.2
traitlets==5.14.1
typing_extensions==4.9.0
wcwidth==0.2.13
wheel==0.42.0
zipp==3.17.0
40 scripts/all_tag_posts.py Normal file
@ -0,0 +1,40 @@
import polars as pl
from codebase.load_accounts import read_metadata_file


def read_tags_file(file: str, accounts: set[str]) -> pl.DataFrame:
    # NOTE: also closes over the module-level `metadata` frame defined below;
    # it is assigned before this function is first called.
    return pl.read_ipc(file).with_columns(
        pl.concat_str([pl.col("host"), pl.lit("_"), pl.col("acct")]).alias("account_id")
    ).filter(
        pl.col("account_id").is_in(accounts)
    ).filter(#pl.col("sensitive") == False).filter(
        pl.col("language") == "en").filter(
        pl.col("tags").list.len() <= 8
    ).unique(["host", "id"]).filter(
        pl.col("host").is_in(set(metadata["server"]))
    ).select(pl.col(["host", "acct", "id", "sensitive", "created_at", "tags"])).explode("tags").with_columns(
        pl.col("tags").str.to_lowercase()
    ).with_columns(
        pl.when(pl.col("sensitive")).then(pl.col("tags") + "_sensitive").otherwise(pl.col("tags")).alias("tags")
    ).group_by(["host", "acct", "id", "created_at"]).agg([
        pl.col("tags")
    ])


metadata = read_metadata_file("data/metadata-2024-01-31.feather").select(pl.col(["server", "user_count"])).filter(pl.col("user_count") >= 100)
accounts = pl.scan_ipc("data/scratch/all_accounts.feather").select(
    pl.col(["server", "acct", "bot", "noindex", "followers_count", "suspended"])
).filter(pl.col("bot") == False).filter(
    pl.col("noindex") == False).filter(
    pl.col("followers_count") > 1).filter(
    pl.col("suspended").fill_null(False) == False
).collect().rename({"server": "host"}).select(pl.col(["host", "acct"])).unique(["host", "acct"]).with_columns(
    pl.concat_str([pl.col("host"), pl.lit("_"), pl.col("acct")]).alias("id")
)

all_tag_posts = pl.concat([
    read_tags_file("data/tags_filt.feather", set(accounts["id"].to_list())),
    read_tags_file("data/tags-202302-202308.feather", set(accounts["id"].to_list()))
]).unique(["host", "id"])#.filter(pl.col("created_at") >= pl.date(2023, 1, 1)).filter(pl.col("created_at") < pl.date(2023, 8, 1))
all_tag_posts.write_ipc("data/scratch/all_tag_posts.feather")

#all_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather").filter(pl.col("created_at") >= pl.date(2023, 2, 1)).filter(pl.col("created_at") < pl.date(2023, 8, 1))
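Downstream consumers read data/scratch/all_tag_posts.feather back with polars, so a quick sanity check on the new file is cheap; a minimal sketch (the aggregation is illustrative, not part of the commit):

import polars as pl

# Count distinct tagging accounts per host as a rough check that the
# bot/noindex/follower filters above behaved as expected.
posts = pl.read_ipc("data/scratch/all_tag_posts.feather")
per_host = (
    posts.explode("tags")
    .group_by("host")
    .agg(pl.col("acct").n_unique().alias("tagging_accounts"))
    .sort("tagging_accounts", descending=True)
)
print(per_host.head(10))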
28 scripts/similar_servers.py Normal file
@ -0,0 +1,28 @@
# Generates a list of similar servers
import polars as pl
import scipy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from FediRecommender import TagData, built_tfidf_matrix

server_samples = set(pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction = 1.0)["host"].to_list())

# Create data object
td = TagData(servers=server_samples, n_tags=256, min_server_accounts=2, data_dir='data')
# Weight tag counts with TagData's BM (BM25-style) scheme
tfidf = td.bm(n_server_accounts=0, n_servers=2, n_accounts=10)#.filter(pl.col("accounts") / pl.col("D") > 0.0001)
baseline_host_to_index = td.host_to_index
full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index).T
m = (full_mat / scipy.sparse.linalg.norm(full_mat, ord=2, axis=0))  # L2-normalize each tag column across hosts
baseline_similarity = cosine_similarity(m)
l = []
# Collect only the upper triangle of the host-host similarity matrix:
# each (Source, Target) pair is recorded once, in index order.
for i in range(np.shape(baseline_similarity)[0] - 1):
    l.append(
        pl.DataFrame({
            "Source": list(td.host_to_index.keys())[i],
            "Target": list(td.host_to_index.keys())[i+1:],
            "Similarity": baseline_similarity[i][i+1:]
        })
    )
similarity_df = pl.concat(l).filter(pl.col("Similarity") > 0.0)
similarity_df.write_ipc("data/scratch/server_similarity.feather")
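Each host pair is written once (Source earlier than Target in index order), so lookups must match a server against either column; a minimal sketch of pulling a server's nearest neighbors (the helper name and example hostname are illustrative, not part of the commit):

import polars as pl

def top_similar(host: str, k: int = 10) -> pl.DataFrame:
    # A pair appears once in the edge list, so match the query host
    # against either endpoint and report the other side.
    edges = pl.read_ipc("data/scratch/server_similarity.feather")
    return (
        edges.filter((pl.col("Source") == host) | (pl.col("Target") == host))
        .with_columns(
            pl.when(pl.col("Source") == host)
            .then(pl.col("Target"))
            .otherwise(pl.col("Source"))
            .alias("Neighbor")
        )
        .select(["Neighbor", "Similarity"])
        .sort("Similarity", descending=True)
        .head(k)
    )

print(top_similar("mastodon.social"))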