A bunch of things.

This commit is contained in:
Carl Colglazier 2024-02-27 14:08:11 -06:00
parent 6a83cc2c24
commit df0cc8e8ec
30 changed files with 5599 additions and 873 deletions

3
.gitignore vendored

@ -2,6 +2,9 @@ data/
.DS_Store
index_cache/
index_files/
*_cache/
*_files/
*.pdf.md
# R stuff
.Rproj.user


@ -2,16 +2,25 @@ project:
type: manuscript
manuscript:
article: index.qmd
article: article.qmd
code-links:
- text: Preprocessing
href: code/preprocess.py
- text: R code
href: code/helpers.R
- href: code/survival.R
notebooks:
# - notebook: _tags.qmd
# - notebook: _pull_pull.qmd
- notebook: notebooks/_moved.qmd
# - notebook: Presentation.qmd
environment: renv.lock
format:
html:
comments:
hypothesis: true
#docx: default
#jats: default
# (other formats)
pdf: default
pdf:
template: template.tex
execute:
freeze: true

1493
aaai24.bst Normal file

File diff suppressed because it is too large.

303
aaai24.sty Normal file

@ -0,0 +1,303 @@
\NeedsTeXFormat{LaTeX2e}%
\ProvidesPackage{aaai24}[2023/06/26 AAAI 2024 Submission format]%
\def\year{2024}%
\typeout{Conference Style for AAAI for LaTeX 2e -- version for submission}%
%
\def\copyright@on{T}
\def\showauthors@on{T}
\def\nocopyright{\gdef\copyright@on{}} % Copyright notice is required for camera-ready only.
\DeclareOption{submission}{%
\gdef\copyright@on{}%
\gdef\showauthors@on{}%
\long\gdef\pdfinfo #1{\relax}%
}%
\ProcessOptions\relax%
% WARNING: IF YOU ARE USING THIS STYLE SHEET FOR AN AAAI PUBLICATION, YOU
% MAY NOT MODIFY IT FOR ANY REASON. MODIFICATIONS (IN YOUR SOURCE
% OR IN THIS STYLE SHEET WILL RESULT IN REJECTION OF YOUR PAPER).
%
% WARNING: This style is NOT guaranteed to work. It is provided in the
% hope that it might make the preparation of papers easier, but this style
% file is provided "as is" without warranty of any kind, either express or
% implied, including but not limited to the implied warranties of
% merchantability, fitness for a particular purpose, or noninfringement.
% You use this style file at your own risk. Standard disclaimers apply.
% There are undoubtably bugs in this style. If you would like to submit
% bug fixes, improvements, etc. please let us know. Please use the contact form
% at www.aaai.org.
%
% Do not use this file unless you are an experienced LaTeX user.
%
% PHYSICAL PAGE LAYOUT
\setlength\topmargin{-0.25in} \setlength\oddsidemargin{-0.25in}
\setlength\textheight{9.0in} \setlength\textwidth{7.0in}
\setlength\columnsep{0.375in} \newlength\titlebox \setlength\titlebox{2.25in}
\setlength\headheight{0pt} \setlength\headsep{0pt}
%\setlength\footheight{0pt} \setlength\footskip{0pt}
\thispagestyle{empty} \pagestyle{empty}
\flushbottom \twocolumn \sloppy
% We're never going to need a table of contents, so just flush it to
% save space --- suggested by drstrip@sandia-2
\def\addcontentsline#1#2#3{}
% gf: PRINT COPYRIGHT NOTICE
\def\copyright@year{\number\year}
\def\copyright@text{Copyright \copyright\space \copyright@year,
Association for the Advancement of Artificial Intelligence (www.aaai.org).
All rights reserved.}
\def\copyrighttext#1{\gdef\copyright@on{T}\gdef\copyright@text{#1}}
\def\copyrightyear#1{\gdef\copyright@on{T}\gdef\copyright@year{#1}}
% gf: End changes for copyright notice (used in \maketitle, below)
% Title stuff, taken from deproc.
%
\def\maketitle{%
\par%
\begingroup % to make the footnote style local to the title
\def\thefootnote{\fnsymbol{footnote}}
\twocolumn[\@maketitle] \@thanks%
\endgroup%
% Insert copyright slug unless turned off
\if T\copyright@on\insert\footins{\noindent\footnotesize\copyright@text}\fi%
%
\setcounter{footnote}{0}%
\let\maketitle\relax%
\let\@maketitle\relax%
\gdef\@thanks{}%
\gdef\@author{}%
\gdef\@title{}%
\let\thanks\relax%
}%
\long\gdef\affiliations #1{ \def \affiliations_{\if T\showauthors@on#1\fi}}%
%
\def\@maketitle{%
\def\theauthors{\if T\showauthors@on\@author\else Anonymous submission\fi}
\newcounter{eqfn}\setcounter{eqfn}{0}%
\newsavebox{\titlearea}
\sbox{\titlearea}{
\let\footnote\relax\let\thanks\relax%
\setcounter{footnote}{0}%
\def\equalcontrib{%
\ifnum\value{eqfn}=0%
\footnote{These authors contributed equally.}%
\setcounter{eqfn}{\value{footnote}}%
\else%
\footnotemark[\value{eqfn}]%
\fi%
}%
\vbox{%
\hsize\textwidth%
\linewidth\hsize%
\vskip 0.625in minus 0.125in%
\centering%
{\LARGE\bf \@title \par}%
\vskip 0.1in plus 0.5fil minus 0.05in%
{\Large{\textbf{\theauthors\ifhmode\\\fi}}}%
\vskip .2em plus 0.25fil%
{\normalsize \affiliations_\ifhmode\\\fi}%
\vskip .5em plus 2fil%
}%
}%
%
\newlength\actualheight%
\settoheight{\actualheight}{\usebox{\titlearea}}%
\ifdim\actualheight>\titlebox%
\setlength{\titlebox}{\actualheight}%
\fi%
%
\vbox to \titlebox {%
\let\footnote\thanks\relax%
\setcounter{footnote}{0}%
\def\equalcontrib{%
\ifnum\value{eqfn}=0%
\footnote{These authors contributed equally.}%
\setcounter{eqfn}{\value{footnote}}%
\else%
\footnotemark[\value{eqfn}]%
\fi%
}%
\hsize\textwidth%
\linewidth\hsize%
\vskip 0.625in minus 0.125in%
\centering%
{\LARGE\bf \@title \par}%
\vskip 0.1in plus 0.5fil minus 0.05in%
{\Large{\textbf{\theauthors\ifhmode\\\fi}}}%
\vskip .2em plus 0.25fil%
{\normalsize \affiliations_\ifhmode\\\fi}%
\vskip .5em plus 2fil%
}%
}%
%
\renewenvironment{abstract}{%
\centerline{\bf Abstract}%
\vspace{0.5ex}%
\setlength{\leftmargini}{10pt}%
\begin{quote}%
\small%
}{%
\par%
\end{quote}%
\vskip 1ex%
}%
% jsp added:
\def\pubnote#1{
\thispagestyle{myheadings}%
\pagestyle{myheadings}%
\markboth{#1}{#1}%
\setlength\headheight{10pt}%
\setlength\headsep{10pt}%
}%
%
% SECTIONS with less space
\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus
-0.5ex minus -.2ex}{3pt plus 2pt minus 1pt}{\Large\bf\centering}}
\def\subsection{\@startsection{subsection}{2}{\z@}{-2.0ex plus
-0.5ex minus -.2ex}{3pt plus 2pt minus 1pt}{\large\bf\raggedright}}
\def\subsubsection{\@startsection{subparagraph}{3}{\z@}{-6pt plus
%%% DIEGO changed: 29/11/2009
%% 2pt minus 1pt}{-1em}{\normalsize\bf}}
-2pt minus -1pt}{-1em}{\normalsize\bf}}
%%% END changed
\renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}{-6pt plus -2pt minus -1pt}{-1em}{\normalsize\bf}}%
\setcounter{secnumdepth}{0}
% add period to section (but not subsection) numbers, reduce space after
%\renewcommand{\thesection}
% {\arabic{section}.\hskip-0.6em}
%\renewcommand{\thesubsection}
% {\arabic{section}.\arabic{subsection}\hskip-0.6em}
% FOOTNOTES
\footnotesep 6.65pt %
\skip\footins 9pt plus 4pt minus 2pt
\def\footnoterule{\kern-3pt \hrule width 5pc \kern 2.6pt }
\setcounter{footnote}{0}
% LISTS AND PARAGRAPHS
\parindent 10pt
\topsep 4pt plus 1pt minus 2pt
\partopsep 1pt plus 0.5pt minus 0.5pt
\itemsep 0.5pt plus 1pt minus 0.5pt
\parsep 2pt plus 1pt minus 0.5pt
\leftmargin 10pt \leftmargini 13pt \leftmarginii 10pt \leftmarginiii 5pt \leftmarginiv 5pt \leftmarginv 5pt \leftmarginvi 5pt
\labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt
\def\@listi{\leftmargin\leftmargini}
\def\@listii{\leftmargin\leftmarginii
\labelwidth\leftmarginii\advance\labelwidth-\labelsep
\topsep 2pt plus 1pt minus 0.5pt
\parsep 1pt plus 0.5pt minus 0.5pt
\itemsep \parsep}
\def\@listiii{\leftmargin\leftmarginiii
\labelwidth\leftmarginiii\advance\labelwidth-\labelsep
\topsep 1pt plus 0.5pt minus 0.5pt
\parsep \z@
\partopsep 0.5pt plus 0pt minus 0.5pt
\itemsep \topsep}
\def\@listiv{\leftmargin\leftmarginiv
\labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
\def\@listv{\leftmargin\leftmarginv
\labelwidth\leftmarginv\advance\labelwidth-\labelsep}
\def\@listvi{\leftmargin\leftmarginvi
\labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
\abovedisplayskip 7pt plus2pt minus5pt%
\belowdisplayskip \abovedisplayskip
\abovedisplayshortskip 0pt plus3pt%
\belowdisplayshortskip 4pt plus3pt minus3pt%
% Less leading in most fonts (due to the narrow columns)
% The choices were between 1-pt and 1.5-pt leading
\def\normalsize{\@setfontsize\normalsize\@xpt{11}} % 10 point on 11
\def\small{\@setfontsize\small\@ixpt{10}} % 9 point on 10
\def\footnotesize{\@setfontsize\footnotesize\@ixpt{10}} % 9 point on 10
\def\scriptsize{\@setfontsize\scriptsize\@viipt{10}} % 7 point on 8
\def\tiny{\@setfontsize\tiny\@vipt{7}} % 6 point on 7
\def\large{\@setfontsize\large\@xipt{12}} % 11 point on 12
\def\Large{\@setfontsize\Large\@xiipt{14}} % 12 point on 14
\def\LARGE{\@setfontsize\LARGE\@xivpt{16}} % 14 point on 16
\def\huge{\@setfontsize\huge\@xviipt{20}} % 17 point on 20
\def\Huge{\@setfontsize\Huge\@xxpt{23}} % 20 point on 23
\AtBeginDocument{%
\@ifpackageloaded{natbib}%
{%
% When natbib is in use, set the proper style and fix a few things
\let\cite\citep
\let\shortcite\citeyearpar
\setcitestyle{aysep={}}
\setlength\bibhang{0pt}
\bibliographystyle{aaai24}
}{}%
\@ifpackageloaded{hyperref}%
{%
\PackageError{aaai}{You must not use hyperref in AAAI papers.}{You (or one of the packages you imported) are importing the hyperref package, which is forbidden in AAAI papers. You must remove it from the paper to proceed.}
}{}%
\@ifpackageloaded{bbm}%
{%
\PackageError{aaai}{You must not use bbm package in AAAI papers because it introduces Type 3 fonts which are forbidden.}{See https://tex.stackexchange.com/questions/479160/a-replacement-to-mathbbm1-with-type-1-fonts for possible alternatives.}
}{}%
\@ifpackageloaded{authblk}%
{%
\PackageError{aaai}{Package authblk is forbbidden.}{Package authblk is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{balance}%
{%
\PackageError{aaai}{Package balance is forbbidden.}{Package balance is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{CJK}%
{%
\PackageError{aaai}{Package CJK is forbbidden.}{Package CJK is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{flushend}%
{%
\PackageError{aaai}{Package flushend is forbbidden.}{Package flushend is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{fontenc}%
{%
\PackageError{aaai}{Package fontenc is forbbidden.}{Package fontenc is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{fullpage}%
{%
\PackageError{aaai}{Package fullpage is forbbidden.}{Package fullpage is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{geometry}%
{%
\PackageError{aaai}{Package geometry is forbbidden.}{Package geometry is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{grffile}%
{%
\PackageError{aaai}{Package grffile is forbbidden.}{Package grffile is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{navigator}%
{%
\PackageError{aaai}{Package navigator is forbbidden.}{Package navigator is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{savetrees}%
{%
\PackageError{aaai}{Package savetrees is forbbidden.}{Package savetrees is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{setspace}%
{%
\PackageError{aaai}{Package setspace is forbbidden.}{Package setspace is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{stfloats}%
{%
\PackageError{aaai}{Package stfloats is forbbidden.}{Package stfloats is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{tabu}%
{%
\PackageError{aaai}{Package tabu is forbbidden.}{Package tabu is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{titlesec}%
{%
\PackageError{aaai}{Package titlesec is forbbidden.}{Package titlesec is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{tocbibind}%
{%
\PackageError{aaai}{Package tocbibind is forbbidden.}{Package tocbibind is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{ulem}%
{%
\PackageError{aaai}{Package ulem is forbbidden.}{Package ulem is forbbiden. You must find an alternative.}
}{}%
\@ifpackageloaded{wrapfig}%
{%
\PackageError{aaai}{Package wrapfig is forbbidden.}{Package wrapfig is forbbiden. You must find an alternative.}
}{}%
}
\let\endthebibliography=\endlist

361
article.qmd Normal file

@ -0,0 +1,361 @@
---
title: Recommending Servers on Mastodon
short-title: Mastodon Recommendations
authors:
- name: Carl Colglazier
affiliation:
name: Northwestern University
city: Evanston
state: Illinois
country: United States
corresponding: true
bibliography: references.bib
pdf-engine: pdflatex
format:
html: default
pdf+icwsm:
fig-pos: 'ht!bp'
cite-method: natbib
template: template.tex
keep-md: true
link-citations: false
acm-pdf:
output-file: mastodon-recommendations-acm.pdf
acm-metadata:
# comment this out to make submission anonymous
anonymous: true
# comment this out to build a draft version
#final: true
# comment this out to specify detailed document options
# acmart-options: sigconf, review
# acm preamble information
copyright-year: 2018
acm-year: 2018
copyright: acmcopyright
doi: XXXXXXX.XXXXXXX
conference-acronym: "Conference acronym 'XX"
conference-name: |
Make sure to enter the correct
conference title from your rights confirmation email
conference-date: June 03--05, 2018
conference-location: Woodstock, NY
price: "15.00"
isbn: 978-1-4503-XXXX-X/18/06
# if present, replaces the list of authors in the page header.
shortauthors: Colglazier
# The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
# Please copy and paste the code instead of the example below.
ccs: |
\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10010520.10010553.10010562</concept_id>
<concept_desc>Computer systems organization~Embedded systems</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10010520.10010575.10010755</concept_id>
<concept_desc>Computer systems organization~Redundancy</concept_desc>
<concept_significance>300</concept_significance>
</concept>
<concept>
<concept_id>10010520.10010553.10010554</concept_id>
<concept_desc>Computer systems organization~Robotics</concept_desc>
<concept_significance>100</concept_significance>
</concept>
<concept>
<concept_id>10003033.10003083.10003095</concept_id>
<concept_desc>Networks~Network reliability</concept_desc>
<concept_significance>100</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}
\ccsdesc[500]{Computer systems organization~Embedded systems}
\ccsdesc[300]{Computer systems organization~Redundancy}
\ccsdesc{Computer systems organization~Robotics}
\ccsdesc[100]{Networks~Network reliability}
keywords:
- decentralized online social networks
abstract: |
When trying to join the Fediverse, a decentralized collection of interoperable social networking websites, new users face the dilemma of choosing a home server. Using trace data from millions of new Fediverse accounts, we show that new accounts on the flagship server are less likely to remain active and that accounts that move between servers tend to move from larger servers to smaller servers. We then use the insights from our analysis to build a tool that can help new Fediverse users find servers with a high probability of being a good match based on their interests. Based on simulations, we demonstrate that such a tool can be effective even with limited data on each local server.
execute:
echo: false
error: false
warning: false
message: false
freeze: auto
fig-width: 6.75
knitr:
opts_knit:
verbose: true
#filters:
# - parse-latex
---
# Introduction
The Fediverse has emerged as a viable alternative to corporate, centralized social media such as Twitter and Reddit. Over the course of the last two years, millions of people have set up new accounts, significantly increasing the size of the network. In the wake of Elon Musk's Twitter acquisition, Mastodon, a popular Fediverse software which offers a Twitter-like experience, saw an increase in activity and scrutiny.
We show how the onboarding process for Mastodon has changed over time with a particular focus on the largest, flagship Mastodon server. Users who sign up to this server are less likely to remain active. Based on data from over a million Mastodon accounts, we also find that many users who move accounts tend to gravitate toward smaller, more niche servers over time.
We design a potential way to create server and tag recommendations on Mastodon, which could both help newcomers find servers that match their interests and help established accounts discover "neighborhoods" of related servers.
# Background
## Empirical Setting
The Fediverse is a set of decentralized online social networks which interoperate using shared protocols like ActivityPub. Mastodon is a software program used by many Fediverse servers and offers a user experience similar to the Tweetdeck client for Twitter. It was first created in late 2016 and saw a surge in interest in 2022 during and after Elon Musk's Twitter acquisition.
Discovery has been challenging on Mastodon. The developers and user base tend to be skeptical of algorithmic intrusions, instead opting for timelines which only show posts in reverse chronological order. Search is also difficult. Public hashtags are searchable, but most servers have traditionally not supported searching keywords or simple strings. Accounts can only be searched using their full `username@server` form.
Mastodon features a "home" timeline which shows all public posts from accounts that share the same home server. On larger servers, this timeline can be unwieldy; however, on smaller servers, this presents the opportunity to discover new posts and users of potential interest.
Mastodon offers its users high levels of data portability. Users can move their accounts across instances while retaining their follows (their post data, however, does not move with the new account). The choice of an initial instance is consequently not irreversible.
## Newcomers in Online Communities
Onboarding newcomers is an important part of the lifecycle of online communities. Any community can expect a certain amount of turnover, and so it is important for the long-term health and longevity of the community to be able to bring in new members [@krautBuildingSuccessfulOnline2011 p. 182]. However, the process of onboarding newcomers is not always straightforward. Newcomers may have difficulty finding the community, understanding the norms and expectations, and finding a place for themselves within the community. This can lead to high rates of attrition among newcomers.
## The Mastodon Migrations
Mastodon saw a surge in interest in 2022 and 2023, particularly after Elon Musk's Twitter acquisition. Four events in particular drove measurable increases in new users to the network: the announcement of the acquisition (April 14, 2022), the closing of the acquisition (October 27, 2022), a day when Twitter suspended a number of prominent journalists (December 15, 2022), and a day when Twitter experienced an outage and started rate limiting accounts (July 1, 2023). Many Twitter accounts announced they were setting up Mastodon accounts and linked their new accounts to their followers, often using tags like #TwitterMigration [@heFlockingMastodonTracking2023] and driving interest in Mastodon in a process @cavaDriversSocialInfluence2023 found consistent with social influence theory.
The series of migrations of new users into Mastodon in many ways reflects folk stories of "Eternal Septembers" on previous communication networks, where a large influx of newcomers challenged the existing norms [@driscollWeMisrememberEternal2023]. Many Mastodon servers do have specific norms which people coming from Twitter may find confusing, such as local norms around content warnings [@nicholsonMastodonRulesCharacterizing2023]. Variation among servers can also present a challenge for newcomers who may not even be aware of the specific rules, norms, or general topics of interest on the server they are joining [@diazUsingMastodonWay2022].
Some media outlets have framed reports on Mastodon [@hooverMastodonBumpNow2023] through what @zulliRethinkingSocialSocial2020 calls the "Killer Hype Cycle", whereby the media finds a new alternative social media platform, declares it a potential killer of some established platform, and later calls it a failure if it does not displace the existing platform. Such framing fails to take systems like the Fediverse seriously for their own merits: completely replacing existing commercial systems is not the only way to measure success, nor does it account for the real value the Fediverse provides for its millions of active users.
# Data
```{r}
#| label: fig-account-timeline
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panel shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the announcement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day when Twitter suspended a number of prominent journalists, and a day when Twitter experienced an outage and started rate limiting accounts."
#| fig-height: 3
#| fig-width: 6.75
#| fig-env: figure*
#| fig-pos: htb!
library(here)
source(here("code/helpers.R"))
account_timeline_plot()
```
**Mastodon Profiles**: We identified accounts from posts previously collected from public Mastodon timelines between October 2020 and January 2024. We then queried for up-to-date information on each of those accounts, including their most recent status and whether the account had moved. This gave us a total of N accounts. Because we rely on these updated records, we include only accounts on servers which still exist and which returned a record for the account.
**Moved Profiles**: We found a subset of N accounts which had moved from one server to another.
**Tags**: We collected N posts which contained between 2 and 5 hashtags.
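The lookup step for these profiles can be sketched as follows. This is an illustrative example against the public Mastodon REST API; the helper name and the example account are hypothetical and not part of our pipeline.
```python
import requests

def lookup_account(server: str, acct: str) -> dict:
    """Fetch the current public record for one account from its server."""
    r = requests.get(
        f"https://{server}/api/v1/accounts/lookup",
        params={"acct": acct},
        timeout=30,
    )
    r.raise_for_status()
    data = r.json()
    return {
        "acct": data.get("acct"),
        "created_at": data.get("created_at"),
        "last_status_at": data.get("last_status_at"),  # basis for the activity measures
        "moved": data.get("moved"),                     # present only if the account has moved
        "suspended": data.get("suspended", False),
    }

# Example (hypothetical account):
# lookup_account("mastodon.social", "someuser")
```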
# Analysis and Results
## Competition Among Servers in Attracting Newcomers
_How does mastodon.social factor into the aggregate Mastodon onboarding process?_
::: {#fig-mastodon-online-signup-disabled width=50% .content-visible when-format="html"}
![](images/mastodon-social-signups-2020-11-01.png){fig-env="figure" width=6cm height=6cm}
The main page of mastodon.social as viewed from a logged-out web browser on November 1, 2020. The sign-up form is blurred out and replaced with a message suggesting that visitors either sign up on mastodon.online or see a list of servers accepting new accounts at joinmastodon.org.
:::
Throughout its history, Mastodon's flagship server, mastodon.social, has allowed and disallowed open sign-ups at various times. When the website did not allow sign-ups, it displayed a message redirecting those interested in creating an account to a list of potential servers at joinmastodon.org and, during the first closure, to mastodon.online directly.
We found three main periods during which mastodon.social did not accept new signups by first noting the times in @fig-account-timeline where the proportion of new accounts on mastodon.social drops to zero. We then used the Internet Archive to verify that signups were disabled during these periods.
1. An extended period through the end of October 2020.
2. A temporary closure in mid-2022, when the server's email host limited the server.
3. Two periods in late 2022 and early 2023.
We construct an interrupted time series using an autoregressive integrated moving average (ARIMA) model for sign-ups on mastodon.social, the servers linked in joinmastodon.org, and all other servers. For the first period, we also include mastodon.online since mastodon.social linked to it directly during that time.
::: {.content-visible when-format="html"}
$$
\begin{aligned}
y_t &= \beta_0 + \beta_1 \text{open}_t + \beta_2 \text{day}_t + \beta_3 (\text{open} \times \text{day})_t \\
&\quad + \beta_4 \sin\left(\frac{2\pi t}{7}\right) + \beta_5 \cos\left(\frac{2\pi t}{7}\right) \\
&\quad + \beta_6 \sin\left(\frac{4\pi t}{7}\right) + \beta_7 \cos\left(\frac{4\pi t}{7}\right) \\
&\quad + \phi_1 y_{t-1} + \phi_2 y_{t-2} + \epsilon_t
\end{aligned}
$$
where $y_t$ is the number of new accounts on a server at time $t$, $\text{open}_t$ is a binary variable indicating if the server is open to new sign-ups, $\text{day}_t$ is an increasing integer representing the date, and $\epsilon_t$ is a white noise error term. We use the sine and cosine terms to account for weekly seasonality.
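A rough sketch of how such a specification can be estimated, assuming a daily data frame `df` with a `signups` count column and a 0/1 `open` column (names chosen for illustration, not the repository's actual objects), using `statsmodels`, which here fits the regression terms with AR(2) errors rather than explicit lagged outcomes:
```python
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

def fit_interrupted_series(df: pd.DataFrame):
    # df: one row per day with an integer "signups" column and a 0/1 "open" column
    t = np.arange(len(df))
    fourier = np.column_stack([
        np.sin(2 * np.pi * t / 7), np.cos(2 * np.pi * t / 7),  # weekly seasonality
        np.sin(4 * np.pi * t / 7), np.cos(4 * np.pi * t / 7),
    ])
    exog = np.column_stack([df["open"], t, df["open"] * t, fourier])
    model = SARIMAX(df["signups"], exog=exog, order=(2, 0, 0), trend="c")
    return model.fit(disp=False)
```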
| Period | Setting | Significant |
|------------|:----------------|:----|
| 2020-2021 | mastodon.online | Yes |
| | JoinMastodon | No |
| | Other | No |
| Mid 2022 | JoinMastodon | No |
| | Other | No |
| Early 2023 | JoinMastodon | No |
| | Other | No |
: Results from ARIMA models for the number of new accounts on mastodon.social, mastodon.online, servers linked in joinmastodon.org, and all other servers.
:::
::: {.content-visible when-format="pdf+icwsm}
```{=latex}
\begin{table}[!ht]
\centering
\begin{tabular}{|l|l|l|}
\hline
Period & Setting & Significant \\ \hline
2020-2021 & mastodon.online & Yes \\ \hline
~ & JoinMastodon & No \\ \hline
~ & Other & No \\ \hline
Mid 2022 & JoinMastodon & No \\ \hline
~ & Other & No \\ \hline
Early 2023 & JoinMastodon & No \\ \hline
~ & Other & No \\ \hline
\end{tabular}
\end{table}
```
:::
## Survival Model
_Are accounts on mastodon.social less likely to remain active than accounts on other servers?_
```{r, cache.extra = tools::md5sum("code/survival.R")}
#| cache: true
#| label: fig-survival
#| fig-env: figure
#| fig-cap: "Survival probabilities for accounts created during May 2023."
#| fig-width: 3.375
#| fig-height: 2.5
#| fig-pos: h!
library(here)
source(here("code/survival.R"))
plot_survival
```
Using accounts created during May 2023, we create a Kaplan-Meier estimator for the probability that an account remains active, based on whether the account is on mastodon.social or, otherwise, whether it is on a server in the Join Mastodon list. An account is considered active if it posted a status on or after December 1, 2023; all accounts which posted after that point are treated as censored rather than inactive.
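For reference, the Kaplan-Meier estimate of the survival function used here is the standard one,
$$
\hat{S}(t) = \prod_{t_i \le t} \left(1 - \frac{d_i}{n_i}\right),
$$
where $d_i$ is the number of accounts that became inactive at time $t_i$ and $n_i$ is the number of accounts still at risk (active and not yet censored) just before $t_i$.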
The results suggest that accounts on mastodon.social are less likely to remain active than accounts on other servers, but there is no significant difference between accounts on servers in the Join Mastodon list and other servers.
## Moved Accounts
_Do accounts tend to move to larger or smaller servers?_
Mastodon users can move their accounts to another server while retaining their connections (but not their posts) to other Mastodon accounts. This feature, built into the Mastodon software, offers data portability and helps avoid lock-in.
```{r}
#| label: ergm-table
#| echo: false
#| warning: false
#| message: false
#| error: false
library(here)
library(modelsummary)
library(kableExtra)
library(purrr)
library(stringr)
load(file = here("data/scratch/ergm-model-early.rda"))
load(file = here("data/scratch/ergm-model-late.rda"))
if (knitr::is_latex_output()) {
format <- "latex"
} else {
format <- "html"
}
x <- modelsummary(
list("Coef." = model.early, "Std.Error" = model.early, "Coef." = model.late, "Std.Error" = model.late),
estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"),
statistic = NULL,
gof_omit = ".*",
coef_rename = c(
"sum" = "(Sum)",
"diff.sum0.h-t.accounts" = "Smaller server",
"nodeocov.sum.accounts" = "Server size\n(outgoing)",
"nodeifactor.sum.registrations.TRUE" = "Open registrations\n(incoming)",
"nodematch.sum.language" = "Languages match"
),
align="lrrrr",
stars = c('*' = .05, '**' = 0.01, '***' = .001),
output = format
#output = "markdown",
#table.envir='table*',
#table.env="table*"
) %>% add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2))
if (knitr::is_latex_output()) {
x %>% reduce(str_c, capture.output(.), sep="\n") %>% gsub("table", "table*", .) %>% knitr::raw_latex()
} else {
x
}
```
# Proposed Recommendation System
_How can we build an opt-in, low-resource recommendation system for finding Fediverse servers?_
Tailored servers focused on a particular topic and community have advantages for onboarding newcomers; however, it may be difficult for new and existing Mastodon users to discover these communities. To address this gap, we propose a recommendation system for finding new servers. This system would be opt-in and low-resource, requiring only a small amount of data from each server.
First, we construct the ideal system based on observed data: we use all posts we collected from all servers to construct an ideal recommender. We then simulate various scenarios that limit both the servers that report data and the number of tags they report. We then use rank-biased overlap (RBO) to compare the outputs from these simulations to the baseline built with more complete information from all tags on all servers.
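The comparison step can be illustrated with the `rbo` package used by the simulation code in this commit; the two rankings below are hypothetical stand-ins for a full-data baseline and a limited-data ranking of candidate servers.
```python
import rbo

# Hypothetical neighbor rankings for one query server, expressed as item ids:
baseline_ranking = [0, 1, 2, 3, 4, 5, 6, 7]   # ranking from the full-data system
simulated_ranking = [0, 2, 1, 3, 5, 4, 7, 6]  # ranking from a limited-data simulation

# Rank-biased overlap, weighted toward the top of the lists
# (p chosen as in the simulation code).
score = rbo.RankingSimilarity(baseline_ranking, simulated_ranking).rbo(p=0.80, ext=True)
print(score)
```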
## Recommendation System Design
We use a term frequency-inverse document frequency (TF-IDF) model to associate the top tags with each server. For the term frequency, we divide the number of accounts which used the tag during the six-month period by the total number of known account-tag pairs on that server; for the inverse document frequency, we divide the total number of servers by the count of servers reporting the tag. In this implementation, we also apply filters: a tag must be used by at least five accounts on a server to be reported by that server, and it must be used by at least ten accounts across at least three servers in the entire known system.
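In symbols, writing $a_{t,s}$ for the number of accounts on server $s$ that used tag $t$ during the window and $S$ for the set of servers reporting data, the scores just described are
$$
\operatorname{tf}(t,s) = \frac{a_{t,s}}{\sum_{t'} a_{t',s}}, \qquad
\operatorname{idf}(t) = \frac{|S|}{\left|\{ s \in S : a_{t,s} > 0 \}\right|}, \qquad
\text{tf-idf}(t,s) = \operatorname{tf}(t,s) \cdot \operatorname{idf}(t),
$$
with the reporting thresholds above applied before computing the scores.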
## Robustness to Limited Data
```{r}
#| label: fig-simulations-rbo
#| fig-env: figure*
#| cache: true
#| fig-width: 6.75
#| fig-height: 3
#| fig-pos: tb
library(tidyverse)
library(arrow)
simulations <- arrow::read_ipc_file("data/scratch/simulation_rbo.feather")
simulations %>%
group_by(servers, tags, run) %>% summarize(rbo=mean(rbo), .groups="drop") %>%
mutate(ltags = as.integer(log2(tags))) %>%
ggplot(aes(x = factor(ltags), y = rbo, fill = factor(ltags))) +
geom_boxplot() +
facet_wrap(~servers, nrow=1) +
#scale_y_continuous(limits = c(0, 1)) +
labs(x = "Tags (log2)", y = "RBO", title = "Rank Biased Overlap with Baseline Rankings by Number of Servers") +
theme_minimal() + theme(legend.position = "none")
```
We simulated various scenarios that limit both the servers that report data and the number of tags they report, using rank-biased overlap (RBO) to compare the outputs of these simulations to the baseline built with more complete information from all tags on all servers. @fig-simulations-rbo shows how the average agreement with the baseline scales linearly with the logarithm of the tag count.
# Conclusion
Based on analysis of trace data from millions of new Fediverse accounts, we find evidence that suggests that servers matter and that users tend to move from larger servers to smaller servers. We then propose a recommendation system that can help new Fediverse users find servers with a high probability of being a good match based on their interests. Based on simulations, we demonstrate that such a tool can be effectively deployed in a federated manner, even with limited data on each local server.
# References {#references}
::: {.content-visible when-format="html"}
# Appendix {#appendix .appendix}
## Push and Pull Model
{{< include notebooks/_push_pull.qmd >}}
:::

52
code/hclust.py Normal file

@ -0,0 +1,52 @@
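# Exploratory script: build a tag-cluster x server count matrix, compute cosine
# similarities between clusters, and use them to pick diverse or related tag clusters.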
import polars as pl
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
tf_idf = pl.read_ipc("data/scratch/tag_tfidf.feather")
common_tags = pl.read_ipc("data/scratch/common_tags.feather")
clusters = pl.read_ipc("data/scratch/tag_clusters.feather").join(
common_tags.rename({"tags":"tag"}), on="tag", how="inner"
)
n_clusters = tf_idf["cluster"].max() + 1
host_to_index = {host: i for i, host in enumerate(tf_idf["host"].unique().sort().to_list())}
cluster_names = clusters.sort("count", descending=True).unique("cluster").sort("cluster")["tag"].to_list()
n_servers = len(host_to_index)
m = lil_matrix((n_clusters, n_servers), dtype=int)
for row in tf_idf.iter_rows(named=True):
m[row["cluster"], host_to_index[row["host"]]] = row["count"]
sim = cosine_similarity(m.tocsr())
def find_variety(sim, terms, n=20):
allowed_index = clusters.filter(pl.col("count") >= 2000)["cluster"].to_list()
if len(terms) == 0:
terms = [952, 800]#40, 695, 188, 791]
# ai, caturday, books, politics
for i in range(n):
best_terms = list(filter(lambda x: x in allowed_index and x not in terms, np.argsort(np.var(sim[terms], axis=0))))
terms.append(best_terms[0])
return terms
def find_similar_obscure(sim, selected, n=20):
allowed_index = clusters.filter(pl.col("count") >= 100)["cluster"].to_list()
terms = selected
print(-np.sum(sim[terms], axis=0)[706])
print(-np.sum(sim[terms], axis=0))
for i in range(len(selected) + n):
###best_terms = list(filter(lambda x: x in allowed_index and x not in terms, np.argsort(np.var(sim[terms], axis=0))))
best_terms = list(filter(lambda x: x in allowed_index and x not in terms,np.argsort(-np.sum(sim[terms], axis=0))))
terms.append(best_terms[0])
return terms
# Example explorations (run interactively):
np.array(cluster_names)[find_variety(sim, [], n=25)]
np.array(cluster_names)[find_similar_obscure(sim, [337, 1242, 1250], n=10)]
np.array(cluster_names)[find_variety(sim, [337, 1242, 1250], n=10)]
obscure_terms = find_similar_obscure(sim, [337, 1242, 940, 1108, 1454, 612, 260], n=10)
list(zip(np.array(cluster_names)[obscure_terms], obscure_terms))
np.array(cluster_names)[find_similar_obscure(sim, [337, 1242, 1108, 1454, 612, 260, 424], n=10)]

130
code/helpers.R Normal file

@ -0,0 +1,130 @@
library(tidyverse)
library(arrow)
library(here)
library(patchwork)
library(scales)
theme_bw_small_labels <- function(base_size = 9) {
theme_bw(base_size = base_size) %+replace%
theme(
plot.title = element_text(size = base_size * 0.8),
plot.subtitle = element_text(size = base_size * 0.75),
plot.caption = element_text(size = base_size * 0.7),
axis.title = element_text(size = base_size * 0.9),
axis.text = element_text(size = base_size * 0.8),
legend.title = element_text(size = base_size * 0.9),
legend.text = element_text(size = base_size * 0.8)
)
}
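# Load the preprocessed accounts table. With filt = TRUE (the default), apply the
# shared filters (drop bots, suspended, limited, and moved accounts; restrict the
# creation window) and derive the activity measures used throughout the analysis.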
load_accounts <- function(filt = TRUE) {
accounts_unfilt <- arrow::read_feather(
here("data/scratch/all_accounts.feather"),
col_select=c(
"server", "username", "created_at", "last_status_at",
"statuses_count", "has_moved", "bot", "suspended",
"following_count", "followers_count", "locked",
"noindex", "group", "discoverable", "limited"
))
if (!filt) {
return(accounts_unfilt)
}
return(
accounts_unfilt %>%
filter(!bot) %>%
# TODO: what's going on here?
filter(!is.na(last_status_at)) %>%
#mutate(limited = replace_na(limited, FALSE)) %>%
mutate(suspended = replace_na(suspended, FALSE)) %>%
filter(!limited) %>%
# sanity check
filter(!suspended) %>%
filter(!has_moved) %>%
#filter(!limited) %>%
filter(created_at >= "2020-08-14") %>%
filter(created_at < "2024-01-01") %>%
# We don't want accounts that were created and then immediately stopped being active
filter(statuses_count >= 1) %>%
filter(last_status_at > created_at) %>%
mutate(active = last_status_at >= "2024-01-01") %>%
mutate(last_status_at_censored = ifelse(active, lubridate::ymd_hms("2024-01-01 00:00:00", tz = "UTC"), last_status_at)) %>%
mutate(active_time = difftime(last_status_at, created_at, units="days"))
)
}
account_timeline_plot <- function() {
jm <- arrow::read_feather(here("data/scratch/joinmastodon.feather"))
moved_to <- arrow::read_feather(here("data/scratch/individual_moved_accounts.feather"))
accounts_unfilt <- arrow::read_feather(
here("data/scratch/all_accounts.feather"),
col_select=c(
"server", "username", "created_at", "last_status_at",
"statuses_count", "has_moved", "bot", "suspended",
"following_count", "followers_count", "locked",
"noindex", "group", "discoverable"
))
accounts <- accounts_unfilt %>%
filter(!bot) %>%
# TODO: what's going on here?
filter(!is.na(last_status_at)) %>%
mutate(suspended = replace_na(suspended, FALSE)) %>%
# sanity check
filter(created_at >= "2020-10-01") %>%
filter(created_at < "2024-01-01") %>%
# We don't want accounts that were created and then immediately stopped being active
filter(statuses_count >= 1) %>%
filter(last_status_at >= created_at) %>%
mutate(active = last_status_at >= "2024-01-01") %>%
mutate(last_status_at = ifelse(active, lubridate::ymd_hms("2024-01-01 00:00:00", tz = "UTC"), last_status_at)) %>%
mutate(active_time = difftime(last_status_at, created_at, units="days")) #%>%
#filter(!has_moved)
acc_data <- accounts %>%
#filter(!has_moved) %>%
mutate(created_month = format(created_at, "%Y-%m")) %>%
mutate(created_week = floor_date(created_at, unit = "week")) %>%
mutate(active_now = active) %>%
mutate(active = active_time >= 45) %>%
mutate("Is mastodon.social" = server == "mastodon.social") %>%
mutate(jm = server %in% jm$domain) %>%
group_by(created_week) %>%
summarize(
`JoinMastodon Server` = sum(jm) / n(),
`Is mastodon.social` = sum(`Is mastodon.social`)/n(),
Suspended = sum(suspended)/n(),
Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
Moved=sum(has_moved)/n(),
count=n()) %>%
pivot_longer(cols=c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"
p1 <- acc_data %>%
ggplot(aes(x=as.Date(created_week), group=1)) +
geom_line(aes(y=value, group=Measure, color=Measure)) +
geom_point(aes(y=value, color=Measure), size=0.7) +
scale_y_continuous(limits = c(0, 1.0)) +
labs(y="Proportion") + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week") +
theme_bw_small_labels() +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
p2 <- acc_data %>%
distinct(created_week, count) %>%
ggplot(aes(x=as.Date(created_week), y=count)) +
geom_bar(stat="identity", fill="black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-10-27"))),
linetype="dashed", color = "black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-04-14"))),
linetype="dashed", color = "black") +
# https://twitter.com/elonmusk/status/1675187969420828672
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-12-15"))),
linetype="dashed", color = "black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2023-07-01"))),
linetype="dashed", color = "black") +
#scale_y_continuous(limits = c(0, max(acc_data$count) + 100000)) +
scale_y_continuous(labels = scales::comma) +
labs(y="Count", x="Created Week") +
theme_bw_small_labels() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week")
return(p1 + p2 + plot_layout(ncol = 1))
}


@ -34,6 +34,10 @@ def read_metadata_file(f):
pl.col("data").struct.field("email").alias("email"),
pl.col("data").struct.field("version").alias("version"),
pl.col("data").struct.field("stats").struct.field("user_count").alias("user_count"),
pl.col("data").struct.field("languages").alias("languages"),
pl.col("data").struct.field("registrations").alias("registrations"),
pl.col("data").struct.field("approval_required").alias("approval_required"),
pl.col("data").struct.field("invites_enabled").alias("invites_enabled"),
)
def read_accounts_file(f):
@ -65,6 +69,7 @@ def read_accounts_file(f):
pl.col("data").struct.field("statuses_count"),
pl.col("data").struct.field("last_status_at"),
pl.col("data").struct.field("noindex"),
pl.col("data_string").str.contains("""\"limited\": true""").alias("limited"),
).with_columns(
pl.when(
pl.col("last_status_at").str.len_chars() > 10).then(

99
code/preprocess.py Normal file

@ -0,0 +1,99 @@
from load_accounts import *
from urllib.parse import urlparse
import polars as pl
def run_preprocess():
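# Parse the raw account dumps, derive host / moved / suspended columns, extract
# account moves, and write the intermediate feather files under data/scratch/.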
#accounts = pl.concat(
# read_accounts_file("data/accounts.feather"),
# read_accounts_file("data/account_lookup_2023.feather")
#)
accounts = read_accounts_file(
"data/account_lookup_compressed.feather"
).unique(["account", "server"])
# Write a parsed accounts file for R to use
a = accounts.with_columns(
pl.col("url").map_elements(
lambda x: urlparse(x).netloc.encode().decode('idna')
).alias("host"),
pl.col("data_string").str.contains("""\"moved\": \{""").alias("has_moved"),
pl.col("data").struct.field("suspended"),
)
a_save = a.drop(["data", "data_string"])
a_save.select(
sorted(a_save.columns)
).write_ipc("data/scratch/accounts.feather")
# Parse the JSON again now that we know these rows are all moved accounts
moved_accounts = a.filter(pl.col("has_moved")).with_columns(
pl.col("data_string").str.json_decode().alias("data")
).with_columns(
pl.col("data").struct.field("moved")
).drop_nulls("moved").with_columns(
pl.col("moved").struct.field("acct").alias("moved_acct"),
).with_columns(
pl.when(
pl.col("moved_acct").str.contains('@')
).then(
pl.col("moved_acct").str.split('@').list.get(1)
).otherwise(
pl.col("server")
).alias("moved_server"),
pl.when(
pl.col("moved_acct").str.contains('@')
).then(
pl.col("moved_acct").str.split('@').list.get(0)
).otherwise(
pl.col("moved_acct")
).alias("moved_acct")
)
number_of_accounts = len(a)
popular_servers = a.group_by("server").count().sort("count", descending=True)
common_moves = moved_accounts.group_by(
["server", "moved_server"]
).count().sort("count", descending=True)
common_moves.write_ipc("data/scratch/moved_accounts.feather")
common_moves.rename({
"server": "Source",
"moved_server": "Target",
}).write_csv("data/scratch/moved_accounts.csv")
maccounts = moved_accounts.select(["account", "server", "moved_server", "moved_acct"])
maccounts.write_ipc("data/scratch/individual_moved_accounts.feather")
popular_servers.write_ipc("data/scratch/popular_servers.feather")
jm = pl.read_json("data/joinmastodon.json")
jm.write_ipc("data/scratch/joinmastodon.feather")
read_metadata_file("data/metadata-2024-01-31.feather").drop(
["data", "data_string"]
).write_ipc("data/scratch/metadata.feather")
read_metadata_file("data/metadata_2023-10-01.feather").drop(
["data", "data_string"]
).write_ipc("data/scratch/metadata-2023-10-01.feather")
profile_accounts = read_accounts_file("data/profiles_local.feather")
p = profile_accounts.with_columns(
pl.col("url").map_elements(lambda x: urlparse(x).netloc.encode().decode('idna')).alias("host"),
pl.col("username").alias("account"),
pl.lit(False).alias("has_moved"),
pl.lit(False).alias("suspended"),
).drop(
["data", "data_string"]
)
p.select(sorted(p.columns)).write_ipc("data/scratch/accounts_processed_profiles.feather")
all_accounts = pl.scan_ipc(
[
"data/scratch/accounts.feather",
#"data/scratch/accounts_processed_recent.feather",
"data/scratch/accounts_processed_profiles.feather"
]).collect()
all_accounts.filter(pl.col("host").eq(pl.col("server"))).unique(["account", "server"]).write_ipc("data/scratch/all_accounts.feather")
if __name__ == "__main__":
run_preprocess()

31
code/recommender.py Normal file

@ -0,0 +1,31 @@
# Fit an NMF topic model over tag clusters with gensim.
# Assumes `all_tag_posts_filtered`, `clusters`, and `tf_idf` are already in scope
# (produced by other scripts in this repository).
import polars as pl
from scipy.sparse import lil_matrix
from gensim.corpora.dictionary import Dictionary
from gensim.models import Nmf
host_bow_clusters = all_tag_posts_filtered.explode("tags").rename({"tags":"tag"}).join(
clusters, on="tag", how="inner"
).drop("tag").join(
clusters, on="cluster", how="inner"
).drop("cluster").unique(["host", "id", "tag"]).group_by("host").agg([
pl.col("tag")
])
bow_str = host_bow_clusters["tag"].to_list()
dict = Dictionary(bow_str)
bow = [dict.doc2bow(x) for x in bow_str]
nmf = Nmf(bow, num_topics=10)
##
#tf_idf
host_names = tf_idf["host"].unique().sort().to_list()
n_servers = len(host_names)
host_name_lookup = {host_names[i]: i for i in range(n_servers)}
n_clusters = tf_idf["cluster"].max() + 1#len(tf_idf.unique("cluster"))
id_names = {i: clusters.unique("cluster")["tag"].to_list()[i] for i in range(n_clusters)}
m = lil_matrix((n_clusters, n_servers), dtype=int)
for row in tf_idf.iter_rows(named=True):
m[row["cluster"], host_name_lookup[row["host"]]] = row["count"]
dict = Dictionary([host_names])
nmf = Nmf(corpus=m.tocsc(), num_topics=128, id2word=id_names)


@ -0,0 +1,150 @@
import polars as pl
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import textdistance
from scipy.stats import kendalltau
import rbo
def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_matrix:
#tag_to_index = {tag: i for i, tag in enumerate(tfidf["tags"].unique().sort().to_list())}
n_tags = len(tag_to_index)
#host_to_index = {host: i for i, host in enumerate(tfidf["host"].unique().sort().to_list())}
n_hosts = len(host_to_index)
m = lil_matrix((n_tags, n_hosts), dtype=float)
for row in df.iter_rows(named=True):
m[tag_to_index[row["tags"]], host_to_index[row["host"]]] = row["tf_idf"]
return m
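# TagData holds per-server tag usage counts for a chosen set of servers, keeping
# only each server's top `n_tags` tags; tfidf() turns those counts into the
# tag-server TF-IDF scores described in the paper.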
class TagData:
def __init__(self, servers: set[str], n_tags: int, min_server_accounts: int = 1):
self.servers = servers
self.n_tags = n_tags
all_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather").filter(
#all_tag_posts = read_tag_posts.filter(
pl.col("created_at") >= pl.date(2023, 2, 1)
).filter(pl.col("created_at") < pl.date(2023, 8, 1)).filter(
pl.col("host").is_in(servers)
)
all_tag_posts_topn = all_tag_posts.explode("tags").unique(["host", "acct", "tags"]).group_by(["host", "tags"]).agg([
pl.col("id").len().alias("accounts"), # How many accounts on the server are using this tag?
]).sort("accounts", descending=True).with_columns(pl.lit(1).alias("counter")).with_columns(
pl.col("counter").cumsum().over("host").alias("running_count")
).filter(pl.col("running_count") <= n_tags).drop("counter", "running_count").filter(pl.col("accounts") >= min_server_accounts)
self._all_tag_posts_topn = all_tag_posts_topn
self._server_accounts = all_tag_posts_topn.group_by("host").agg([
pl.sum("accounts").alias("accounts_sum"), # The total number of account-tag pairs
])#.filter(pl.col("server_accounts") >= 10)
#self._server_accounts = all_tag_posts.unique(["host", "acct"]).group_by("host").agg([
# pl.col("acct").len().alias("accounts_sum"), # The total number of accounts on the server
#])
self._most_seen_tags = self._all_tag_posts_topn.group_by("tags").agg([
pl.sum("accounts").alias("total_accounts"), # account sum, how many accounts are using this tag excluding those on servers where they are the only ones
pl.col("accounts").len().alias("server_count") # server count, how many servers are using this tag?
]).sort("server_count", descending=True)#.filter(pl.col("server_count") >= 3).filter(pl.col("total_accounts") >= 10)
self.tag_to_index = {tag: i for i, tag in enumerate(self._all_tag_posts_topn["tags"].unique().sort().to_list())}
self.host_to_index = {host: i for i, host in enumerate(self._all_tag_posts_topn["host"].unique().sort().to_list())}
def server_accounts(self, n=10):
return self._server_accounts.filter(pl.col("accounts_sum") >= n)
def most_seen_tags(self, n_servers=3, n_accounts=10):
return self._most_seen_tags.filter(pl.col("server_count") >= n_servers).filter(pl.col("total_accounts") >= n_accounts)
def tfidf(self, n_server_accounts=5, n_servers=3, n_accounts=10):
most_seen_tags = self.most_seen_tags(n_servers, n_accounts)
server_accounts = self.server_accounts(n_server_accounts)
tf = self._all_tag_posts_topn.join(
most_seen_tags, on="tags", how="inner"
).join(
server_accounts, on="host", how="inner"
).with_columns(
(pl.col("accounts") / pl.col("accounts_sum")).alias("tf")
)
n_servers = len(self._all_tag_posts_topn.unique("host"))
idf = most_seen_tags.with_columns((n_servers/pl.col("server_count")).alias("idf"))
tfidf = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True)
return tfidf
# Constraint: What if we only consider the _top_ 100 tags from each server?
# Server clusters work quite well!
# Tag clusters?
#tag_simiarlity = cosine_similarity(full_mat.tocsr())
#tag_simiarlity[td.tag_to_index["ai"]]
#np.array(list(td.tag_to_index.keys()))[np.argsort(-tag_simiarlity[td.tag_to_index["ai"]])][0:10]
#np.array(list(td.tag_to_index.keys()))[np.argsort(-tag_simiarlity[td.tag_to_index["mastoart"]])][0:10]
#baseline = np.argsort(-host_simiarlity[host_to_index["hci.social"]])
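# Draw a random sample of servers, rebuild TF-IDF rankings from only their top
# `n_tags` tags, and score each sampled server's neighbor ranking against the
# full-data baseline with rank-biased overlap (RBO).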
def sampler(host_list, n_servers, n_tags, baseline, baseline_td: TagData):
baseline_keys = set(baseline_td.host_to_index.keys())
server_samples = set(host_list.filter(
pl.col("host").is_in(baseline_keys)
).sample(n=n_servers-1)["host"].to_list())
server_is = [baseline_td.host_to_index[i] for i in server_samples]
sampled_server_indices = np.array(server_is)
tagdata = TagData(server_samples, n_tags, min_server_accounts=5)
tfidf = tagdata.tfidf(n_server_accounts=5, n_servers=3, n_accounts=10)#n_server_accounts=0, n_servers=2, n_accounts=1)
m = built_tfidf_matrix(tfidf, baseline_td.tag_to_index, baseline_td.host_to_index)
host_sim = cosine_similarity(m.tocsr().T)
rs = []
for serv in server_samples:
comp_server_index = baseline_td.host_to_index[serv]
bl = np.argsort(-baseline[comp_server_index][sampled_server_indices])
comparison = np.argsort(-host_sim[comp_server_index][sampled_server_indices])
reference_ranks = {x: i for i, x in enumerate(bl)}
current_ranks = [reference_ranks[x] for x in comparison]
r = rbo.RankingSimilarity(list(range(len(current_ranks)))[1:], current_ranks[1:]).rbo(p=0.80, k=16, ext=True)
rs.append(r)
return rs
def run_simulations():
#read_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather")
server_samples = set(pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction = 1.0)["host"].to_list())
td = TagData(server_samples, 1_000_000, min_server_accounts=5)
tfidf = td.tfidf()
baseline_host_to_index = td.host_to_index
full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index)
baseline_similarlity = cosine_similarity(full_mat.tocsr().T)
#np.array(list(td.host_to_index.keys()))[np.argsort(-baseline_similarlity[td.host_to_index["hci.social"]])][0:10]
#np.array(list(td.host_to_index.keys()))[np.argsort(-baseline_similarlity[td.host_to_index["urbanists.social"]])][0:10]
host_list = pl.scan_ipc(
"data/scratch/all_tag_posts.feather"
).select("host").unique().collect()
runs = []
for server_sizes in [256, 128, 64, 32]: #
for tag_counts in [4096, 2048, 1028, 512, 256, 128, 64, 32, 16, 8, 4]:
for run in range(128):
print(server_sizes, tag_counts, run)
s = sampler(host_list, server_sizes, tag_counts, baseline_similarlity, td)
runs.append(pl.DataFrame({"servers": server_sizes, "tags": tag_counts, "run": run, "rbo": s}))
print(np.mean(s))
all_runs = pl.concat(runs)
all_runs.write_ipc("data/scratch/simulation_rbo.feather")
jm = pl.read_json("data/joinmastodon-2023-08-25.json")
jm_servers = set(jm["domain"].unique().to_list())
jm_td = TagData(jm_servers, 32, min_server_accounts=5)
jm_tfidf = jm_td.tfidf(n_server_accounts=5, n_servers=3, n_accounts=10)
mat = built_tfidf_matrix(jm_tfidf, jm_td.tag_to_index, jm_td.host_to_index)
similarlity = cosine_similarity(mat.tocsr().T)
tag_sm = cosine_similarity(mat.tocsr())
tag_index_included = (np.sum(tag_sm, axis=0) > 0)
included_tag_strings = np.array(list(jm_td.tag_to_index.keys()))[tag_index_included]
tag_sm_matrix = tag_sm[np.ix_(tag_index_included, tag_index_included)]
# import Affinity Prop
from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation(affinity="precomputed", random_state=0).fit(tag_sm_matrix)
clusters = pl.DataFrame({"tag": included_tag_strings, "cluster": ap.labels_})
# select a random element from each cluster
clusters.group_by("cluster").agg([pl.col("tag").shuffle().first().alias("tag")]).sort("cluster")["tag"].to_list()
example_topics = ["tech", "linux", "hacking", "gamedev"]
example_indices = [s in example_topics for s in included_tag_strings]
similar_servers = cosine_similarity(np.array(example_indices).reshape(-1,1).T, mat[np.ix_(tag_index_included)].T)
np.array(list(jm_td.host_to_index.keys()))[np.argsort(-similar_servers[0])][0:10]
#np.array(list(jm_td.host_to_index.keys()))[np.argsort(-similarlity[jm_td.host_to_index["historians.social"]])][0:10]
#np.array(list(jm_td.host_to_index.keys()))[np.where(np.sum(mat, axis=0) < 0.01)[1]]

139
code/scratch/gensim.py Normal file

@ -0,0 +1,139 @@
# Scratch exploration of an LSI topic model over tag clusters.
# Assumes `all_tag_posts_filtered`, `clusters`, and `cluster_names` are already in scope
# (produced elsewhere in this repository).
import numpy as np
import polars as pl
from gensim.corpora.dictionary import Dictionary
from gensim.models import LsiModel
from sklearn.metrics.pairwise import cosine_similarity

host_bow_clusters = all_tag_posts_filtered.explode("tags").rename({"tags":"tag"}).join(
clusters, on="tag", how="inner"
).drop("tag").join(
cluster_names, on="cluster", how="inner"
).drop("cluster").unique(["host", "id", "tag"]).group_by("host").agg([
pl.col("tag")
])
bow_str = host_bow_clusters["tag"].to_list()
dict = Dictionary(bow_str)
bow = [dict.doc2bow(x) for x in bow_str]
lsi_model = LsiModel(bow, id2word=dict, num_topics=100)
lsi_vectors = [lsi_model[doc] for doc in bow]
lsi_model.print_topics()
topic_matrix = lsi_model.get_topics()
topic_matrix_bool = topic_matrix > 0
cluster_names.filter(pl.col("tag") == "vote")
cluster_name_list = cluster_names["tag"].to_list()
topic_cum_weights = np.sum(np.where(topic_matrix < 0, 0, topic_matrix), axis=0) - np.sum(np.where(topic_matrix > 0, 0, topic_matrix), axis=0)
np.array(cluster_names["tag"])[np.argsort(-np.sum(np.abs(topic_matrix), axis=0))]
from scipy.spatial.distance import pdist, squareform
distances = squareform(pdist(np.transpose(topic_matrix), 'euclidean'))
np.clip(topic_matrix, 0, None)
def opposite_sum(a):
return np.sum(np.where(a > 0, a, 0)) - np.sum(np.where(a < 0, a, 0))
def get_information_gain(model, dict, word: str, words_yes: list[str], words_no: list[str]):
words_yes = words_yes + [word]
words_no = words_yes #+ [word]
yes = np.array([x[1] for x in model[dict.doc2bow(words_yes)]])
no = np.array([x[1] for x in model[dict.doc2bow(words_no)]])
#if len(words_no) == 1:
# return np.sum(opposite_sum(yes))
if len(words_yes) == 1:
return np.sum(opposite_sum(no))
return np.sum(np.abs(yes - no))
get_information_gain(lsi_model, dict, "turkey", [], [])
ig = [get_information_gain(lsi_model, dict, x, [], []) for x in cluster_name_list]
u_matrix = lsi_model.projection.u # U matrix (document-topic matrix)
singular_values = lsi_model.projection.s # Singular values
# Idea: construct two lists: one of selected topics, one of "anti-topics!"
picked_topics = []
unpicked_topics = ["movies", "genealogy", "nfl", "horror", "aiart", "media"]
ig = []
for x in cluster_name_list:
if (x not in picked_topics + unpicked_topics):
ig.append(get_information_gain(lsi_model, dict, x, picked_topics, unpicked_topics))
else:
ig.append(0)
cluster_name_list[np.argmax(ig)]
from gensim.similarities import Similarity, MatrixSimilarity
my_profile = dict.doc2bow([
"polars", "fediverse", "mastodon", "quartopub", "influencer", "julia", "introduction",
"chicago"
])
x = pl.scan_ipc("data/tags-2020-2022.feather").filter(
pl.col("host") == "elekk.xyz"
).filter(pl.col("acct") == "hoppet").collect()
y = x.explode("tags").with_columns(pl.col("tags").str.to_lowercase()).rename({"tags":"tag"}).join(
clusters, on="tag", how="inner"
).drop("tag").join(
cluster_names, on="cluster", how="inner"
).drop("cluster")
my_profile = dict.doc2bow(y["tag"].to_list())
my_vector = [x[1] for x in lsi_model[my_profile]]
def build_host_topic_matrix(v):
# input is a list of lists where we want to do nested x[1]
a = []
for x in v:
a.append([y[1] for y in x])
return a
similarities = cosine_similarity([my_vector], build_host_topic_matrix(lsi_vectors))[0]
sim_df = pl.DataFrame({
"host": host_bow_clusters["host"],
"similarity": similarities
}).sort("similarity", descending=True)
#[cosine_similarity(my_vector, x) for x in build_host_topic_matrix(lsi_vectors)]
#cosine_similarity([my_vector], build_host_topic_matrix(lsi_vectors))
# topic_matrix_bool = topic_matrix > 0
# topic_cum_weights = np.sum(np.where(topic_matrix < 0, 0, topic_matrix), axis=0) - np.sum(np.where(topic_matrix > 0, 0, topic_matrix), axis=0)
# cluster_name_list = cluster_names["tag"].to_list()
# return np.array(cluster_name_list)[np.argsort(-topic_cum_weights)]
#n_components, comp_labels = scipy.sparse.csgraph.connected_components(S, directed=False)
####
#m, tag_index = document_matrix(common_tags["tags"].to_list(), all_tag_posts_filtered["tags"].to_list())
###
#dict = Dictionary(all_tag_posts_filtered["tags"].to_list())
####all_tag_posts_filtered["tags"].to_list()
#bow = [dict.doc2bow(x) for x in all_tag_posts_filtered["tags"].to_list()]#[0:100000]]
#tf_idf_model = TfidfModel(bow)#, dictionary=dict)
#tf_idf = [tf_idf_model[doc] for doc in bow]
#hdp = HdpModel(bow, dict)
###
# tf-idf on m
#from sklearn.feature_extraction.text import TfidfTransformer
#posts_tf_idf = TfidfTransformer().fit_transform(m)
"""
pairs = all_tag_posts_filtered.with_columns(pl.col("tags").map_elements(pairwise_sets).alias("pairs")).explode("pairs").select(pl.col(["host", "acct", "pairs"])).unique()
pairs_counts = pairs.group_by("pairs").count().sort("count", descending=True).with_columns(
pl.col("pairs").map_elements(lambda x: x.split(",")).alias("pairs")
)
total_pairs = len(pairs)
df = pairs_counts.with_columns(
pl.col("pairs").list.get(0).alias("first"),
pl.col("pairs").list.get(1).alias("last")
).drop(["pairs"]).join(
account_paired_tag_counts.rename({"tags":"first","count":"first_count"}),on="first",how="inner"
).join(
account_paired_tag_counts.rename({"tags":"last","count":"last_count"}),on="last",how="inner"
)
"""

55
code/survival.R Normal file
View File

@@ -0,0 +1,55 @@
library(here)
library(survival)
library(ggsurvfit)
source(here("code/helpers.R"))
options(arrow.skip_nul = TRUE)
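# Cohort: non-moved, unlocked accounts created May-June 2023; an account counts as active if it posted on or after 2023-12-01, and active_time is right-censored at that date.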
a <- load_accounts() %>%
filter(!has_moved) %>%
filter(locked == FALSE) %>%
anti_join(., arrow::read_feather(here("data/scratch/individual_moved_accounts.feather")), by=c("username"="moved_acct", "server"="moved_server")) %>%
inner_join(arrow::read_feather(here("data/scratch/metadata.feather")), by="server") %>%
filter(created_at > "2023-04-30") %>%
filter(created_at <= "2023-06-30") %>%
filter(created_at < last_status_at) %>%
mutate(jm = server %in% arrow::read_feather(here("data/scratch/joinmastodon.feather"))$domain) %>%
mutate(active = last_status_at >= as.Date("2023-12-01")) %>%
#mutate(last_status_at = ifelse(active, lubridate::ymd_hms("2023-09-01 00:00:00", tz = "UTC"), last_status_at)) %>%
mutate(active_time = difftime(ifelse(active, lubridate::ymd_hms("2023-12-01 00:00:00", tz = "UTC"), last_status_at), created_at, units="days")) %>%
#mutate(active_time = difftime(last_status_at, created_at, units="days")) %>%
mutate(status = ifelse(active, 0, 1))# %>% filter(followers_count > 0) %>% filter(following_count > 0)
server_summary <- a %>%
group_by(server) %>%
summarize(cohort_size = n(), .groups = "drop")
sel_a <- a %>%
mutate(is_ms = server == "mastodon.social") %>%
ungroup() %>%
inner_join(server_summary, by = "server") %>% filter(!noindex) #%>% filter(user_count > 100)
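# Cox proportional hazards model of time to inactivity, with indicators for mastodon.social and joinmastodon.org listing; robust standard errors clustered by server.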
cx <- sel_a %>%
coxph(Surv(active_time, status) ~ is_ms + jm, data = ., x=TRUE, robust = T, cluster=server)
cz <- cox.zph(cx)
plot_survival <- sel_a %>%
#filter(followers_count > 0) %>%
#filter(following_count > 0) %>%
mutate(id = paste(username, server, sep = "@")) %>%
survfit2(
Surv(active_time, status) ~ jm + is_ms, # is_jm
data = ., id = id,
cluster = server,
robust = TRUE
) %>%
ggsurvfit() +
labs(
y = "Overall survival probability",
x = "Time (days)",
) +
scale_fill_discrete(name = "Group", labels = c("Not in JM", "JM", "mastodon.social")) +
scale_color_discrete(name = "Group", labels = c("Not in JM", "JM", "mastodon.social")) +
theme_bw_small_labels() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="bottom")

209
code/tags.py Normal file
View File

@@ -0,0 +1,209 @@
import polars as pl
from scipy.sparse import dok_matrix
import numpy as np
from sklearn.cluster import AffinityPropagation
import scipy.sparse.csgraph
from scipy.sparse import csr_matrix
from sklearn.cluster import SpectralClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
def pairwise_sets(input_list_str):
input_list = input_list_str.split(',')
return [','.join(sorted([input_list[i], input_list[j]])) for i in range(len(input_list)) for j in range(i+1, len(input_list))]
tags = pl.read_ipc("data/tags_filt.feather").filter(pl.col("sensitive") == False)#.filter(pl.col("language") == "en")
CONSENSUS_ACCOUNTS = 250
common_tags = tags.explode("tags").with_columns(
pl.col("tags").str.to_lowercase()
).unique(
["host", "acct", "tags"]
).group_by("tags").count().filter(
pl.col("count") >= CONSENSUS_ACCOUNTS
).sort("count", descending=True)
filt_tags = tags.explode("tags").with_columns(
pl.col("tags").str.to_lowercase()
).filter(
pl.col("tags").is_in(set(common_tags["tags"]))
).sort(["host", "acct", "id", "tags"])
filt_posts = filt_tags.filter(pl.col("sensitive").not_()).group_by(["host", "id", "acct", "created_at"]).agg([
pl.col("tags")
]).filter(
pl.col("tags").list.len() >= 2
).filter(
pl.col("tags").list.len() <= 6
).with_columns(pl.col("tags").list.join(',').alias("tags")).unique(["host", "acct", "tags"])
pairs_posts = filt_posts.with_columns(
pl.col("tags").map_elements(pairwise_sets).alias("pairs")
).explode("pairs").select(
pl.col(["host", "acct", "pairs"])
).unique()
# TODO: ^
pairs = pairs_posts.group_by("pairs").count().sort("count", descending=True).with_columns(
pl.col("pairs").map_elements(lambda x: x.split(",")).alias("pairs")
)
account_paired_tag_counts = pairs_posts.with_columns( # was filt_posts
pl.col("pairs").alias("tags")
).with_columns(
pl.col("tags").str.split(',')
).explode("tags").group_by("tags").agg(
pl.col("host").len().alias("count")
).sort("count", descending=True)
total_posts = len(pairs_posts)
df = pairs.with_columns(
pl.col("pairs").list.get(0).alias("first"),
pl.col("pairs").list.get(1).alias("last")
).drop(["pairs"]).join(
account_paired_tag_counts.rename({"tags":"first","count":"first_count"}),on="first",how="inner"
).join(
account_paired_tag_counts.rename({"tags":"last","count":"last_count"}),on="last",how="inner"
)
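# Pointwise mutual information for each tag pair: pmi = log2(p_joint / (p1 * p2)), with probabilities estimated from the account-level pair counts above.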
p = df.with_columns(
(pl.col("first_count") / total_posts).alias("p1"),
(pl.col("last_count") / total_posts).alias("p2"),
(pl.col("count") / total_posts).alias("p_joint"),
).with_columns(
(pl.col("p_joint")/(pl.col("p1")*pl.col("p2"))).log(base=2).alias("pmi")
).sort("pmi")
common_pairs = p.filter(
pl.col("count") >= 1
).sort("pmi", descending=True).filter(
pl.col("pmi") > 0
).filter(
pl.col("p_joint")/pl.col("p1") >= 0.01
).filter(
pl.col("p_joint")/pl.col("p2") >= 0.01
)
most_common_tags = sorted(list(set(common_pairs["first"]).union(common_pairs["last"])))
string_to_index = {string: index for index, string in enumerate(most_common_tags)}
mapped_indices = [string_to_index[string] for string in most_common_tags]
S = csr_matrix((len(most_common_tags), len(most_common_tags)), dtype=np.float64)
#np.zeros((len(most_common_tags), len(most_common_tags)), dtype=np.float64) #* -100.0
for r in common_pairs.with_columns(
pl.col("first").replace(string_to_index).cast(pl.UInt64),#.map_elements(lambda x: string_to_index[x]).alias("first"),
pl.col("last").replace(string_to_index).cast(pl.UInt64)#.map_elements(lambda x: string_to_index[x]).alias("last"),
).iter_rows(named=True):
S[r["first"], r["last"]] = r["pmi"]
S[r["last"], r["first"]] = r["pmi"]
n_components, comp_labels = scipy.sparse.csgraph.connected_components(S, directed=False)
comp_S = S[:, comp_labels == 1][comp_labels == 1, :]
###
#clustering = AffinityPropagation(affinity='precomputed', damping = 0.95, max_iter = 1000).fit_predict(S)
#cdata = pl.DataFrame({
# "tag": most_common_tags,
# "cluster": clustering
#}).sort("cluster")
#cdata.group_by("cluster").agg([pl.col("tag").len().alias("count")]).sort("count", descending=True)
#cdata.filter(pl.col("cluster") == 17)
###
#spec = SpectralClustering(n_clusters=500, affinity='precomputed', assign_labels='discretize')
#labels = spec.fit_predict(comp_S)
af_clust = AffinityPropagation(affinity='precomputed', damping = 0.95, max_iter = 1000).fit(comp_S.toarray())
clusters = pl.DataFrame({
"tag": np.array(most_common_tags)[comp_labels == 1],
"cluster": af_clust.labels_
}).sort("cluster")
clusters.write_ipc("data/scratch/tag_clusters.feather")
t = tags.explode("tags").rename({"tags":"tag"}).join(clusters, on="tag", how="inner").unique(["host", "id", "cluster"])
host_totals = t.unique(["host", "tag"]).group_by("host").count().rename({"count":"tag_count"}).join(
t.group_by(["host"]).count().rename({"count":"term_count"}),
on="host",
how="inner"
)
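# Cluster-level TF-IDF per host (hosts with >= 100 distinct tags): tf = cluster count / total tag count on the host; idf = log(total hosts / hosts that use the cluster).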
tf = t.group_by(["host", "cluster"]).count().join(host_totals, on="host", how="inner").filter(
pl.col("tag_count") >= 100
).with_columns(
(pl.col("count")/pl.col("term_count")).alias("tf")
)
clust_host_occurs = t.filter(
pl.col("host").is_in(set(tf["host"]))
).unique(["host", "cluster"]).group_by("cluster").count().sort("count")
idf = clust_host_occurs.with_columns((len(t.unique("host")) / pl.col("count")).log().alias("idf")).rename({"count":"doc_count"})
tf_idf = tf.join(idf, on=["cluster"], how="inner").with_columns(
(pl.col("tf")*pl.col("idf")).alias("tfidf")
).sort("tfidf", descending=True)
host_names = list(tf_idf.select("host").unique().sort("host")["host"])
string_to_index_host = {string: index for index, string in enumerate(host_names)}
#mapped_indices_host = [string_to_index_host[string] for string in host_names]
host_cluster_matrix = np.zeros((len(host_names), clusters["cluster"].max() + 1), dtype=np.float64)
for row in tf_idf.iter_rows(named=True):
host_cluster_matrix[string_to_index_host[row["host"]], row["cluster"]] = row["tfidf"]
#from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
host_similarity = cosine_similarity(host_cluster_matrix)
def most_similar(server):
string_to_index_host = {string: index for index, string in enumerate(host_names)}
return pl.DataFrame({
"server": host_names,
"sim": host_similarity[string_to_index_host[server]]
}).sort("sim", descending=True)#[0:10]
# Viz
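# Project the host similarity matrix to 2D with PCA and t-SNE, and cluster hosts with affinity propagation for plotting.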
X_reduced = PCA(n_components=2).fit_transform(host_similarity)
X_embedded = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3).fit_transform(host_similarity)
af = AffinityPropagation(affinity='precomputed').fit(host_similarity)
pl.DataFrame({
"server": host_names,
"cluster": af.labels_,
"x": X_reduced[:,0],
"y": X_reduced[:,1]
}).write_ipc("data/scratch/host_pca.feather")
pl.DataFrame({
"server": host_names,
"cluster": af.labels_,
"x": X_embedded[:,0],
"y": X_embedded[:,1]
}).write_ipc("data/scratch/host_tsne.feather")
pl.DataFrame({
"server": host_names,
"x": X_embedded[:,0],
"y": X_embedded[:,1]
}).write_csv("data/scratch/host_pca.csv")
# Using cosine similarity
"""
dist_matrix = np.around(1 - host_similarity, 10)
dist_matrix = np.where(np.abs(1 - host_similarity) < 0.000000001, 0, 1 - host_similarity)
X_sim = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3, metric='precomputed').fit_transform(dist_matrix)
pl.DataFrame({
"server": host_names,
"x": X_sim[:,0],
"y": X_sim[:,1]
}).write_csv("data/scratch/host_tsne_sim.csv")
pl.DataFrame({
"server": host_names,
"x": X_sim[:,0],
"y": X_sim[:,1]
}).filter(pl.col("server") == "scholar.social")
"""

287
code/topic_model_tags.py Normal file
View File

@@ -0,0 +1,287 @@
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix
from gensim.models import HdpModel
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import scipy
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import LsiModel
CONSENSUS_ACCOUNTS = 100
CONSENSUS_SERVERS = 25
def pairwise_sets(input_list):#_str):
#input_list = input_list_str.split(',')
return [','.join(sorted([input_list[i], input_list[j]])) for i in range(len(input_list)) for j in range(i+1, len(input_list))]
def common_tags_by_account(tags: pl.DataFrame) -> pl.DataFrame:
return tags.explode("tags").with_columns(
pl.col("tags").str.to_lowercase()
).unique(
["host", "acct", "tags"]
).group_by("tags").agg([
pl.col("acct").len().alias("account_count")
]).sort("account_count", descending=True)
def common_tags_by_server(tags: pl.DataFrame) -> pl.DataFrame:
return tags.explode("tags").with_columns(
pl.col("tags").str.to_lowercase()
).unique(
["host", "tags"]
).group_by("tags").agg([
pl.col("host").len().alias("server_count")
]).sort("server_count", descending=True)
def filter_tag_posts_by_tag_count(tags: pl.DataFrame, common_tags: pl.DataFrame, min_tags: int, max_tags: int) -> pl.DataFrame:
return tags.filter(pl.col("tags").list.len() <= max_tags).explode("tags").with_columns(
pl.col("tags").str.to_lowercase()
).filter(
pl.col("tags").is_in(set(common_tags["tags"]))
).sort(["host", "acct", "id", "tags"]).group_by(["host", "id", "acct"]).agg([
pl.col("tags")
]).filter(pl.col("tags").list.len() >= min_tags)#.with_columns(
# pl.col("tags").list.join(',').alias("tags")
#)#.unique(["host", "acct", "tags"])
def document_matrix(term_list: list[str], doc_list: list[list[str]]):
term_index = {tag: i for i, tag in enumerate(sorted(term_list))}
m = lil_matrix((len(doc_list), len(term_list)), dtype=np.int64)
for i, post in enumerate(doc_list):
for tag in post:
m[i, term_index[tag]] += 1
return m, term_index
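# PMI over tag pairs that co-occur within a post: pmi = log2(p(pair) / (p(first) * p(last))), keeping only tags that appear in at least min_count documents.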
def calculate_pmi(d: pl.DataFrame, min_count=10):
doc_freq = d.explode("tags").unique(["host", "id", "tags"]).group_by("tags").count().filter(
pl.col("count") >= min_count
)
return d.with_columns(
pl.col("tags").map_elements(pairwise_sets).alias("pairs")
).explode("pairs").group_by("pairs").count().sort("count", descending=True).with_columns(
pl.col("pairs").str.split(",").list.get(0).alias("first"),
pl.col("pairs").str.split(",").list.get(1).alias("last")
).drop(["pairs"]).join(
doc_freq.rename({"tags":"first","count":"first_count"}),on="first",how="inner"
).join(
doc_freq.rename({"tags":"last","count":"last_count"}),on="last",how="inner"
).with_columns( #pmi
((pl.col("count") / len(d)) / ((pl.col("first_count") / len(d)) * (pl.col("last_count") / len(d)))).log(base=2).alias("pmi")
).sort("pmi", descending=True)
def construct_pmi_matrix(terms_list: list[str], d: pl.DataFrame):
term_index = {tag: i for i, tag in enumerate(terms_list)}
m = lil_matrix((len(terms_list), len(terms_list)), dtype=np.float64)
for row in d.iter_rows(named=True):
m[term_index[row["first"]], term_index[row["last"]]] = row["pmi"]
m[term_index[row["last"]], term_index[row["first"]]] = row["pmi"]
return m, term_index
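# TF-IDF over tag clusters per group (host by default): tf = cluster mentions / total cluster mentions in the group; idf = log(number of groups / number of groups that use the cluster).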
def host_clusters(df, group_col="host"):
t = df.explode("tags").rename({"tags":"tag"}).join(
clusters, on="tag", how="inner"
).unique([group_col, "id", "cluster"])
host_totals = t.unique([group_col, "cluster"]).group_by(group_col).count().rename({"count":"unique_cluster_count"}).join(
t.unique([group_col, "id"]).group_by([group_col]).count().rename({"count":"host_docs"}),
on=group_col,
how="inner"
).join(
t.group_by(group_col).count().rename({"count":"total_cluster_count"}),
on=group_col,
how="inner"
)
tf = t.select(
pl.col([group_col, "id", "cluster"])
).group_by([group_col, "cluster"]).count().join(host_totals, on=group_col, how="inner").filter(
#pl.col("host_docs") >= 100
pl.col("total_cluster_count") >= 0
).with_columns(
(pl.col("count")/pl.col("total_cluster_count")).alias("tf")
)
clust_host_occurs = t.filter(
pl.col(group_col).is_in(set(tf[group_col]))
).unique([group_col, "cluster"]).group_by("cluster").count().sort("count")
idf = clust_host_occurs.with_columns(
(len(t.unique(group_col)) / pl.col("count")).log().alias("idf")).rename({"count":"cluster_doc_count"})
tf_idf = tf.join(idf, on=["cluster"], how="inner").with_columns(
(pl.col("tf")*pl.col("idf")).alias("tfidf")
).sort("tfidf", descending=True)
host_names = sorted(tf_idf.select(group_col).unique().sort(group_col)[group_col])
string_to_index_host = {string: index for index, string in enumerate(host_names)}
#mapped_indices_host = [string_to_index_host[string] for string in host_names]
host_cluster_matrix = np.zeros((len(host_names), clusters["cluster"].max() + 1), dtype=np.float64)
for row in tf_idf.iter_rows(named=True):
host_cluster_matrix[string_to_index_host[row[group_col]], row["cluster"]] = row["tfidf"]
return host_cluster_matrix, tf_idf, host_names, idf
def read_tags_file(file: str, accounts: set[str]) -> pl.DataFrame:
return pl.read_ipc(file).with_columns(
pl.concat_str([pl.col("host"), pl.lit("_"), pl.col("acct")]).alias("account_id")
).filter(
pl.col("account_id").is_in(accounts)
).filter(#pl.col("sensitive") == False).filter(
pl.col("language") == "en").filter(
pl.col("tags").list.len() <= 8
).unique(["host", "id"]).filter(
pl.col("host").is_in(set(metadata["server"]))
).select(pl.col(["host", "acct", "id", "sensitive", "created_at", "tags"])).explode("tags").with_columns(
pl.col("tags").str.to_lowercase()
).with_columns(
pl.when(pl.col("sensitive")).then(pl.col("tags") + "_sensitive").otherwise(pl.col("tags")).alias("tags")
).group_by(["host", "acct", "id", "created_at"]).agg([
pl.col("tags")
])
from code.load_accounts import read_metadata_file
metadata = read_metadata_file("data/metadata-2024-01-31.feather").select(pl.col(["server", "user_count"])).filter(pl.col("user_count") >= 100)
accounts = pl.scan_ipc("data/scratch/all_accounts.feather").select(
pl.col(["server", "acct", "bot", "noindex", "followers_count", "suspended"])
).filter(pl.col("bot") == False).filter(
pl.col("noindex") == False).filter(
pl.col("followers_count") > 1).filter(
pl.col("suspended").fill_null(False) == False
).collect().rename({"server":"host"}).select(pl.col(["host", "acct"])).unique(["host", "acct"]).with_columns(
pl.concat_str([pl.col("host"), pl.lit("_"), pl.col("acct")]).alias("id")
)
#5_789_169
all_tag_posts = pl.concat([
read_tags_file("data/tags_filt.feather", set(accounts["id"].to_list())),
read_tags_file("data/tags-202302-202308.feather", set(accounts["id"].to_list()))
]).unique(["host", "id"])#.filter(pl.col("created_at") >= pl.date(2023, 1, 1)).filter(pl.col("created_at") < pl.date(2023, 8, 1))
all_tag_posts.write_ipc("data/scratch/all_tag_posts.feather")
all_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather").filter(pl.col("created_at") >= pl.date(2023, 2, 1)).filter(pl.col("created_at") < pl.date(2023, 8, 1))
common_tags_account = common_tags_by_account(all_tag_posts)
common_tags_server = common_tags_by_server(all_tag_posts)
common_tags = common_tags_account.join(
common_tags_server, on="tags", how="inner"
).filter(pl.col("server_count") >= CONSENSUS_SERVERS).filter(pl.col("account_count") >= CONSENSUS_ACCOUNTS)
common_tags.write_ipc("data/scratch/common_tags.feather")
all_tag_posts_filtered = filter_tag_posts_by_tag_count(all_tag_posts, common_tags, 2, 5)
all_tag_posts = None
pmi = calculate_pmi(all_tag_posts_filtered, min_count=1).filter(pl.col("pmi") > 0)#.filter(pl.col("count") >= 0).filter(pl.col("first_count") >= 5).filter(pl.col("last_count") >= 5)
pmi_terms = sorted(set(pmi["first"]).union(set(pmi["last"])))
pmi_matrix, term_index = construct_pmi_matrix(pmi_terms, pmi)
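# Cluster tags with affinity propagation, using the dense pairwise PMI matrix as a precomputed affinity.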
af_clust = AffinityPropagation(affinity='precomputed', damping = 0.85, max_iter = 1000).fit(pmi_matrix.toarray())
#n_components, comp_labels = scipy.sparse.csgraph.connected_components(pmi_matrix, directed=False)
clusters = pl.DataFrame({
"tag": pmi_terms,
"cluster": af_clust.labels_
}).sort("cluster")
#clusters = clusters.with_row_index().with_columns(pl.col("index").alias("cluster")).drop("index")
host_cluster_matrix, tf_idf, host_names, idf = host_clusters(all_tag_posts_filtered, group_col="host")
host_similarity = cosine_similarity(host_cluster_matrix)
def most_similar_servers(server: str):
string_to_index_host = {string: index for index, string in enumerate(host_names)}
return pl.DataFrame({
"server": host_names,
"similarity": host_similarity[string_to_index_host[server]]
}).sort("similarity", descending=True)
def build_similarity_df():
keys = []
values = []
for i in range(len(host_names)):
for j in range(i+1, len(host_names)):
if host_similarity[i, j] > 0.0:
keys.append(','.join(sorted([host_names[i], host_names[j]])))
values.append(host_similarity[i, j])
df = pl.DataFrame({
"servers": keys,
"similarity": values
}).sort("similarity", descending=True)
return df.with_row_count().with_columns(
pl.col("servers").str.split(',').alias("server"),
# id column from row number
).explode("server").join(
metadata,
on="server",
how="inner"
).group_by(["row_nr", "similarity"]).agg([
pl.col("server")
]).filter(
pl.col("server").list.len() == 2
)
server_similarities = build_similarity_df()
server_similarities.filter(
pl.col("similarity") > 0.1
).with_columns(
pl.col("server").list.get(0).alias("Source"),
pl.col("server").list.get(1).alias("Target"),
).rename({"similarity":"Weight"}).select(pl.col(["Source", "Target", "Weight"])).write_csv("data/scratch/similar_servers.csv")
clusters.write_ipc("data/scratch/tag_clusters.feather")
tf_idf.write_ipc("data/scratch/tag_tfidf.feather")
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
p = pca.fit_transform(host_cluster_matrix)
from sklearn.decomposition import NMF
model = NMF(n_components=32, init='random', random_state=0)
H = model.fit_transform(host_cluster_matrix)
W = model.components_
###
#from sklearn.cluster import AgglomerativeClustering
#server_clust_ap = AffinityPropagation(affinity='precomputed', damping = 0.95, max_iter = 1000).fit(host_similarity)
#server_clusters = pl.DataFrame({
# "server": host_names,
# "cluster": server_clust_ap.labels_
#}).sort("cluster")
#server_clusters.filter(pl.col("cluster") == server_clusters.filter(pl.col("server") == "econtwitter.net")["cluster"][0])
#
###
"""
dist_matrix = np.around(1 - host_similarity, 10)
ac = AgglomerativeClustering(n_clusters=1500, metric='precomputed', linkage='average').fit(dist_matrix)
np.array(host_names)[np.where(ac.labels_ == ac.labels_[host_names.index("hci.social")])]
np.array(host_names)[np.where(ac.labels_ == ac.labels_[host_names.index("mastodon.social")])]
###
# Can we see what servers are most similar to a users' history?
x = pl.scan_ipc("data/tags-2020-2022.feather").filter(
pl.col("host") == "mastodon.gamedev.place"
).filter(pl.col("acct") == "raccoonformality").collect().explode("tags").with_columns(pl.col("tags").str.to_lowercase()).rename({"tags":"tag"}).join(
clusters, on="tag", how="inner"
)
z = x.group_by(["cluster"]).count().sort("count", descending=True).with_columns(
(pl.col("count") / len(x)).alias("tf")
).join(idf, on="cluster", how="inner").with_columns(
(pl.col("tf") * pl.col("idf")).alias("tfidf")
).sort("tfidf", descending=True)
account_matrix = np.zeros((1, clusters["cluster"].max() + 1), dtype=np.float64)
for row in z.iter_rows(named=True):
account_matrix[0, row["cluster"]] = row["tfidf"]
acc_similarity = pl.DataFrame({
"host": host_names,
"similarity": cosine_similarity(account_matrix, host_cluster_matrix)[0]
}).sort("similarity", descending=True)
###
#cluster_names = clusters.join(common_tags.rename({"tags":"tag"}), on="tag", how="inner").sort("count", descending=True).unique("cluster").select(pl.col(["tag", "cluster"])).sort("cluster")
"""

Binary file not shown (image added, 1.6 MiB)

Binary file not shown (image added, 195 KiB)

682
index.qmd
View File

@@ -1,682 +0,0 @@
---
title: Best Practices for Onboarding on the Fediverse
short-title: Onboarding Fediverse
authors:
- name: Carl Colglazier
affiliation:
name: Northwestern University
city: Evanston
state: Illinois
country: United States
#roles: writing
corresponding: true
bibliography: references.bib
acm-metadata:
final: false
copyright-year: 2024
acm-year: 2024
copyright: rightsretained
doi: XXXXXXX.XXXXXXX
conference-acronym: "PACMHCI"
#conference-name: |
# Make sure to enter the correct
# conference title from your rights confirmation email
#conference-date: June 03--05, 2018
#conference-location: Woodstock, NY
#price: "15.00"
#isbn: 978-1-4503-XXXX-X/18/06
format:
acm-pdf:
keep-tex: true
documentclass: acmart
classoption: [acmsmall,manuscript,screen,authorversion,nonacm,timestamp]
abstract: |
When trying to join the Fediverse, a decentralized collection of interoperable social networking websites, new users face the dilemma of choosing a home server. Using trace data from thousands of new Fediverse accounts, we show that this choice matters and significantly affects the probability that an account remains active in the future. We then use insights from this relationship to build a tool that can help new Fediverse users find a server with a high probability of being a good match based on their interests.
execute:
echo: false
error: false
freeze: auto
fig-width: 6.75
---
```{r}
#| label: r-setup
#| output: false
#| error: false
#| warning: false
library(reticulate)
library(tidyverse)
library(arrow)
library(statnet)
library(network)
library(survival)
library(ggsurvfit)
library(modelsummary)
library(randomForestSRC)
library(grid)
library(scales)
options(arrow.skip_nul = TRUE)
```
We first explore the extent to which server choice matters. We find that accounts that join smaller, more interest-based servers are more likely to continue posting six months after their creation.
Using these findings, we then propose a tool that can help users find servers that match their interests.
# Background
## Newcomers in Online Communities
Onboarding newcomers is important for online communities. Any community can expect a certain amount of turnover, so the ability to bring in new members is essential to its long-term health and longevity.
RQ: What server attributes correspond with better newcomer retention?
## Migrations in Online Communities
All online communities and accounts trend toward death.
Online fandom communities, for instance...
On Reddit, @newellUserMigrationOnline found that the news aggregator had an advantage over potential competitors because of its catalogue of niche communities: people who migrated to alternative platforms tended to post most often, proportionally, in popular communities.
+ Fiesler on online fandom communities [@fieslerMovingLandsOnline2020]
+ TeBlunthuis on competition and mutualism [@teblunthuisIdentifyingCompetitionMutualism2022]
+ Work on "alt-tech" communities.
# Empirical Setting
The Fediverse is a set of decentralized online social networks which interoperate using shared protocols like ActivityPub.
Mastodon is a software program used by many Fediverse servers and offers a user experience similar to the Tweetdeck client for Twitter. It was first created in late 2016.
Discovery has been challenging on Mastodon. The developers and user base tend to be skeptical of algorithmic intrusions, instead opting for timelines which only show posts in reverse chronological order. Search is also difficult. Public hashtags are searchable, but most servers have traditionally not supported searching keywords or simple strings. Accounts can only be searched using their full `username@server` form.
Mastodon features a "local" timeline which shows all public posts from accounts that share the same home server. On larger servers, this timeline can be unwieldy; however, on smaller servers, it presents an opportunity to discover new posts and users of potential interest.
Mastodon offers its users a high level of data portability. Users can move their accounts across instances while retaining their follows (their post data, however, does not move to the new account). The choice of an initial instance is consequently not irreversible.
# Data
```{python}
#| label: py-preprocess-data
#| cache: true
#| output: false
from code.load_accounts import *
from urllib.parse import urlparse
#accounts = pl.concat(
# read_accounts_file("data/accounts.feather"),
# read_accounts_file("data/account_lookup_2023.feather")
#)
accounts = read_accounts_file(
"data/account_lookup_compressed.feather"
).unique(["account", "server"])
# Write a parsed accounts file for R to use
a = accounts.with_columns(
pl.col("url").map_elements(
lambda x: urlparse(x).netloc.encode().decode('idna')
).alias("host"),
pl.col("data_string").str.contains("""\"moved\": \{""").alias("has_moved"),
pl.col("data").struct.field("suspended"),
)
a_save = a.drop(["data", "data_string"])
a_save.select(
sorted(a_save.columns)
).write_ipc("data/scratch/accounts.feather")
moved_accounts = a.filter(pl.col("has_moved")).with_columns(# Do this again now we know the rows are all moved accounts
pl.col("data_string").str.json_decode().alias("data")
).with_columns(
pl.col("data").struct.field("moved")
).drop_nulls("moved").with_columns(
pl.col("moved").struct.field("acct").alias("moved_acct"),
).with_columns(
pl.when(
pl.col("moved_acct").str.contains('@')
).then(
pl.col("moved_acct").str.split('@').list.get(1)
).otherwise(
pl.col("server")
).alias("moved_server")
)
number_of_accounts = len(a)
popular_servers = a.group_by("server").count().sort("count", descending=True)
common_moves = moved_accounts.group_by(
["server", "moved_server"]
).count().sort("count", descending=True)
common_moves.write_ipc("data/scratch/moved_accounts.feather")
common_moves.rename({
"server": "Source",
"moved_server": "Target",
}).write_csv("data/scratch/moved_accounts.csv")
maccounts = moved_accounts.select(["account", "server", "moved_server"])
popular_servers.write_ipc("data/scratch/popular_servers.feather")
jm = pl.read_json("data/joinmastodon.json")
jm.write_ipc("data/scratch/joinmastodon.feather")
read_metadata_file("data/metadata_2023-10-01.feather").drop(
["data", "data_string"]
).write_ipc("data/scratch/metadata.feather")
```
```{python}
#| label: py-preprocess-data2
#| cache: true
#| output: false
from code.load_accounts import read_accounts_file
from urllib.parse import urlparse
import polars as pl
profile_accounts = read_accounts_file("data/profiles_local.feather")
p = profile_accounts.with_columns(
pl.col("url").map_elements(lambda x: urlparse(x).netloc.encode().decode('idna')).alias("host"),
pl.col("username").alias("account"),
pl.lit(False).alias("has_moved"),
pl.lit(False).alias("suspended")
).drop(
["data", "data_string"]
)
p.select(sorted(p.columns)).write_ipc("data/scratch/accounts_processed_profiles.feather")
all_accounts = pl.scan_ipc(
[
"data/scratch/accounts.feather",
#"data/scratch/accounts_processed_recent.feather",
"data/scratch/accounts_processed_profiles.feather"
]).collect()
all_accounts.filter(pl.col("host").eq(pl.col("server"))).unique(["account", "server"]).write_ipc("data/scratch/all_accounts.feather")
```
```{r}
#| eval: false
arrow::read_feather(
"data/scratch/accounts_processed_profiles.feather",
col_select = c(
"server", "username", "created_at",
"last_status_at", "statuses_count",
"has_moved", "bot", "suspended"
)) %>%
mutate(suspended = replace_na(suspended, FALSE)) %>%
filter(!bot) %>%
# TODO: what's going on here?
filter(!is.na(last_status_at)) %>%
# sanity check
filter(created_at >= "2022-01-01") %>%
filter(created_at < "2024-03-01") %>%
# We don't want accounts that were created
# and then immediately stopped being active
filter(statuses_count > 1) %>%
filter(!suspended) %>%
filter(!has_moved) %>%
filter(server == "mastodon.social") %>%
#filter(last_status_at >= created_at) %>%
mutate(created_month = format(created_at, "%Y-%m")) %>%
group_by(created_month) %>%
summarize(count=n()) %>%
distinct(created_month, count) %>%
ggplot(aes(x=created_month, y=count)) +
geom_bar(stat="identity", fill="black") +
labs(y="Count", x="Created Month") +
theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
```
```{r}
#| label: fig-account-timeline
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the annoucement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day where Twitter suspended a number of prominent journalist, and a day when Twitter experienced an outage and started rate limiting accounts."
#| fig-height: 3
#| fig-width: 6.75
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
accounts_unfilt <- arrow::read_feather(
"data/scratch/all_accounts.feather",
col_select=c(
"server", "username", "created_at", "last_status_at",
"statuses_count", "has_moved", "bot", "suspended",
"following_count", "followers_count"
))
accounts <- accounts_unfilt %>%
filter(!bot) %>%
# TODO: what's going on here?
filter(!is.na(last_status_at)) %>%
mutate(suspended = replace_na(suspended, FALSE)) %>%
# sanity check
filter(created_at >= "2020-10-01") %>%
filter(created_at < "2024-01-01") %>%
# We don't want accounts that were created and then immediately stopped being active
filter(statuses_count >= 1) %>%
filter(last_status_at >= created_at) %>%
mutate(active = last_status_at >= "2024-01-01") %>%
mutate(last_status_at = ifelse(active, lubridate::ymd_hms("2024-01-01 00:00:00", tz = "UTC"), last_status_at)) %>%
mutate(active_time = difftime(last_status_at, created_at, units="days")) #%>%
#filter(!has_moved)
acc_data <- accounts %>%
#filter(!has_moved) %>%
mutate(created_month = format(created_at, "%Y-%m")) %>%
mutate(created_week = floor_date(created_at, unit = "week")) %>%
mutate(active_now = active) %>%
mutate(active = active_time >= 45) %>%
mutate("Is mastodon.social" = server == "mastodon.social") %>%
mutate(jm = server %in% jm$domain) %>%
group_by(created_week) %>%
summarize(
`JoinMastodon Server` = sum(jm) / n(),
`Is mastodon.social` = sum(`Is mastodon.social`)/n(),
Suspended = sum(suspended)/n(),
Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
Moved=sum(has_moved)/n(),
count=n()) %>%
pivot_longer(cols=c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"
theme_bw_small_labels <- function(base_size = 9) {
theme_bw(base_size = base_size) %+replace%
theme(
plot.title = element_text(size = base_size * 0.8),
plot.subtitle = element_text(size = base_size * 0.75),
plot.caption = element_text(size = base_size * 0.7),
axis.title = element_text(size = base_size * 0.9),
axis.text = element_text(size = base_size * 0.8),
legend.title = element_text(size = base_size * 0.9),
legend.text = element_text(size = base_size * 0.8)
)
}
p1 <- acc_data %>%
ggplot(aes(x=as.Date(created_week), group=1)) +
geom_line(aes(y=value, group=Measure, color=Measure)) +
geom_point(aes(y=value, color=Measure), size=0.7) +
scale_y_continuous(limits = c(0, 1.0)) +
labs(y="Proportion") + scale_x_date(labels=date_format("%Y-%U"), breaks = "4 week") +
theme_bw_small_labels() +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
p2 <- acc_data %>%
distinct(created_week, count) %>%
ggplot(aes(x=as.Date(created_week), y=count)) +
geom_bar(stat="identity", fill="black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-10-27"))),
linetype="dashed", color = "black") +
#geom_text(
# aes(x=as.Date("2022-10-27"),
# y=max(count),
# label=" Elon Musk Twitter Acquisition Completed"),
# vjust=-1, hjust=0, color="black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-04-14"))),
linetype="dashed", color = "black") +
# https://twitter.com/elonmusk/status/1675187969420828672
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-12-15"))),
linetype="dashed", color = "black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2023-07-01"))),
linetype="dashed", color = "black") +
#scale_y_continuous(limits = c(0, max(acc_data$count) + 100000)) +
scale_y_continuous(labels = scales::comma) +
labs(y="Count", x="Created Week") +
theme_bw_small_labels() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_x_date(labels=date_format("%Y-%U"), breaks = "4 week")
#grid.draw(rbind(ggplotGrob(p1), ggplotGrob(p2), size = "last"))
library(patchwork)
p1 + p2 + plot_layout(ncol = 1)
```
**Mastodon Profiles**: We identified accounts from previously collected posts on public Mastodon timelines from October 2020 to January 2024. We then queried for up-to-date information on those accounts, including their most recent status and whether the account had moved. This gave us a total of `r nrow(accounts)` accounts.
**Moved Profiles**: We found a subset of `r accounts %>% filter(has_moved) %>% nrow` accounts which had moved from one server to another.
# Results
## Activity By Server Size
```{r}
#| label: fig-active-accounts
#| eval: false
#library(betareg)
library(lme4)
activity <- arrow::read_feather(
"data/scratch/activity.feather",
col_select = c("server", "logins")
) %>%
arrange(desc(logins)) %>%
mutate(server_count = logins)
account_data <- inner_join(accounts, activity, by="server") %>%
mutate(active = active_time >= 45)
a_data <- account_data %>%
#mutate(active = active_time >= 45) %>%
group_by(server) %>%
summarize(active_prop = sum(active)/n(), active_count = sum(active), count=n()) %>%
inner_join(., activity, by="server")
a_model <- glmer(active ~ log1p(logins) + (1|server), data=account_data, family=binomial)
#betareg(active_prop ~ log10(count), data = a_data)
logins_seq <- seq(min(log1p(account_data$logins)), max(log1p(account_data$logins)), length.out = 100)
a_pred <- predict(
a_model,
newdata = data.frame(logins = logins_seq, server = factor(1)),
type = "response",
re.form = NA)
pred_data <- data.frame(logins = logins_seq, active_prop = a_pred)
a_data %>%
mutate(logins = log1p(logins)) %>%
ggplot(aes(y=active_prop, x=logins)) +
geom_point(alpha=0.1) +
# help here
#geom_line(aes(y = a_pred)) +
geom_line(data = pred_data, aes(x = logins, y = active_prop), color = "red") + # Use pred_data for line
labs(
y = "Active after 45 Days",
x = "Accounts"
) +
scale_x_continuous(labels = scales::comma) +
#scale_y_log10() +
theme_bw_small_labels()
```
```{r}
#| eval: false
library(fable)
#library(fable.binary)
library(tsibble)
library(lubridate)
ad_time <- account_data |>
mutate(created_at = yearweek(created_at)) |>
group_by(server, created_at) |>
summarize(count = n(), active = sum(active)) |>
as_tsibble(key="server", index=created_at)
```
```{r}
#| eval: false
fit <- ad_time |>
model(
logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
)
```
```{r}
#| eval: false
ad_time |>
filter(server == "mastodon.social") |>
sample_n(100) |>
autoplot(active)
```
```{r}
#| label: fig-account-activity-prop
#| fig-cap: "Account Activity Over Time"
#| fig-height: 4
#| eval: false
study_period <- 45
last_day <- "2024-01-15"
#formerly accounts_processed_recent
#server_counts <- arrow::read_feather(
# "data/scratch/accounts.feather",
# col_select=c("server", "username", "created_at", "bot")
# ) %>%
# filter(created_at <= "2023-03-01") %>%
# filter(!bot) %>%
# group_by(server) %>%
# summarize(server_count = n()) %>%
# arrange(desc(server_count)) %>%
# mutate(server_count_bin = floor(log10(server_count)))
metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
arrange(desc(user_count)) %>%
mutate(server_count = user_count) %>%
mutate(server_count_bin = floor(log10(server_count))) %>%
mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))
activity <- arrow::read_feather(
"data/scratch/activity.feather",
col_select = c("server", "logins")
) %>%
arrange(desc(logins)) %>%
mutate(server_count = logins) %>%
mutate(server_count_bin = floor(log10(server_count))) %>%
# Merge 4 and 5
#mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
# Merge below 2
#mutate(server_count_bin = ifelse((server_count_bin <= 2) & (server_count_bin >= 1), 2, server_count_bin)) %>%
mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
a <- accounts %>%
filter(!has_moved) %>%
#filter(created_at >= "2023-06-01") %>%
#filter(created_at < "2023-08-01") %>%
filter(created_at >= "2023-10-15") %>%
filter(created_at < "2023-12-01") %>%
inner_join(activity, by="server") %>%
filter(created_at < last_status_at) %>%
#mutate(large_server = server_count > 1000) %>%
mutate(active_time = as.integer(active_time)) %>%
mutate(active_time_weeks = active_time) %>%
mutate(status = ifelse(active, 0, 1)) %>%
mutate(jm = server %in% jm$domain) #%>% filter(server_count > 0)
survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>% # strata(server_count_bin)
ggsurvfit() +
add_confidence_interval() +
scale_y_continuous(limits = c(0, 1)) +
labs(
y = "Overall survival probability",
x = "Time (days)",
) +
#scale_x_continuous(
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
#) +
theme_bw_small_labels() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
```{r}
a %>% filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
mutate(is_general = category=="general") %>%
mutate(is_en = language == "en") %>%
mutate(is_large = last_week_users >= 585) %>% #filter(following_count < 10) %>%
survfit2(Surv(active_time_weeks, status) ~ is_general + is_large, data = .) %>% # strata(server_count_bin)
ggsurvfit(linetype_aes=TRUE, type = "survival") +
add_confidence_interval() +
scale_y_continuous(limits = c(0, 1)) +
labs(
y = "Overall survival probability",
x = "Time (days)",
) +
#facet_wrap(~strata, nrow = 3) +
#scale_x_continuous(
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
#) +
add_censor_mark() +
theme_bw_small_labels() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
```{r}
#| eval: false
library(coxme)
sel_a <- a %>%
filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
#mutate(is_general = category=="general") %>%
rowwise() %>%
mutate(is_regional = "regional" %in% categories) %>%
mutate(is_general = "general" %in% categories) %>%
mutate(is_neither = !(is_regional | is_general)) %>%
mutate(is_en = language == "en") %>%
rowwise() %>%
mutate(n_categories = length(categories) - is_regional - is_general) %>%
mutate(many_categories = n_categories > 0) %>%
mutate(is_large = last_week_users >= 585) %>%
mutate(follows_someone = followers_count > 0) %>% filter(server_count > 0) %>%
ungroup
#cx <- coxme(Surv(active_time_weeks, status) ~ is_large + is_general + approval_required + (1|server), data = sel_a, x=TRUE)
cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general*is_regional + is_general:log1p(server_count), data = sel_a, x=TRUE)
coxme(Surv(active_time_weeks, status) ~ is_neither + is_general:log1p(server_count) + (1|server), data = sel_a, x=TRUE)
cx <- coxph(Surv(active_time_weeks, status) ~ is_neither + many_categories + is_general:log10(server_count), data = sel_a, x=TRUE)
cz <- cox.zph(cx)
#plot(cz)
cz
```
```{r}
#| eval: false
options(rf.cores=2, mc.cores=2)
for_data <- sel_a #%>% slice_sample(n=2500)
obj <- rfsrc.fast(Surv(active_time_weeks, status) ~ is_neither + is_general*server_count, data = for_data, ntree=100, forest=TRUE)
#predictions <- predict(obj, newdata = newData)$predicted
#plot(get.tree(obj, 1))
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
plot.subsample(reg.smp.o)
```
## Moved Accounts
```{r}
#| label: fig-moved-accounts
#| fig-height: 4
#| eval: false
moved_accounts <- arrow::read_feather("data/scratch/moved_accounts.feather")
popular_servers <- arrow::read_feather("data/scratch/popular_servers.feather")
server_movement_data <- left_join(
(moved_accounts %>% group_by(server) %>% summarize(out_count = sum(count)) %>% select(server, out_count)),
(moved_accounts %>% group_by(moved_server) %>% summarize(in_count = sum(count)) %>% select(moved_server, in_count) %>% rename(server=moved_server)),
by="server"
) %>% replace_na(list(out_count = 0, in_count = 0)) %>%
mutate(diff = in_count - out_count) %>%
arrange(diff) %>%
left_join(., popular_servers, by="server") %>%
rename(user_count = count) %>% arrange(desc(user_count))
server_movement_data %>%
ggplot(aes(x=user_count, y=diff)) +
geom_point() + scale_x_log10() + theme_bw_small_labels()
```
If there were no relationship, we would expect these jumps to be random with respect to server size.
```{r}
popular_servers <-
arrow::read_feather("data/scratch/popular_servers.feather")
moved_accounts <-
arrow::read_feather("data/scratch/moved_accounts.feather") %>%
# Remove loops
filter(server != moved_server)
activity <-
arrow::read_feather("data/scratch/activity.feather",
col_select = c("server", "logins")) %>%
arrange(desc(logins))
popular_and_large_servers <-
popular_servers %>% filter(count >= 1) %>%
mutate(count = log10(count))
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
ma <- moved_accounts %>%
filter(server %in% popular_and_large_servers$server) %>%
filter(moved_server %in% popular_and_large_servers$server)
# Construct network
edgeNet <- network(ma, matrix.type = "edgelist")
edgeNet %v% "user_count" <-
left_join((as_tibble(edgeNet %v% 'vertex.names') %>% rename(server = value)),
popular_and_large_servers,
by = "server") %>%
select(count) %>%
unlist()
edgeNet %v% "in_jm" <-
as_tibble(edgeNet %v% 'vertex.names') %>%
mutate(in_jm = value %in% jm$domain) %>%
select(in_jm) %>% unlist()
```
We construct an exponential family random graph model (ERGM) where nodes represent servers and weighted directed edges represent the number of accounts that moved between servers.
$$
\begin{aligned}
\text{Sum}_{i,j} = \ & \beta_0 + \beta_1 \left(\log_{10}(\text{user count}_j) - \log_{10}(\text{user count}_i)\right) + \\
& \beta_2 \left(\log_{10}(\text{user count}_i) + \log_{10}(\text{user count}_j)\right) + \\
& \beta_3\,\mathbf{1}[\text{both servers listed on JoinMastodon}] + \\
& \beta_4\,\mathbf{1}[\text{neither server listed on JoinMastodon}]
\end{aligned}
$$
```{r}
#| label: ergm-model
#| cache: true
m1 <-
ergm(
edgeNet ~ sum +
diff("user_count", pow = 1, form = "sum") +
nodecov("user_count", form = "sum") +
nodematch("in_jm", diff = TRUE, form = "sum"),
response = "count",
reference = ~ Binomial(3),
control=control.ergm(parallel=4, parallel.type="PSOCK")
)
save(m1, file = "data/scratch/ergm-model.rda")
```
```{r}
#| label: tag-ergm-result
#| output: asis
ergm_model <- load("data/scratch/ergm-model.rda")
modelsummary(
m1,
escape = FALSE,
coef_rename = c(
"sum" = "$\\beta_0$ Intercept",
"diff.sum.t-h.user_count" = "$\\beta_1$ User Count Difference",
"nodecov.sum.user_count" = "$\\beta_2$ User Count (Node Covariate)",
"nodematch.sum.in_jm.TRUE" = "$\\beta_3$ In JoinMastodon (Both True)",
"nodematch.sum.in_jm.FALSE" = "$\\beta_4$ In JoinMastodon (Both False)"
),
)
```
We find a strong preference for accounts to move from large servers to smaller servers.
```{python}
#| eval: false
#| include: false
import random
def simulate_account_moves(origin: str, servers: dict, n: int):
server_list = list(set(servers.keys()) - {origin})
weights = [servers[x] for x in server_list]
return pl.DataFrame({
"simulation": list(range(n)),
"server": [origin] * n,
"moved_server": random.choices(server_list, weights=weights, k=n)
})
simulations = pl.concat([simulate_account_moves(row["server"], {x["server"]: x["count"] for x in popular_servers.iter_rows(named=True)}, 1000) for row in maccounts.iter_rows(named=True)])
m_counts = maccounts.join(popular_servers, how="inner", on="server").rename({"count": "origin_count"}).join(popular_servers.rename({"server": "moved_server"}), how="inner", on="moved_server").rename({"count": "target_count"})
sim_counts = simulations.join(popular_servers, how="inner", on="server").rename({"count": "origin_count"}).join(popular_servers.rename({"server": "moved_server"}), how="inner", on="moved_server").rename({"count": "target_count"})
```
## Tag Clusters
We found _number_ posts which contained between two and five tags.
# References {#references}

218
notebooks/_moved.qmd Normal file
View File

@@ -0,0 +1,218 @@
---
title: "Moved Accounts"
---
```{r}
library(tidyverse)
library(jsonlite)
library(arrow)
library(statnet)
library(modelsummary)
library(here)
options(arrow.skip_nul = TRUE)
jm <- as_tibble(fromJSON(here("data/joinmastodon.json")))
moved_accounts <- arrow::read_feather(here("data/scratch/individual_moved_accounts.feather")) %>%
filter(server != "mastodon.nfriedly.com") %>%
filter(server != "vivaldi.social")
source(here("code/helpers.R"))
accounts <- load_accounts(filt = FALSE)
jm_move_counts <- inner_join(moved_accounts, accounts, by=c("moved_server"="server", "moved_acct"="username")) %>%
filter(created_at >= as.Date("2023-06-01")) %>%
filter(server %in% jm$domain) %>% filter(moved_server %in% jm$domain) %>%
filter(server != moved_server) %>%
group_by(server, moved_server) %>% summarize(count = n()) %>% arrange(desc(count))
is_mastodon <- arrow::read_feather(here("data/nodeinfo-2024-01-31.feather")) %>%
filter(str_detect(data_string, '"name": "mastodon"')) %>% select(server)
activity <-
arrow::read_feather(here("data/scratch/activity.feather"),
col_select = c("server", "logins")) %>%
arrange(desc(logins))
metadata <- arrow::read_feather(here("data/scratch/metadata.feather")) %>%
drop_na() %>%
filter(server %in% is_mastodon$server) %>%
unnest(languages) %>%
distinct(server, .keep_all=T) #%>% filter(user_count >= 10)
```
```{r}
#| execute: false
# library(statnet)
# n = data.frame(
# from = c(2,3,4,4),
# to = c(1,1,1,3),
# count = c(1,2,3,1)
# )
#
# e <- network(n, matrix.type = "edgelist")
# e %v% "size" <- c(1,2,1,100)
#
# m2 <-
# ergm(
# e ~ sum +
# nodeocov("size", form = "sum") +
# diff("size", dir="h-t", pow = 1, form = "sum"),
# response = "count",
# reference = ~ Binomial(3),
# control=control.ergm(parallel=4, parallel.type="PSOCK")
# )
#
# summary(m2)
```
```{r}
build_network <- function(move_counts, metadata, activity_data) {
  edgelist <- move_counts %>%
    filter(server %in% activity_data$server) %>%
    filter(moved_server %in% activity_data$server) %>%
    filter(server %in% metadata$server) %>%
    filter(moved_server %in% metadata$server) %>% #filter(server %in% jm$domain) %>% filter(moved_server %in% jm$domain) %>%
    select(server, moved_server, count) #%>% slice_sample(n=500)
  edgeNet <- network(edgelist, matrix.type = "edgelist", directed=TRUE, loops=FALSE, multiple=FALSE)
  # Align the activity and metadata rows with the network's vertex order
  activity_data <- left_join((as_tibble(edgeNet %v% 'vertex.names') %>% rename(server = value)), activity_data, by="server")
  metadata_data <- left_join((as_tibble(edgeNet %v% 'vertex.names') %>% rename(server = value)), metadata, by="server")
  edgeNet %v% "last_week_users" <- log1p(activity_data$logins)
  edgeNet %v% "accounts" <- log(metadata_data$user_count)
  edgeNet %v% "single_user" <- (metadata_data$user_count == 1)
  edgeNet %v% "dead_server" <- (activity_data$logins == 0)
  edgeNet %v% "jm" <- edgeNet %v% 'vertex.names' %in% jm$domain
  edgeNet %v% "approval_required" <- metadata_data$approval_required
  edgeNet %v% "registrations" <- metadata_data$registrations
  edgeNet %v% "description" <- metadata_data$description
  edgeNet %v% "invites_enabled" <- metadata_data$invites_enabled
  edgeNet %v% "language" <- metadata_data$languages
  return(edgeNet)
}
run_network <- function(network) {
model <-
ergm(
network ~ sum + nonzero +
#diff("last_week_users", dir="h-t", pow = 0, form = "sum") +
diff("accounts", dir="h-t", pow = 0, form = "sum") + # Do people move to smaller servers?
nodeocov("accounts", form = "sum") + # Do servers with more accounts have more outflow?
nodeifactor("registrations", form = "sum") + # Do servers with open registration get more inflow?
nodematch("language", form = "sum"),
response = "count",
reference = ~ Binomial(5),
control = control.ergm(MCMLE.maxit = 100, MCMC.effectiveSize = 50)
)
return(model)
}
move_counts.early <- inner_join(moved_accounts, accounts, by=c("server"="server", "account"="username")) %>%
filter(server %in% is_mastodon$server) %>%
filter(moved_server %in% is_mastodon$server) %>%
filter(created_at >= as.Date("2022-04-01")) %>%
filter(created_at < as.Date("2022-05-01")) %>%
filter(server != moved_server) %>%
group_by(server, moved_server) %>% summarize(count = n()) %>% arrange(desc(count)) %>%
ungroup()
move_counts.late <- inner_join(moved_accounts, accounts, by=c("moved_server"="server", "moved_acct"="username")) %>%
filter(server %in% is_mastodon$server) %>%
filter(moved_server %in% is_mastodon$server) %>%
filter(created_at >= as.Date("2023-10-20")) %>%
filter(server != moved_server) %>%
group_by(server, moved_server) %>% summarize(count = n()) %>% arrange(desc(count)) %>%
ungroup()
move_counts.late2 <- inner_join(moved_accounts, accounts, by=c("server"="server", "account"="username")) %>%
filter(server %in% is_mastodon$server) %>%
filter(moved_server %in% is_mastodon$server) %>%
filter(created_at >= as.Date("2023-10-20")) %>%
filter(server != moved_server) %>%
group_by(server, moved_server) %>% summarize(count = n()) %>% arrange(desc(count)) %>%
ungroup()
edgeNet.early <- build_network(move_counts.early, metadata, activity)
edgeNet.late <- build_network(move_counts.late2, metadata, activity)
```
```{r}
library(GGally)
edgeNet.early %e% "lcount" <- as.integer(log(edgeNet.early %e% "count")) + 1
edgeNet.early %e% "pcount" <- edgeNet.early %e% "lcount" / max(edgeNet.early %e% "lcount")
ggnet2(
edgeNet.early,
edge.size="lcount",
color = "black",
node.alpha = 0.75,
edge.alpha = 0.25,
edge.label.alpha = "pcount",#0.5,
max_size = 10,
size = "indegree",
#size = "degree",
#mode = "target",
#size.min = 1.1,
arrow.gap = 0.01, arrow.size = 5, arrow.type = "open"
) + coord_equal() + guides(color = FALSE, size = FALSE)
```
```{r}
library(GGally)
edgeNet.late %e% "lcount" <- as.integer(log(edgeNet.late %e% "count")) + 1
edgeNet.late %e% "pcount" <- edgeNet.late %e% "lcount" / max(edgeNet.late %e% "lcount")
ggnet2(
edgeNet.late,
edge.size="lcount",
color = "black",
node.alpha = 0.75,
edge.alpha = 0.25,
edge.label.alpha = "pcount",#0.5,
max_size = 10,
size = "indegree",
#size = "degree",
#mode = "target",
#size.min = 1.1,
arrow.gap = 0.01, arrow.size = 5, arrow.type = "open"
) + coord_equal() + guides(color = FALSE, size = FALSE)
```
```{r}
#| label: tbl-ergm
#| tbl-cap: ERGM model output
# #| cache: true
model.early <- run_network(edgeNet.early)
model.late <- run_network(edgeNet.late)
save(model.early, file = here("data/scratch/ergm-model-early.rda"))
save(model.late, file = here("data/scratch/ergm-model-late.rda"))
#load(file = here("data/scratch/ergm-model-early.rda"))
#load(file = here("data/scratch/ergm-model-late.rda"))
library(kableExtra)
modelsummary(
list("Coef." = model.early, "Std.Error" = model.early, "Coef." = model.late, "Std.Error" = model.late),
estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"),
statistic = NULL,
gof_omit = ".*",
coef_rename = c(
"sum" = "(Sum)",
"diff.sum0.h-t.accounts" = "Smaller server",
"nodeocov.sum.accounts" = "Server size (outgoing)",
"nodeifactor.sum.registrations.TRUE" = "Open registrations (incoming)",
"nodematch.sum.language" = "Languages match"
),
align="lrrrr",
stars = c('*' = .05, '**' = 0.01, '***' = .001),
) %>%
add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2))
```

409
notebooks/_push_pull.qmd Normal file
View File

@@ -0,0 +1,409 @@
```{r}
#| echo: false
#| output: false
#| warning: false
#| label: push-pull-prep
library(arrow)
library(tidyverse)
library(tsibble)
library(fable)
library(lmtest)
library(jsonlite)
source("code/helpers.R")
accounts <- load_accounts()
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
```
```{r}
#| label: prep-break-one-raw-counts
server_list <- c(
"mastodon.social", "mastodon.online"
)
early.jm_servers <- as_tibble(fromJSON("data/joinmastodon-2020-09-18.json"))$domain
early.day_counts <- accounts %>%
filter(created_at < "2021-09-01") %>%
mutate(created_day = as.Date(floor_date(created_at, unit = "day"))) %>%
mutate(server_code = ifelse(server %in% early.jm_servers, "joinmastodon", "other")) %>%
mutate(server_code = ifelse(server == "mastodon.social", "mastodon.social", server_code)) %>%
mutate(server = ifelse(server == "mastodon.online", "mastodon.online", server_code)) %>%
group_by(created_day, server) %>%
summarize(count = n(), .groups = "drop") %>%
as_tsibble(., key=server, index=created_day) %>%
fill_gaps(count=0) %>%
mutate(first_open = ((created_day >= "2020-09-18") & (created_day < "2020-11-01"))) %>%
#mutate(second_open = ((created_day > "2020-11-02") & (created_day < "2020-11-05"))) %>%
mutate(third_open = (created_day >= "2021-04-17")) %>%
mutate(open = (first_open | third_open))
early.data_plot <- early.day_counts %>%
mutate(created_week = as.Date(floor_date(created_day, unit = "week"))) %>%
ggplot(aes(x = created_day, y=count)) +
geom_rect(data = (early.day_counts %>% filter(open)),
aes(xmin = created_day - 0.5, xmax = created_day + 0.5, ymin = 0, ymax = Inf),
fill = "lightblue", alpha = 0.3) + # Adjust color and transparency as needed
geom_bar(stat="identity") +
facet_wrap(~ server, ncol=1, strip.position = "left") + #, scales="free_y") +
scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
scale_y_log10() +
labs(
title = "Open registration periods on mastodon.social (August 2020 - August 2021)",
x = "Account Created Date",
y = "Count"
) +
theme_bw_small_labels()
```
```{r}
#| label: table-early-open-coefs
if (knitr::is_latex_output()) {
format <- "latex"
} else {
format <- "html"
}
model_data <- early.day_counts %>%
mutate(count = log1p(count)) %>%
ungroup %>%
arrange(created_day) %>%
mutate(day = row_number())
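# Interrupted time series: regression with ARIMA(2,0,0) errors and weekly Fourier seasonality; `open` shifts the level and `open:day` shifts the trend during open-registration periods.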
fit <- model_data %>%
model(arima = ARIMA(count ~ open + day + open:day + fourier(period=7, K=2) + pdq(2,0,0) + PDQ(0,0,0,period=7)))
early.table <- fit %>% tidy %>%
mutate(p.value = scales::pvalue(p.value)) %>%
pivot_wider(names_from=server, values_from = c(estimate, std.error, statistic, p.value)) %>%
select(-c(.model)) %>%
select(term,
estimate_mastodon.online, p.value_mastodon.online,
estimate_mastodon.social, p.value_mastodon.social,
estimate_joinmastodon, p.value_joinmastodon,
estimate_other, p.value_other
) %>%
#select(term, starts_with("estimate"), starts_with("p.value")) #%>%
knitr::kable(
.,
format = format,
col.names = c("Term", "mastodon.online", "", "mastodon.social", "", "joinmastodon", "", "other", ""),
digits = 4,
align = c("l", "r", "r", "r", "r", "r", "r", "r", "r"),
booktabs = T
)
```
```{r}
#| label: prep-break-two-raw-counts
email.jm_servers <- as_tibble(fromJSON("data/joinmastodon-2023-08-25.json"))$domain
email.day_counts <- accounts %>%
filter(created_at > "2022-07-01") %>%
filter(created_at < "2022-10-26") %>%
mutate(created_day = as.Date(floor_date(created_at, unit = "day"))) %>%
mutate(server_code = ifelse(server %in% email.jm_servers, "joinmastodon", "other")) %>%
mutate(server = ifelse(server == "mastodon.social", "mastodon.social", server_code)) %>%
#mutate(server = server_code) %>%
#filter(server != "other") %>%
group_by(created_day, server) %>%
summarize(count = n(), .groups = "drop") %>%
as_tsibble(., key = server, index = created_day) %>%
fill_gaps(count = 0) %>%
mutate(open = ((created_day < "2022-08-13") |
(created_day > "2022-10-03")))
email.data_plot <- email.day_counts %>%
#filter(server != "other") %>%
mutate(created_week = as.Date(floor_date(created_day, unit = "week"))) %>%
ggplot(aes(x = created_day, y = count)) +
geom_rect(
data = (email.day_counts %>% filter(open)),
aes(
xmin = created_day - 0.5,
xmax = created_day + 0.5,
ymin = 0,
ymax = Inf
),
fill = "lightblue",
alpha = 0.3
) + # Adjust color and transparency as needed
geom_bar(stat = "identity") +
facet_wrap( ~ server, ncol = 1, strip.position = "left") + #, scales="free_y") +
scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
labs(
title = "Closure of mastodon.social (2022)",
x = "Account Created Date",
y = "Count"
) +
theme_bw_small_labels()
```
```{r}
#| label: email-open-coefs
if (knitr::is_latex_output()) {
format <- "latex"
} else {
format <- "html"
}
model_data <- email.day_counts %>%
mutate(count = log1p(count)) %>%
ungroup %>%
arrange(created_day) %>%
mutate(day = row_number())
fit <- model_data %>%
model(arima = ARIMA(count ~ open + day + open:day + fourier(period=7, K=2) + pdq(2,0,0) + PDQ(0,0,0,period=7)))
email.table <- fit %>% tidy %>%
mutate(p.value = scales::pvalue(p.value)) %>%
pivot_wider(names_from=server, values_from = c(estimate, std.error, statistic, p.value)) %>%
select(-c(.model)) %>%
select(term,
estimate_mastodon.social, p.value_mastodon.social,
estimate_joinmastodon, p.value_joinmastodon,
estimate_other, p.value_other
) %>%
knitr::kable(
.,
format = format,
col.names = c("Term", "mastodon.social", "", "joinmastodon", "", "other", ""),
digits = 4,
align = c("l", "r", "r", "r", "r", "r", "r"),
booktabs = T
)
```
```{r}
#| label: prep-break-three-raw-counts
late.jm_servers <- as_tibble(fromJSON("data/joinmastodon-2023-08-25.json"))$domain
last.day_counts <- accounts %>%
filter(created_at > "2022-12-01") %>%
filter(created_at < "2023-06-01") %>%
mutate(created_day = as.Date(floor_date(created_at, unit = "day"))) %>%
mutate(server_code = ifelse(server %in% late.jm_servers, "joinmastodon", "other")) %>%
mutate(server_code = ifelse(server == "mastodon.social", "mastodon.social", server_code)) %>%
mutate(server = server_code) %>%
#filter(server != "other") %>%
group_by(created_day, server) %>%
summarize(count = n(), .groups = "drop") %>%
as_tsibble(., key=server, index=created_day) %>%
fill_gaps(count=0) %>%
mutate(open = (created_day > "2023-02-08") | ((created_day > "2022-12-10") & (created_day < "2022-12-17")))
last.data_plot <- last.day_counts %>%
#filter(server != "other") %>%
mutate(created_week = as.Date(floor_date(created_day, unit = "week"))) %>%
ggplot(aes(x = created_day, y=count)) +
geom_rect(data = (last.day_counts %>% filter(open)),
aes(xmin = created_day - 0.5, xmax = created_day + 0.5, ymin = 0, ymax = Inf),
fill = "lightblue", alpha = 0.3) + # Adjust color and transparency as needed
geom_bar(stat="identity") +
facet_wrap(~ server, ncol=1, strip.position = "left") + #, scales="free_y") +
scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
#scale_y_log10() +
labs(
x = "Account Created Date",
y = "Count"
) +
theme_bw_small_labels()
#library(patchwork)
#early.data_plot + email.data_plot + last.data_plot + plot_layout(ncol = 1)
```
```{r}
#| label: late-open-coefs
if (knitr::is_latex_output()) {
format <- "latex"
} else {
format <- "html"
}
model_data <- last.day_counts %>%
mutate(count = log1p(count)) %>%
ungroup %>%
arrange(created_day) %>%
mutate(day = row_number())
fit <- model_data %>%
model(arima = ARIMA(count ~ open + day + open:day + fourier(period=7, K=2) + pdq(2,0,0) + PDQ(0,0,0,period=7)))
last.table <- fit %>% tidy %>%
mutate(p.value = scales::pvalue(p.value)) %>%
pivot_wider(names_from=server, values_from = c(estimate, std.error, statistic, p.value)) %>%
select(-c(.model)) %>%
select(term,
estimate_mastodon.social, p.value_mastodon.social,
estimate_joinmastodon, p.value_joinmastodon,
estimate_other, p.value_other
) %>%
knitr::kable(
.,
format = format,
col.names = c("Term", "mastodon.social", "", "joinmastodon", "", "other", ""),
digits = 4,
align = c("l", "r", "r", "r", "r", "r", "r"),
booktabs = T
)
```
```{r}
#| eval: false
library(sandwich)
model.poisson <- early.day_counts %>%
filter(server == "mastodon.online") %>%
filter(created_day > "2020-08-01") %>%
filter(created_day < "2021-09-01") %>%
ungroup %>%
arrange(created_day) %>%
mutate(day = row_number()) %>%
glm(count ~ day*open, data=., family=poisson)
v <- sqrt(diag(vcovHC(model.poisson, type = "HC0")))
coeftest(model.poisson, vcovHC(model.poisson, type="HC0"))
```
<!-- begin section that actually exports -->
::: {.panel-tabset}
#### Early
```{r}
#| label: fig-break-one-raw-counts
#| fig-height: 4
#| fig-width: 6.75
#| fig-env: figure*
#| fig-pos: p
early.data_plot
```
#### Email
```{r}
#| label: fig-break-two-raw-counts
#| fig-height: 3.5
#| fig-width: 6.75
#| fig-env: figure*
#| fig-pos: p
email.data_plot
```
#### Last
```{r}
#| label: fig-break-three-raw-counts
#| fig-height: 3.5
#| fig-width: 6.75
#| fig-env: figure*
#| fig-pos: p
last.data_plot
```
:::
::: {.panel-tabset}
#### Early
::: {#tbl-early .column-page}
```{r}
early.table
```
Caption
:::
#### Email
::: {#tbl-email .column-page}
```{r}
email.table
```
:::
#### Last
::: {#tbl-last .column-page}
```{r}
last.table
```
:::
:::
```{r}
#| label: fig-mastodon-online-forecast
#| fig-cap: "Historical signup counts for mastodon.online and two alternative forecasts based on whether or not mastoodn.social is accepting signups."
#| fig-height: 2.7
#| fig-width: 6.75
#| exec: false
#| fig-env: figure*
model_data <- early.day_counts %>%
mutate(count = log1p(count)) %>%
ungroup %>%
arrange(created_day) %>%
mutate(day = row_number())
fit <- model_data %>%
model(arima = ARIMA(count ~ open + day + open:day + fourier(period=7, K=2) + pdq(2,0,0) + PDQ(0,0,0,period=7)))
f_server <- "mastodon.online"
new_data <- tsibble(
created_day = max(model_data$created_day) + 1:100,
day = max(model_data$day) + 1:100,
server = f_server #""
)
model.obj <- fit %>%
filter(server == f_server) %>%
select(arima) %>% pull %>% first
forecast.open <- model.obj %>%
forecast(new_data=(new_data %>% add_column(open = TRUE))) %>%
hilo %>% unpack_hilo(`95%`)
forecast.closed <- model.obj %>%
forecast(new_data=(new_data %>% add_column(open = FALSE))) %>%
hilo %>% unpack_hilo(`95%`)
hist_data <- as_tibble(model_data) %>% filter(server == f_server) %>% select(created_day, server, count, open) %>% rename(count_mean=count)
bind_rows(
as_tibble(forecast.open),
as_tibble(forecast.closed)
) %>%
rename(count_mean=.mean) %>%
ggplot(aes(x=created_day, y=count_mean)) +
geom_line(aes(color=open, group=open)) + #, linetype="dashed") +
geom_ribbon(aes(ymin=`95%_lower`, ymax=`95%_upper`, group=open, fill=open), alpha=0.25) +
geom_line(aes(x=created_day, y=count_mean), data=hist_data) + # , color=open, group=open
geom_rect(data = (hist_data %>% filter(open)),
aes(xmin = created_day - 0.5, xmax = created_day + 0.5, ymin = 0, ymax = Inf),
fill = "lightblue", alpha = 0.3) + # Adjust color and transparency as needed
labs(
x = "Date",
y = "Accounts created (log1p)",
color = "Signups open on mastodon.social",
fill = "Signups open on mastodon.social"
) +
scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
theme_bw_small_labels() +
theme(legend.position="top", axis.title.x=element_blank())
```

View File

@ -0,0 +1,50 @@
```{r}
#| eval: false
arrow::read_feather(
"data/scratch/all_accounts.feather",
col_select = c(
"server", "username", "created_at",
"last_status_at", "statuses_count",
"has_moved", "bot", "suspended"
)) %>%
mutate(suspended = replace_na(suspended, FALSE)) %>%
filter(!bot) %>%
# TODO: what's going on here?
filter(!is.na(last_status_at)) %>%
# sanity check
filter(created_at >= "2022-01-01") %>%
filter(created_at < "2024-03-01") %>%
# We don't want accounts that were created
# and then immediately stopped being active
filter(statuses_count > 1) %>%
filter(!suspended) %>%
filter(!has_moved) %>%
filter(server == "mastodon.social") %>%
#filter(last_status_at >= created_at) %>%
mutate(created_month = format(created_at, "%Y-%m")) %>%
group_by(created_month) %>%
summarize(count=n()) %>%
distinct(created_month, count) %>%
ggplot(aes(x=created_month, y=count)) +
geom_bar(stat="identity", fill="black") +
labs(y="Count", x="Created Month") +
theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
```
::: {.content-visible when-format="html"}
## Migrations in Online Communities
The Twitter-Mastodon migration is only one entry in a series of migrations between online communities.
@burkeFeedMeMotivating2009 found that social learning could help explain the experiences of newcomers in the early days of Facebook.
+ On Reddit, @newellUserMigrationOnline2021 found that the news aggregator had an advantage over potential competitors because of its catalogue of niche communities: people who migrated to alternative platforms tended to post proportionally more often in popular communities.
+ Fiesler on online fandom communities [@fieslerMovingLandsOnline2020]
+ TeBlunthuis on competition and mutualism [@teblunthuisIdentifyingCompetitionMutualism2022]
+ Work on "alt-tech" communities.
:::

View File

@ -0,0 +1,124 @@
# Tests
```{r}
#| label: fig-moved-accounts
#| fig-height: 4
#| eval: false
moved_accounts <- arrow::read_feather("data/scratch/moved_accounts.feather")
popular_servers <- arrow::read_feather("data/scratch/popular_servers.feather")
server_movement_data <- left_join(
(moved_accounts %>% group_by(server) %>% summarize(out_count = sum(count)) %>% select(server, out_count)),
(moved_accounts %>% group_by(moved_server) %>% summarize(in_count = sum(count)) %>% select(moved_server, in_count) %>% rename(server=moved_server)),
by="server"
) %>% replace_na(list(out_count = 0, in_count = 0)) %>%
mutate(diff = in_count - out_count) %>%
arrange(diff) %>%
left_join(., popular_servers, by="server") %>%
rename(user_count = count) %>% arrange(desc(user_count))
server_movement_data %>%
ggplot(aes(x=user_count, y=diff)) +
geom_point() + scale_x_log10() + theme_bw_small_labels()
```
If there were no relationship, we would expect these moves to be random with respect to server size.
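As a point of comparison, here is a minimal sketch of that null model, assuming the same `moved_accounts` and `popular_servers` feather files read in the next chunk: destination servers are drawn with probability proportional to server size, and the resulting in-counts can be set against the observed ones.
```{r}
#| eval: false
# Sketch of the null model: destinations drawn proportionally to server size.
# Assumes the moved_accounts and popular_servers feather files loaded below.
library(tidyverse)
moved_accounts <- arrow::read_feather("data/scratch/moved_accounts.feather")
popular_servers <- arrow::read_feather("data/scratch/popular_servers.feather")
set.seed(1)
null_in_counts <- moved_accounts %>%
  uncount(count) %>%  # one row per moved account
  mutate(moved_server = sample(
    popular_servers$server,
    size = n(),
    replace = TRUE,
    prob = popular_servers$count  # probability proportional to server size
  )) %>%
  count(moved_server, name = "in_count")
observed_in_counts <- moved_accounts %>%
  count(moved_server, wt = count, name = "in_count")
```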
```{r}
popular_servers <-
arrow::read_feather("data/scratch/popular_servers.feather")
moved_accounts <-
arrow::read_feather("data/scratch/moved_accounts.feather") %>%
# Remove loops
filter(server != moved_server)
activity <-
arrow::read_feather("data/scratch/activity.feather",
col_select = c("server", "logins")) %>%
arrange(desc(logins))
popular_and_large_servers <-
popular_servers %>% filter(count >= 1) %>%
mutate(count = log10(count))
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
ma <- moved_accounts %>%
filter(server %in% popular_and_large_servers$server) %>%
filter(moved_server %in% popular_and_large_servers$server)
# Construct network
edgeNet <- network(ma, matrix.type = "edgelist")
edgeNet %v% "user_count" <-
left_join((as_tibble(edgeNet %v% 'vertex.names') %>% rename(server = value)),
popular_and_large_servers,
by = "server") %>%
select(count) %>%
unlist()
edgeNet %v% "in_jm" <-
as_tibble(edgeNet %v% 'vertex.names') %>%
mutate(in_jm = value %in% jm$domain) %>%
select(in_jm) %>% unlist()
```
We construct an exponential family random graph model (ERGM) where nodes represent servers and weighted directed edges represent the number of accounts that moved between servers.
$$
\begin{aligned}
\text{Sum}_{i,j} = \ & \beta_0 + \beta_1 (\log_{10}(\text{user count}_j) - \log_{10}(\text{user count}_i)) + \\
& \beta_2 (\log_{10}(\text{user count}_i) + \log_{10}(\text{user count}_j)) + \\
& \beta_3 \, \mathbf{1}[\text{both servers listed on JoinMastodon}] + \\
& \beta_4 \, \mathbf{1}[\text{neither server listed on JoinMastodon}]
\end{aligned}
$$
```{r}
#| label: ergm-model
#| cache: true
m1 <-
ergm(
edgeNet ~ sum +
diff("user_count", pow = 1, form = "sum") +
nodecov("user_count", form = "sum") +
nodematch("in_jm", diff = TRUE, form = "sum"),
response = "count",
reference = ~ Binomial(3),
control=control.ergm(parallel=4, parallel.type="PSOCK")
)
save(m1, file = "data/scratch/ergm-model.rda")
```
```{r}
#| label: tag-ergm-result
#| output: asis
ergm_model <- load("data/scratch/ergm-model.rda")
modelsummary(
m1,
escape = FALSE,
coef_rename = c(
"sum" = "$\\beta_0$ Intercept",
"diff.sum.t-h.user_count" = "$\\beta_1$ User Count Difference",
"nodecov.sum.user_count" = "$\\beta_2$ User Count (Node Covariate)",
"nodematch.sum.in_jm.TRUE" = "$\\beta_3$ In JoinMastodon (Both True)",
"nodematch.sum.in_jm.FALSE" = "$\\beta_4$ In JoinMastodon (Both False)"
),
)
```
We find a strong preference for accounts to move from large servers to smaller servers.
```{python}
#| eval: false
#| include: false
import random
import polars as pl

def simulate_account_moves(origin: str, servers: dict, n: int):
    # Draw n destinations for accounts leaving `origin`, weighted by server size
    server_list = list(set(servers.keys()) - {origin})
    weights = [servers[x] for x in server_list]
    return pl.DataFrame({
        "simulation": list(range(n)),
        "server": [origin] * n,
        "moved_server": random.choices(server_list, weights=weights, k=n)
    })

# `maccounts` and `popular_servers` are assumed to be polars DataFrames here
server_sizes = {x["server"]: x["count"] for x in popular_servers.iter_rows(named=True)}
simulations = pl.concat([simulate_account_moves(row["server"], server_sizes, 1000)
                         for row in maccounts.iter_rows(named=True)])
m_counts = (maccounts.join(popular_servers, how="inner", on="server")
            .rename({"count": "origin_count"})
            .join(popular_servers.rename({"server": "moved_server"}), how="inner", on="moved_server")
            .rename({"count": "target_count"}))
sim_counts = (simulations.join(popular_servers, how="inner", on="server")
              .rename({"count": "origin_count"})
              .join(popular_servers.rename({"server": "moved_server"}), how="inner", on="moved_server")
              .rename({"count": "target_count"}))
```

View File

@ -0,0 +1,218 @@
```{r}
#| label: fig-active-accounts
#| eval: false
#library(betareg)
library(lme4)
activity <- arrow::read_feather(
"data/scratch/activity.feather",
col_select = c("server", "logins")
) %>%
arrange(desc(logins)) %>%
mutate(server_count = logins)
account_data <- inner_join(accounts, activity, by="server") %>%
mutate(active = active_time >= 45)
a_data <- account_data %>%
#mutate(active = active_time >= 45) %>%
group_by(server) %>%
summarize(active_prop = sum(active)/n(), active_count = sum(active), count=n()) %>%
inner_join(., activity, by="server")
a_model <- glmer(active ~ log1p(logins) + (1|server), data=account_data, family=binomial)
#betareg(active_prop ~ log10(count), data = a_data)
logins_seq <- seq(min(log1p(account_data$logins)), max(log1p(account_data$logins)), length.out = 100)
a_pred <- predict(
a_model,
newdata = data.frame(logins = logins_seq, server = factor(1)),
type = "response",
re.form = NA)
pred_data <- data.frame(logins = logins_seq, active_prop = a_pred)
a_data %>%
mutate(logins = log1p(logins)) %>%
ggplot(aes(y=active_prop, x=logins)) +
geom_point(alpha=0.1) +
# help here
#geom_line(aes(y = a_pred)) +
geom_line(data = pred_data, aes(x = logins, y = active_prop), color = "red") + # Use pred_data for line
labs(
y = "Active after 45 Days",
x = "Accounts"
) +
scale_x_continuous(labels = scales::comma) +
#scale_y_log10() +
theme_bw_small_labels()
```
```{r}
#| eval: false
library(fable)
#library(fable.binary)
library(tsibble)
library(lubridate)
ad_time <- account_data |>
mutate(created_at = yearweek(created_at)) |>
group_by(server, created_at) |>
summarize(count = n(), active = sum(active)) |>
as_tsibble(key="server", index=created_at)
```
```{r}
#| eval: false
fit <- ad_time |>
model(
logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
)
```
```{r}
#| eval: false
ad_time |>
filter(server == "mastodon.social") |>
sample_n(100) |>
autoplot(active)
```
```{r}
#| label: fig-account-activity-prop
#| fig-cap: "Account Activity Over Time"
#| fig-height: 4
#| eval: false
library(ggsurvfit)
library(survival)
study_period <- 45
last_day <- "2024-01-15"
#formerly accounts_processed_recent
#server_counts <- arrow::read_feather(
# "data/scratch/accounts.feather",
# col_select=c("server", "username", "created_at", "bot")
# ) %>%
# filter(created_at <= "2023-03-01") %>%
# filter(!bot) %>%
# group_by(server) %>%
# summarize(server_count = n()) %>%
# arrange(desc(server_count)) %>%
# mutate(server_count_bin = floor(log10(server_count)))
metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
arrange(desc(user_count)) %>%
mutate(server_count = user_count) %>%
mutate(server_count_bin = floor(log10(server_count))) %>%
mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))
activity <- arrow::read_feather(
"data/scratch/activity.feather",
col_select = c("server", "logins")
) %>%
arrange(desc(logins)) %>%
mutate(server_count = logins) %>%
mutate(server_count_bin = floor(log10(server_count))) %>%
# Merge 4 and 5
mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
# Merge below 2
#mutate(server_count_bin = ifelse((server_count_bin <= 2) & (server_count_bin >= 1), 2, server_count_bin)) %>%
mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
a <- accounts %>%
filter(!has_moved) %>%
filter(locked == FALSE) %>%
anti_join(., moved_to, by=c("username"="moved_acct", "server"="moved_server")) %>%
filter(created_at >= "2022-08-14") %>%
filter(created_at < "2022-10-03") %>%
#filter(created_at >= "2023-10-15") %>%
#filter(created_at < "2023-11-15") %>%
inner_join(activity, by="server") %>%
filter(created_at < last_status_at) %>%
#mutate(large_server = server_count > 1000) %>%
mutate(active_time = as.integer(active_time)) %>%
mutate(active_time_weeks = active_time) %>%
mutate(status = ifelse(active, 0, 1)) %>%
mutate(jm = server %in% jm$domain) %>%
mutate(follows_someone = following_count > 0) %>%
mutate(has_a_follower = followers_count > 0)
#filter(server_count > 0)
survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>% # strata(server_count_bin)
ggsurvfit() +
add_confidence_interval() +
scale_y_continuous(limits = c(0, 1)) +
labs(
y = "Overall survival probability",
x = "Time (days)",
) +
#scale_x_continuous(
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
#) +
theme_bw_small_labels() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
```{r}
#| eval: false
library(coxme)
sel_a <- a %>%
#filter(server != "mastodon.social") %>%
mutate(is_ms = server == "mastodon.social") %>%
filter(!is_ms) %>%
filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
#mutate(is_general = category=="general") %>%
rowwise() %>%
mutate(is_regional = "regional" %in% categories) %>%
mutate(is_general = ("general" %in% categories)) %>%
mutate(is_neither = !(is_regional | is_general)) %>%
mutate(is_en = language == "en") %>%
mutate(n_categories = length(categories) - is_regional - is_general) %>%
mutate(many_categories = n_categories > 0) %>%
mutate(is_large = last_week_users >= 585) %>%
#mutate(follows_someone = followers_count > 0) %>% filter(server_count > 1) %>%
#filter(followers_count < 250) %>%
ungroup
sel_a <-
inner_join(sel_a, (sel_a %>% group_by(server) %>% summarize(cohort_size=n())), by="server")
#cx <- coxme(Surv(active_time_weeks, status) ~ is_large + is_general + approval_required + (1|server), data = sel_a, x=TRUE)
#cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general*is_regional + is_general:log1p(server_count), data = sel_a, x=TRUE)
#coxme(Surv(active_time_weeks, status) ~ is_regional + many_categories + is_general*log10(server_count) + (1|server), data = sel_a, x=TRUE)
# coxme(Surv(active_time_weeks, status) ~ many_categories + is_general + is_regional + is_general*log10(cohort_size) + (1|server), data = sel_a, x=TRUE)
cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general + is_regional + is_general:log1p(cohort_size), data = sel_a, x=TRUE) # log10(server_count)
cz <- cox.zph(cx)
#plot(cz)
cx
cz
```
```{r}
#| eval: false
library(randomForestSRC)
options(rf.cores = 2, mc.cores = 2)
for_data <- sel_a %>%
filter(!is_general) %>%
mutate(rn=row_number())
set.seed(123)
data_test <- for_data %>% slice_sample(n = floor(0.4 * nrow(for_data)))
data_train <- for_data %>% slice(-pull(data_test,rn))
obj <- rfsrc.fast(Surv(active_time_weeks, status) ~ server_count, data = data_train, ntree=100, forest=TRUE)
pred <- predict(obj, data_test)
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
obj <- rfsrc(Surv(active_time_weeks, status) ~ is_neither + is_general*server_count, data = for_data, ntree=50, forest=TRUE, importance=TRUE)
#predictions <- predict(obj, newdata = newData)$predicted
#plot(get.tree(obj, 1))
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
plot.subsample(reg.smp.o)
```

View File

@ -0,0 +1,27 @@
```{r}
library(tidyverse)
library(arrow)
options(arrow.skip_nul = TRUE)
source("code/helpers.R")
```
We use a term frequency-inverse document frequency (TF-IDF) model to associate the top tags with each server. For the term frequency, we divide the number of accounts that used the tag during the six-month period by the total number of known account-tag pairs on that server; for the inverse document frequency, we divide the total number of servers by the number of servers reporting the tag. In this implementation, we also apply filters such that a tag must be used by at least five people on a server to be reported, and the tag must be used by at least ten people across at least three servers in the entire known system.
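As a rough sketch of this weighting (assuming a hypothetical `server_tags` data frame with one row per server-tag pair and an `accounts` column giving the number of accounts on that server using the tag; the top-10 cut at the end is illustrative):
```{r}
#| eval: false
library(tidyverse)
tag_tfidf <- server_tags %>%
  # a tag must be used by at least five accounts on the reporting server
  filter(accounts >= 5) %>%
  # and by at least ten accounts across at least three servers overall
  group_by(tag) %>%
  filter(sum(accounts) >= 10, n_distinct(server) >= 3) %>%
  ungroup() %>%
  # term frequency: the tag's share of the server's account-tag pairs
  group_by(server) %>%
  mutate(tf = accounts / sum(accounts)) %>%
  ungroup() %>%
  # inverse document frequency: (total servers) / (servers reporting the tag)
  add_count(tag, name = "servers_with_tag") %>%
  mutate(idf = n_distinct(server) / servers_with_tag,
         tf_idf = tf * idf) %>%
  # keep the top tags per server
  group_by(server) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup()
```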
```{r, fig.width=6.75}
#| label: fig-simulations-rbo
simulations <- arrow::read_ipc_file("data/scratch/simulation_rbo.feather")
simulations %>%
group_by(servers, tags, run) %>% summarize(rbo=mean(rbo), .groups="drop") %>%
mutate(ltags = as.integer(log2(tags))) %>%
ggplot(aes(x = factor(ltags), y = rbo, fill = factor(ltags))) +
geom_boxplot() +
facet_wrap(~servers, nrow=1) +
scale_y_continuous(limits = c(0, 1)) +
labs(x = "Tags (log2)", y = "RBO", title = "Rank Biased Overlap with Baseline Rankings by Number of Servers") +
theme_minimal() + theme(legend.position = "none")
```
We simulated various scenarios that limit both the number of servers that report data and the number of tags they report. We then used rank-biased overlap (RBO) to compare the outputs from these simulations to a baseline with more complete information from all tags on all servers. @fig-simulations-rbo shows how the average agreement with the baseline scales linearly with the logarithm of the tag count.
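For reference, a minimal sketch of a truncated rank-biased overlap computation between two ranked tag lists in the spirit of @webberSimilarityMeasureIndefinite2010; the function and the persistence parameter `p = 0.9` are illustrative, not the exact implementation used for the simulations.
```{r}
#| eval: false
# Truncated RBO between two ranked vectors: (1 - p) * sum_d p^(d-1) * A_d,
# where A_d is the overlap of the top-d prefixes divided by d.
rbo <- function(s, t, p = 0.9, k = min(length(s), length(t))) {
  agreement <- sapply(seq_len(k), function(d) {
    length(intersect(s[seq_len(d)], t[seq_len(d)])) / d
  })
  (1 - p) * sum(p^(seq_len(k) - 1) * agreement)
}
rbo(c("linux", "foss", "art", "music"), c("linux", "art", "foss", "photography"))
```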

79
notebooks/survival.qmd Normal file
View File

@ -0,0 +1,79 @@
---
title: "Model for Survival"
engine: knitr
---
```{r}
#| label: setup
library(here)
library(survival)
source(here("code/helpers.R"))
options(arrow.skip_nul = TRUE)
a <- load_accounts() %>%
filter(!has_moved) %>%
filter(locked == FALSE) %>%
anti_join(., arrow::read_feather(here("data/scratch/individual_moved_accounts.feather")), by=c("username"="moved_acct", "server"="moved_server")) %>%
inner_join(arrow::read_feather(here("data/scratch/metadata.feather")), by="server") %>%
filter(created_at >= "2023-11-01") %>%
filter(created_at <= "2023-11-30") %>%
#inner_join(activity, by="server") %>%
filter(created_at < last_status_at) %>%
mutate(active_time = as.integer(active_time)) %>%
mutate(active_time_weeks = active_time) %>%
mutate(status = ifelse(active, 0, 1)) %>%
mutate(jm = server %in% arrow::read_feather(here("data/scratch/joinmastodon.feather"))$domain) %>%
mutate(follows_someone = following_count > 0) %>%
mutate(has_a_follower = followers_count > 0)
```
```{r}
server_summary <- a %>%
group_by(server) %>%
summarize(cohort_size = n(), .groups = 'drop')
sel_a <- a %>%
mutate(is_ms = server == "mastodon.social") %>%
ungroup() %>%
inner_join(server_summary, by = "server")
cx <- sel_a %>%
mutate(c_size = as.factor(as.integer(log10(cohort_size)))) %>%
mutate(follows_someone = following_count >= 14) %>%
filter(followers_count > 0) %>%
filter(following_count > 0) %>%
coxph(Surv(active_time_weeks, status) ~ cluster(server) + jm + is_ms, data = ., x=TRUE, robust = T) # cluster(server)
cz <- cox.zph(cx)
#plot(cz)
cx
cz
```
```{r}
#| label: fig-survival-plot
#| fig-cap: Survival plot for accounts created in November 2023. Accounts created on mastodon.social had a higher hazard than accounts created on other servers.
#| fig-width: 6.75
#| fig-height: 3
library(ggsurvfit)
sel_a %>%
mutate(c_size = as.factor(as.integer(log10(cohort_size)))) %>%
mutate(no_followers = !has_a_follower) %>%
filter(followers_count > 0) %>%
filter(following_count > 0) %>%
filter(statuses_count > 0) %>%
mutate(small = cohort_size < 10) %>%
mutate(large = cohort_size > 100) %>%
survfit2(Surv(active_time_weeks, status) ~ jm + strata(is_ms), data = .) %>% # cluster(server)
ggsurvfit() +
add_confidence_interval() +
#scale_y_continuous(limits = c(0, 1)) +
labs(
y = "Overall survival probability",
x = "Time (days)",
) +
theme_bw_small_labels() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

View File

@ -0,0 +1,442 @@
---
title: "Onboarding the Fediverse"
subtitle: "Building community discovery in decentralized online social networks"
author: "Carl Colglazier"
format:
revealjs:
theme: presentation.scss
keep-md: true
knitr:
opts_chunk:
dev: "ragg_png"
retina: 1
dpi: 200
execute:
freeze: auto
cache: true
echo: false
fig-width: 5
fig-height: 6
---
## Growth on the Fediverse
```{r}
#| label: fig-account-timeline
#| fig-height: 3
#| fig-width: 6.75
library(arrow)
library(tidyverse)
library(lubridate)
library(scales)
library(here)
source(here("code/helpers.R"))
jm <- arrow::read_feather(here("data/scratch/joinmastodon.feather"))
moved_to <- arrow::read_feather(here("data/scratch/individual_moved_accounts.feather"))
accounts_unfilt <- arrow::read_feather(
here("data/scratch/all_accounts.feather"),
col_select=c(
"server", "username", "created_at", "last_status_at",
"statuses_count", "has_moved", "bot", "suspended",
"following_count", "followers_count", "locked",
"noindex", "group", "discoverable"
))
accounts <- accounts_unfilt %>%
filter(!bot) %>%
# TODO: what's going on here?
filter(!is.na(last_status_at)) %>%
mutate(suspended = replace_na(suspended, FALSE)) %>%
# sanity check
filter(created_at >= "2020-10-01") %>%
filter(created_at < "2024-01-01") %>%
# We don't want accounts that were created and then immediately stopped being active
filter(statuses_count >= 1) %>%
filter(last_status_at >= created_at) %>%
mutate(active = last_status_at >= "2024-01-01") %>%
mutate(last_status_at = ifelse(active, lubridate::ymd_hms("2024-01-01 00:00:00", tz = "UTC"), last_status_at)) %>%
mutate(active_time = difftime(last_status_at, created_at, units="days")) #%>%
#filter(!has_moved)
acc_data <- accounts %>%
#filter(!has_moved) %>%
mutate(created_month = format(created_at, "%Y-%m")) %>%
mutate(created_week = floor_date(created_at, unit = "week")) %>%
mutate(active_now = active) %>%
mutate(active = active_time >= 45) %>%
mutate("Is mastodon.social" = server == "mastodon.social") %>%
mutate(jm = server %in% jm$domain) %>%
group_by(created_week) %>%
summarize(
`JoinMastodon Server` = sum(jm) / n(),
`Is mastodon.social` = sum(`Is mastodon.social`)/n(),
Suspended = sum(suspended)/n(),
Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
Moved=sum(has_moved)/n(),
count=n()) %>%
pivot_longer(cols=c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"
p1 <- acc_data %>%
ggplot(aes(x=as.Date(created_week), group=1)) +
geom_line(aes(y=value, group=Measure, color=Measure)) +
geom_point(aes(y=value, color=Measure), size=0.7) +
scale_y_continuous(limits = c(0, 1.0)) +
labs(y="Proportion") + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week") +
theme_bw_small_labels() +
theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
p2 <- acc_data %>%
distinct(created_week, count) %>%
ggplot(aes(x=as.Date(created_week), y=count)) +
geom_bar(stat="identity", fill="black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-10-27"))),
linetype="dashed", color = "black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-04-14"))),
linetype="dashed", color = "black") +
# https://twitter.com/elonmusk/status/1675187969420828672
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-12-15"))),
linetype="dashed", color = "black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2023-07-01"))),
linetype="dashed", color = "black") +
#scale_y_continuous(limits = c(0, max(acc_data$count) + 100000)) +
scale_y_continuous(labels = scales::comma) +
labs(y="Count", x="Created Week") +
theme_bw_small_labels() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week")
library(patchwork)
p1 + p2 + plot_layout(ncol = 1)
```
## The Million Account Elephant in the Room
::::: {.columns}
::: {.column width="40%"}
![](images/mastodon-social-signups-2020-11-01.png)
:::
:::: {.column width="60%"}
::: {.smaller}
Mastodon.social (MS), the flagship server from the Mastodon developers, has always been the largest Mastodon server.
The server has been closed to new registrations several times over the years.
:::
::::
:::::
## Closure and Opening of MS (2020-2021) {.tiny}
```{r}
#| fig-width: 9
library(jsonlite)
library(here)
library(tidyverse)
library(tsibble)
library(fable)
server_list <- c(
"mastodon.social", "mastodon.online"
)
early.jm_servers <- as_tibble(fromJSON(here("data/joinmastodon-2020-09-18.json")))$domain
early.day_counts <- accounts %>%
filter(created_at < "2021-09-01") %>%
mutate(created_day = as.Date(floor_date(created_at, unit = "day"))) %>%
mutate(server_code = ifelse(server %in% early.jm_servers, "joinmastodon", "other")) %>%
mutate(server_code = ifelse(server == "mastodon.social", "mastodon.social", server_code)) %>%
mutate(server = ifelse(server == "mastodon.online", "mastodon.online", server_code)) %>%
group_by(created_day, server) %>%
summarize(count = n(), .groups = "drop") %>%
as_tsibble(., key=server, index=created_day) %>%
fill_gaps(count=0) %>%
mutate(first_open = ((created_day >= "2020-09-18") & (created_day < "2020-11-01"))) %>%
#mutate(second_open = ((created_day > "2020-11-02") & (created_day < "2020-11-05"))) %>%
mutate(third_open = (created_day >= "2021-04-17")) %>%
mutate(open = (first_open | third_open))
early.data_plot <- early.day_counts %>%
mutate(created_week = as.Date(floor_date(created_day, unit = "week"))) %>%
ggplot(aes(x = created_day, y=count)) +
geom_rect(data = (early.day_counts %>% filter(open)),
aes(xmin = created_day - 0.5, xmax = created_day + 0.5, ymin = 0, ymax = Inf),
fill = "lightblue", alpha = 0.3) + # Adjust color and transparency as needed
geom_bar(stat="identity") +
facet_wrap(~ server, ncol=1, strip.position = "left") + #, scales="free_y") +
scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
scale_y_log10() +
labs(
title = "Open registration periods on mastodon.social (August 2020 - August 2021)",
x = "Account Created Date",
y = "Count"
) +
theme_bw_small_labels()
model_data <- early.day_counts %>%
mutate(count = log1p(count)) %>%
ungroup %>%
arrange(created_day) %>%
mutate(day = row_number())
fit <- model_data %>%
model(arima = ARIMA(count ~ open + day + open:day + fourier(period=7, K=2) + pdq(2,0,0) + PDQ(0,0,0,period=7)))
early.table <- fit %>% tidy %>%
mutate(p.value = scales::pvalue(p.value)) %>%
pivot_wider(names_from=server, values_from = c(estimate, std.error, statistic, p.value)) %>%
select(-c(.model)) %>%
select(term,
estimate_mastodon.online, p.value_mastodon.online,
estimate_mastodon.social, p.value_mastodon.social,
estimate_joinmastodon, p.value_joinmastodon,
estimate_other, p.value_other
) %>%
#select(term, starts_with("estimate"), starts_with("p.value")) #%>%
knitr::kable(
.,
col.names = c("Term", "mastodon.online", "", "mastodon.social", "", "joinmastodon", "", "other", ""),
digits = 4,
align = c("l", "r", "r", "r", "r", "r", "r", "r", "r")
)
early.data_plot
```
## Closure and Opening of MS (2020-2021) {.tiny}
```{r}
early.table
```
## Closure and Opening of MS (2022) {.tiny}
```{r}
#| fig-width: 9
library(jsonlite)
library(here)
library(tidyverse)
library(tsibble)
library(fable)
email.jm_servers <- as_tibble(fromJSON(here("data/joinmastodon-2023-08-25.json")))$domain
email.day_counts <- accounts %>%
filter(created_at > "2022-07-01") %>%
filter(created_at < "2022-10-26") %>%
mutate(created_day = as.Date(floor_date(created_at, unit = "day"))) %>%
mutate(server_code = ifelse(server %in% email.jm_servers, "joinmastodon", "other")) %>%
mutate(server = ifelse(server == "mastodon.social", "mastodon.social", server_code)) %>%
#mutate(server = server_code) %>%
#filter(server != "other") %>%
group_by(created_day, server) %>%
summarize(count = n(), .groups = "drop") %>%
as_tsibble(., key = server, index = created_day) %>%
fill_gaps(count = 0) %>%
mutate(open = ((created_day < "2022-08-13") |
(created_day > "2022-10-03")))
email.data_plot <- email.day_counts %>%
#filter(server != "other") %>%
mutate(created_week = as.Date(floor_date(created_day, unit = "week"))) %>%
ggplot(aes(x = created_day, y = count)) +
geom_rect(
data = (email.day_counts %>% filter(open)),
aes(
xmin = created_day - 0.5,
xmax = created_day + 0.5,
ymin = 0,
ymax = Inf
),
fill = "lightblue",
alpha = 0.3
) + # Adjust color and transparency as needed
geom_bar(stat = "identity") +
facet_wrap( ~ server, ncol = 1, strip.position = "left") + #, scales="free_y") +
scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
labs(
title = "Closure of mastodon.social (2022)",
x = "Account Created Date",
y = "Count"
) +
theme_bw_small_labels()
email.data_plot
```
## Closure and Opening of MS (2022) {.tiny}
```{r}
model_data <- email.day_counts %>%
mutate(count = log1p(count)) %>%
ungroup %>%
arrange(created_day) %>%
mutate(day = row_number())
fit <- model_data %>%
model(arima = ARIMA(count ~ open + day + open:day + fourier(period=7, K=2) + pdq(2,0,0) + PDQ(0,0,0,period=7)))
email.table <- fit %>% tidy %>%
mutate(p.value = scales::pvalue(p.value)) %>%
pivot_wider(names_from=server, values_from = c(estimate, std.error, statistic, p.value)) %>%
select(-c(.model)) %>%
select(term,
estimate_mastodon.social, p.value_mastodon.social,
estimate_joinmastodon, p.value_joinmastodon,
estimate_other, p.value_other
) %>%
knitr::kable(
.,
col.names = c("Term", "mastodon.social", "", "joinmastodon", "", "other", ""),
digits = 4,
align = c("l", "r", "r", "r", "r", "r", "r")
)
email.table
```
## A Change in Strategy
Mastodon has shifted away from _discouraging_ newcomers from using mastodon.social and now presents the flagship server as the default.
. . .
Today, almost half of new Mastodon accounts join mastodon.social.
<!--- ## Do some servers retain newcomers better than others? --->
## A Change in Strategy
![](images/joinmastodon-screenshot.png)
## Moving Accounts on Mastodon
+ Accounts can move freely between Mastodon servers
+ Moved accounts retain their followers (but not their posts)
## Are people moving to larger or smaller servers? {.tiny}
```{r}
#| label: tbl-ergm
#| tbl-cap: ERGM model output
#| cache: true
load(file = here("data/scratch/ergm-model-early.rda"))
load(file = here("data/scratch/ergm-model-late.rda"))
#library(gt)
library(kableExtra)
library(modelsummary)
modelsummary(
list("Coef." = model.early, "Std.Error" = model.early, "Coef." = model.late, "Std.Error" = model.late),
estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"),
statistic = NULL,
gof_omit = ".*",
coef_rename = c(
"sum" = "(Sum)",
"diff.sum0.h-t.accounts" = "Smaller server",
"nodeocov.sum.accounts" = "Server size (outgoing)",
"nodeifactor.sum.registrations.TRUE" = "Open registrations (incoming)",
"nodematch.sum.language" = "Languages match"
),
align="lrrrr",
stars = c('*' = .05, '**' = 0.01, '***' = .001),
output = "kableExtra") %>%
add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2))
```
## The Local Timeline: Mastodon's Secret Killer Feature
While discovery is challenging in decentralized online social networks, joining the right server can make it easier.
If you join a server focused on a particular topic or community of interest, you get a timeline about that topic without having to follow anyone.
## Challenges in Building Recommendation Systems on DOSNs {.small}
1. **Tensions around centralization**: a single service providing recommendations for all servers probably won't work.
1. **Local control**: system should be opt-in, server admins should be able to filter servers they accept data from.
1. **Computing power**: needs to be able to run on servers with limited resources.
## Concept: Use Hashtags
Advantages:
1. Hashtags already have their own table in the database.
2. Clear opt-in to public visibility.
## Design
For the most popular tags by their local users, each server reports:
1. A list of top tags
2. The number of accounts using each tag in the last 6 months
3. The number of accounts using any tag on the server.
. . .
Weigh the model using term frequency-inverse document frequency (TF-IDF)
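A hypothetical sketch of one server's report (the field names are illustrative, not an existing Mastodon API):
```{r}
#| eval: false
#| echo: true
# Hypothetical per-server report payload (field names are illustrative).
report <- list(
  server = "example.social",
  tagged_accounts_total = 1200,  # accounts using any tag on the server
  tags = list(
    list(tag = "linux", accounts = 85),       # accounts using the tag, last 6 months
    list(tag = "photography", accounts = 62),
    list(tag = "fediverse", accounts = 41)
  )
)
jsonlite::toJSON(report, auto_unbox = TRUE, pretty = TRUE)
```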
## Challenge
How many servers do we need?
How many tags do they need to report?
## Baseline comparison
+ Data from all servers with over 100 accounts using hashtags.
+ Use cosine similarity to find pairwise similarity between all servers.
+ Compare to simulations with limits on the number of servers and number of tags reported.
Comparison metric: rank biased overlap (RBO).
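A minimal sketch of the pairwise comparison step, assuming a servers-by-tags matrix `tag_matrix` of TF-IDF weights with server domains as row names (the name and shape are assumptions):
```{r}
#| eval: false
#| echo: true
# Pairwise cosine similarity between servers' TF-IDF tag vectors.
row_norms <- sqrt(rowSums(tag_matrix^2))
normalized <- tag_matrix / row_norms           # unit-length rows
server_similarity <- normalized %*% t(normalized)
# e.g., the five most similar servers to one server (dropping the server itself)
sort(server_similarity["mastodon.art", ], decreasing = TRUE)[2:6]
```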
## Overlap with Baseline in Various Simulations
```{r}
#| label: fig-simulations-rbo
#| fig-width: 10
simulations <- arrow::read_ipc_file(here("data/scratch/simulation_rbo.feather"))
simulations %>%
group_by(servers, tags, run) %>% summarize(rbo=mean(rbo), .groups="drop") %>%
mutate(ltags = as.integer(log2(tags))) %>%
ggplot(aes(x = factor(ltags), y = rbo, fill = factor(ltags))) +
geom_boxplot() +
facet_wrap(~servers, nrow=1) +
scale_y_continuous(limits = c(0, 1)) +
labs(x = "Tags (log2)", y = "RBO", title = "Rank Biased Overlap with Baseline Rankings by Number of Servers") +
theme_minimal() + theme(legend.position = "none")
```
## Example Recommendation System
+ Use just servers from joinmastodon.org
+ Ask for preferences from a bag of common tags.
+ Suggest top servers according to similarity.
## User 1: education, science, academia
Top suggestions:
+ mathstodon.xyz
+ sciences.social
+ mastodon.education
+ hcommons.social
+ mas.to
## User 2: tech, linux, hacking
Top suggestions:
+ snabelen.no
+ social.anoxinon.de
+ peoplemaking.games
+ mastodon.gamedev.place
+ discuss.systems

View File

@ -1,3 +1,19 @@
@inproceedings{burkeFeedMeMotivating2009,
title = {Feed {{Me}}: {{Motivating Newcomer Contribution}} in {{Social Network Sites}}},
shorttitle = {Feed {{Me}}},
booktitle = {Proceedings of the {{SIGCHI Conference}} on {{Human Factors}} in {{Computing Systems}}},
author = {Burke, Moira and Marlow, Cameron and Lento, Thomas},
year = {2009},
series = {{{CHI}} '09},
pages = {945--954},
publisher = {{ACM}},
address = {{New York, NY, USA}},
doi = {10.1145/1518701.1518847},
urldate = {2017-08-02},
abstract = {Social networking sites (SNS) are only as good as the content their users share. Therefore, designers of SNS seek to improve the overall user experience by encouraging members to contribute more content. However, user motivations for contribution in SNS are not well understood. This is particularly true for newcomers, who may not recognize the value of contribution. Using server log data from approximately 140,000 newcomers in Facebook, we predict long-term sharing based on the experiences the newcomers have in their first two weeks. We test four mechanisms: social learning, singling out, feedback, and distribution. In particular, we find support for social learning: newcomers who see their friends contributing go on to share more content themselves. For newcomers who are initially inclined to contribute, receiving feedback and having a wide audience are also predictors of increased sharing. On the other hand, singling out appears to affect only those newcomers who are not initially inclined to share. The paper concludes with design implications for motivating newcomer sharing in online communities.},
isbn = {978-1-60558-246-7}
}
@article{cavaDriversSocialInfluence2023,
title = {Drivers of Social Influence in the {{Twitter}} Migration to {{Mastodon}}},
author = {Cava, Lucio La and Aiello, Luca Maria and Tagarelli, Andrea},
@ -14,6 +30,29 @@
langid = {english}
}
@misc{diazUsingMastodonWay2022,
title = {Using {{Mastodon}} Is Way Too Complicated to Ever Topple {{Twitter}}},
author = {Diaz, Jesus},
year = {2022},
month = nov,
journal = {Fast Company},
urldate = {2024-02-22},
abstract = {Great idea in theory, a total pain in practice.},
howpublished = {https://www.fastcompany.com/90808984/using-mastodon-is-way-too-complicated-to-ever-topple-twitter},
langid = {english}
}
@misc{driscollWeMisrememberEternal2023,
title = {Do We Misremember {{Eternal September}}?},
shorttitle = {Do We Misremember {{Eternal September}}?},
author = {Driscoll, Kevin},
year = {2023},
month = apr,
journal = {FLOW},
urldate = {2024-02-22},
langid = {american}
}
@article{fieslerMovingLandsOnline2020,
title = {Moving across Lands: Online Platform Migration in Fandom Communities},
shorttitle = {Moving across Lands},
@ -29,14 +68,81 @@
abstract = {When online platforms rise and fall, sometimes communities fade away, and sometimes they pack their bags and relocate to a new home. To explore the causes and effects of online community migration, we examine transformative fandom, a longstanding, technology-agnostic community surrounding the creation, sharing, and discussion of creative works based on existing media. For over three decades, community members have left and joined many different online spaces, from Usenet to Tumblr to platforms of their own design. Through analysis of 28 in-depth interviews and 1,886 survey responses from fandom participants, we traced these migrations, the reasons behind them, and their impact on the community. Our findings highlight catalysts for migration that provide insights into factors that contribute to success and failure of platforms, including issues surrounding policy, design, and community. Further insights into the disruptive consequences of migrations (such as social fragmentation and lost content) suggest ways that platforms might both support commitment and better support migration when it occurs.}
}
@inproceedings{heFlockingMastodonTracking2023,
title = {Flocking to {{Mastodon}}: {{Tracking}} the {{Great Twitter Migration}}},
shorttitle = {Flocking to {{Mastodon}}},
booktitle = {Proceedings of the 2023 {{ACM}} on {{Internet Measurement Conference}}},
author = {He, Jiahui and Zia, Haris Bin and Castro, Ignacio and Raman, Aravindh and Sastry, Nishanth and Tyson, Gareth},
year = {2023},
month = oct,
series = {{{IMC}} '23},
pages = {111--123},
publisher = {{Association for Computing Machinery}},
address = {{New York, NY, USA}},
doi = {10.1145/3618257.3624819},
urldate = {2024-02-22},
abstract = {The acquisition of Twitter by Elon Musk has spurred controversy and uncertainty among Twitter users. The move raised both praise and concerns, particularly regarding Musk's views on free speech. As a result, a large number of Twitter users have looked for alternatives to Twitter. Mastodon, a decentralized micro-blogging social network, has attracted the attention of many users and the general media. In this paper, we analyze the migration of 136,009 users from Twitter to Mastodon. We inspect the impact that this has on the wider Mastodon ecosystem, particularly in terms of user-driven pressure towards centralization. We further explore factors that influence users to migrate, highlighting the effect of users' social networks. Finally, we inspect the behavior of individual users, showing how they utilize both Twitter and Mastodon in parallel. We find a clear difference in the topics discussed on the two platforms. This leads us to build classifiers to explore if migration is predictable. Through feature analysis, we find that the content of tweets as well as the number of URLs, the number of likes, and the length of tweets are effective metrics for the prediction of user migration.},
isbn = {9798400703829},
keywords = {machine learning,mastodon,topic modeling,twitter,user migration}
}
@article{hooverMastodonBumpNow2023,
title = {The {{Mastodon Bump Is Now}} a {{Slump}}},
author = {Hoover, Amanda},
year = {2023},
month = feb,
journal = {Wired},
issn = {1059-1028},
urldate = {2023-10-21},
abstract = {Active users have fallen by more than 1 million since the exodus from Elon Musk's Twitter, suggesting the decentralized platform is not a direct replacement.},
chapter = {tags},
langid = {american},
keywords = {communities,content moderation,elon musk,mastodon,platforms,social,social media,twitter}
}
@book{krautBuildingSuccessfulOnline2011,
ids = {kraut_building_2011,kraut_building_2011-1,kraut_building_2011-3},
title = {Building {{Successful Online Communities}}: {{Evidence-Based Social Design}}},
shorttitle = {Building {{Successful Online Communities}}},
author = {Kraut, Robert E. and Resnick, Paul and Kiesler, Sara},
year = {2011},
publisher = {{MIT Press}},
address = {{Cambridge, Mass}},
isbn = {978-0-262-01657-5},
lccn = {HM742 .K73 2011},
keywords = {Computer networks,internet,Online social networks,Planning,Social aspects,Social aspects Planning,Social psychology}
}
@article{newellUserMigrationOnline2021,
title = {User {{Migration}} in {{Online Social Networks}}: {{A Case Study}} on {{Reddit During}} a {{Period}} of {{Community Unrest}}},
author = {Newell, Edward and Jurgens, David and Saleem, Haji Mohammad and Vala, Hardik and Sassine, Jad and Armstrong, Caitrin and Ruths, Derek},
year = {2021},
month = aug,
journal = {Proceedings of the International AAAI Conference on Web and Social Media},
pages = {279--288},
doi = {10.1609/icwsm.v10i1.14750},
abstract = {Platforms like Reddit have attracted large and vibrant communities, but the individuals in those communities are free to migrate to other platforms at any time. History has borne this out with the mass migration from Slashdot to Digg. The underlying motivations of individuals who migrate between platforms, and the conditions that favor migration online are not well-understood. We examine Reddit during a period of community unrest affecting millions of users in the summer of 2015, and analyze large-scale changes in user behavior and migration patterns to Reddit-like alternative platforms. Using self-reported statements from user comments, surveys, and a computational analysis of the activity of users with accounts on multiple platforms, we identify the primary motivations driving user migration. While a notable number of Reddit users left for other platforms, we found that an important pull factor that enabled Reddit to retain users was its long tail of niche content. Other platforms may reach critical mass to support popular or ``mainstream'' topics, but Reddit's large userbase provides a key advantage in supporting niche topics.},
langid = {english}
}
@inproceedings{nicholsonMastodonRulesCharacterizing2023,
title = {Mastodon {{Rules}}: {{Characterizing Formal Rules}} on {{Popular Mastodon Instances}}},
shorttitle = {Mastodon {{Rules}}},
booktitle = {Companion {{Publication}} of the 2023 {{Conference}} on {{Computer Supported Cooperative Work}} and {{Social Computing}}},
author = {Nicholson, Matthew N. and Keegan, Brian C and Fiesler, Casey},
year = {2023},
month = oct,
series = {{{CSCW}} '23 {{Companion}}},
pages = {86--90},
publisher = {{Association for Computing Machinery}},
address = {{New York, NY, USA}},
doi = {10.1145/3584931.3606970},
urldate = {2024-01-16},
abstract = {Federated social networking is an increasingly popular alternative to more traditional, centralized forms. Yet, this federated arrangement can lead to dramatically different experiences across the network. Using a sample of the most popular instances on the federated social network Mastodon, we characterize the types of rules present in this emerging space. We then compare these rules to those on Reddit, as an example of a different, less centralized space. Rules on Mastodon often pay particular attention to issues of harassment and hate {\textemdash} strongly reflecting the spirit of the Mastodon Covenant. We speculate that these rules may have emerged in response to problems of other platforms, and reflect a lack of support for instance maintainers. With this work, we call for the development of additional instance-level governance and technical scaffolding, and raise questions for future work into the development, values, and value tensions present in the broader federated social networking landscape.},
isbn = {9798400701290},
keywords = {community rules,Mastodon,online communities}
}
@inproceedings{teblunthuisIdentifyingCompetitionMutualism2022,
title = {Identifying Competition and Mutualism between Online Groups},
booktitle = {International {{AAAI Conference}} on {{Web}} and {{Social Media}} ({{ICWSM}} 2022)},
@ -51,3 +157,36 @@
abstract = {Platforms often host multiple online groups with highly overlapping topics and members. How can researchers and designers understand how interactions between related groups affect measures of group health? Inspired by population ecology, prior social computing research has studied competition and mutualism among related groups by correlating group size with degrees of overlap in content and membership. The resulting body of evidence is puzzling as overlaps seem sometimes to help and other times to hurt. We suggest that this confusion results from aggregating inter-group relationships into an overall environmental effect instead of focusing on networks of competition and mutualism among groups. We propose a theoretical framework based on community ecology and a method for inferring competitive and mutualistic interactions from time series participation data. We compare population and community ecology analyses of online community growth by analyzing clusters of subreddits with high user overlap but varying degrees of competition and mutualism.},
keywords = {Computer Science - Human-Computer Interaction,Computer Science - Social and Information Networks}
}
@article{webberSimilarityMeasureIndefinite2010,
title = {A Similarity Measure for Indefinite Rankings},
author = {Webber, William and Moffat, Alistair and Zobel, Justin},
year = {2010},
month = nov,
journal = {ACM Transactions on Information Systems},
volume = {28},
number = {4},
pages = {20:1--20:38},
issn = {1046-8188},
doi = {10.1145/1852102.1852106},
urldate = {2024-02-14},
abstract = {Ranked lists are encountered in research and daily life and it is often of interest to compare these lists even when they are incomplete or have only some members in common. An example is document rankings returned for the same query by different search engines. A measure of the similarity between incomplete rankings should handle nonconjointness, weight high ranks more heavily than low, and be monotonic with increasing depth of evaluation; but no measure satisfying all these criteria currently exists. In this article, we propose a new measure having these qualities, namely rank-biased overlap (RBO). The RBO measure is based on a simple probabilistic user model. It provides monotonicity by calculating, at a given depth of evaluation, a base score that is non-decreasing with additional evaluation, and a maximum score that is nonincreasing. An extrapolated score can be calculated between these bounds if a point estimate is required. RBO has a parameter which determines the strength of the weighting to top ranks. We extend RBO to handle tied ranks and rankings of different lengths. Finally, we give examples of the use of the measure in comparing the results produced by public search engines and in assessing retrieval systems in the laboratory.},
keywords = {probabilistic models,Rank correlation,ranking}
}
@article{zulliRethinkingSocialSocial2020,
title = {Rethinking the ``Social'' in ``Social Media'': {{Insights}} into Topology, Abstraction, and Scale on the {{Mastodon}} Social Network},
shorttitle = {Rethinking the ``Social'' in ``Social Media''},
author = {Zulli, Diana and Liu, Miao and Gehl, Robert},
year = {2020},
month = jul,
journal = {New Media \& Society},
volume = {22},
number = {7},
pages = {1188--1205},
publisher = {{SAGE Publications}},
issn = {1461-4448},
doi = {10.1177/1461444820912533},
urldate = {2022-03-13},
abstract = {Online interactions are often understood through the corporate social media (CSM) model where social interactions are determined through layers of abstraction and centralization that eliminate users from decision-making processes. This study demonstrates how alternative social media (ASM)?namely Mastodon?restructure the relationship between the technical structure of social media and the social interactions that follow, offering a particular type of sociality distinct from CSM. Drawing from a variety of qualitative data, this analysis finds that (1) the decentralized structure of Mastodon enables community autonomy, (2) Mastodon?s open-source protocol allows the internal and technical development of the site to become a social enterprise in and of itself, and (3) Mastodon?s horizontal structure shifts the site?s scaling focus from sheer number of users to quality engagement and niche communities. To this end, Mastodon helps us rethink ?the social? in social media in terms of topology, abstraction, and scale.}
}

592
renv.lock
View File

@ -8,12 +8,42 @@
}
]
},
"Bioconductor": {
"Version": "3.18"
},
"Python": {
"Version": "3.12.1",
"Version": "3.9.18",
"Type": "virtualenv",
"Name": "./renv/python/virtualenvs/renv-python-3.12"
"Name": "./renv/python/virtualenvs/renv-python-3.9"
},
"Packages": {
"BH": {
"Package": "BH",
"Version": "1.84.0-0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "a8235afbcd6316e6e91433ea47661013"
},
"BiocManager": {
"Package": "BiocManager",
"Version": "1.30.22",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"utils"
],
"Hash": "d57e43105a1aa9cb54fdb4629725acb1"
},
"BiocVersion": {
"Package": "BiocVersion",
"Version": "3.18.1",
"Source": "Bioconductor",
"Repository": "Bioconductor 3.18",
"Requirements": [
"R"
],
"Hash": "2ecaed86684f5fae76ed5530f9d29c4a"
},
"DBI": {
"Package": "DBI",
"Version": "1.2.1",
@ -35,6 +65,23 @@
],
"Hash": "72f87e0092e39384aee16df8d67d7410"
},
"DT": {
"Package": "DT",
"Version": "0.31",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"crosstalk",
"htmltools",
"htmlwidgets",
"httpuv",
"jquerylib",
"jsonlite",
"magrittr",
"promises"
],
"Hash": "77b5189f5272ae2b21e3ac2175ad107c"
},
"DiagrammeR": {
"Package": "DiagrammeR",
"Version": "1.0.10",
@ -65,7 +112,7 @@
},
"MASS": {
"Package": "MASS",
"Version": "7.3-60",
"Version": "7.3-60.0.1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
@ -76,7 +123,7 @@
"stats",
"utils"
],
"Hash": "a56a6365b3fa73293ea8d084be0d9bb0"
"Hash": "b765b28387acc8ec9e9c1530713cb19c"
},
"Matrix": {
"Package": "Matrix",
@ -126,19 +173,6 @@
],
"Hash": "5ea2700d21e038ace58269ecdbeb9ec0"
},
"RcppEigen": {
"Package": "RcppEigen",
"Version": "0.3.3.9.4",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"Rcpp",
"stats",
"utils"
],
"Hash": "acb0a5bf38490f26ab8661b467f4f53a"
},
"RcppTOML": {
"Package": "RcppTOML",
"Version": "0.2.2",
@ -164,6 +198,18 @@
],
"Hash": "3e1384ada5d3948b392e98b11434d972"
},
"anytime": {
"Package": "anytime",
"Version": "0.3.9",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"BH",
"R",
"Rcpp"
],
"Hash": "74a64813f17b492da9c6afda6b128e3d"
},
"arrow": {
"Package": "arrow",
"Version": "14.0.0.2",
@ -278,28 +324,6 @@
],
"Hash": "40415719b5a479b87949f3aa0aee737c"
},
"boot": {
"Package": "boot",
"Version": "1.3-28.1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"graphics",
"stats"
],
"Hash": "9a052fbcbe97a98ceb18dbfd30ebd96e"
},
"brio": {
"Package": "brio",
"Version": "1.1.4",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R"
],
"Hash": "68bd2b066e1fe780bbf62fc8bcc36de3"
},
"broom": {
"Package": "broom",
"Version": "1.0.5",
@ -436,6 +460,13 @@
],
"Hash": "f20c47fd52fae58b4e377c37bb8c335b"
},
"commonmark": {
"Package": "commonmark",
"Version": "1.9.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "d691c61bff84bd63c383874d2d0c3307"
},
"conflicted": {
"Package": "conflicted",
"Version": "1.2.0",
@ -471,6 +502,19 @@
],
"Hash": "e8a1e41acf02548751f45c718d55aa6a"
},
"crosstalk": {
"Package": "crosstalk",
"Version": "1.2.1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R6",
"htmltools",
"jsonlite",
"lazyeval"
],
"Hash": "ab12c7b080a57475248a30f4db6298c0"
},
"curl": {
"Package": "curl",
"Version": "5.2.0",
@ -546,34 +590,6 @@
],
"Hash": "59351f28a81f0742720b85363c4fdd61"
},
"desc": {
"Package": "desc",
"Version": "1.4.3",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"R6",
"cli",
"utils"
],
"Hash": "99b79fcbd6c4d1ce087f5c5c758b384f"
},
"diffobj": {
"Package": "diffobj",
"Version": "0.3.5",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"crayon",
"methods",
"stats",
"tools",
"utils"
],
"Hash": "bcaa8b95f8d7d01a5dedfd959ce88ab8"
},
"digest": {
"Package": "digest",
"Version": "0.6.34",
@ -585,6 +601,26 @@
],
"Hash": "7ede2ee9ea8d3edbf1ca84c1e333ad1a"
},
"distributional": {
"Package": "distributional",
"Version": "0.3.2",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"digest",
"farver",
"generics",
"ggplot2",
"lifecycle",
"numDeriv",
"rlang",
"scales",
"stats",
"utils",
"vctrs"
],
"Hash": "0a94c3c917918a1c90f4609171ff41b6"
},
"downloader": {
"Package": "downloader",
"Version": "0.4",
@ -638,6 +674,26 @@
],
"Hash": "54ed3ea01b11e81a86544faaecfef8e2"
},
"easylabel": {
"Package": "easylabel",
"Version": "0.2.7",
"Source": "Bioconductor",
"Repository": "CRAN",
"Requirements": [
"DT",
"R",
"RColorBrewer",
"ggplot2",
"gtools",
"plotly",
"rlang",
"shiny",
"shinybusy",
"shinycssloaders",
"splus2R"
],
"Hash": "d5f363578f9dbfdb46a2aa7d92cc231b"
},
"ellipsis": {
"Package": "ellipsis",
"Version": "0.3.2",
@ -723,6 +779,51 @@
],
"Hash": "daf4a1246be12c1fa8c7705a0935c1a0"
},
"fable": {
"Package": "fable",
"Version": "0.3.3",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"Rcpp",
"distributional",
"dplyr",
"fabletools",
"rlang",
"stats",
"tibble",
"tidyr",
"tsibble",
"utils"
],
"Hash": "63a6c529070640737804d2f0a0d9ffbc"
},
"fabletools": {
"Package": "fabletools",
"Version": "0.3.4",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"R6",
"distributional",
"dplyr",
"generics",
"ggplot2",
"lifecycle",
"progressr",
"rlang",
"stats",
"tibble",
"tidyr",
"tidyselect",
"tsibble",
"utils",
"vctrs"
],
"Hash": "025c5b49246221a17a646329f45b5abc"
},
"fansi": {
"Package": "fansi",
"Version": "1.0.6",
@ -959,6 +1060,18 @@
],
"Hash": "b29cf3031f49b04ab9c852c912547eef"
},
"gtools": {
"Package": "gtools",
"Version": "3.9.5",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"methods",
"stats",
"utils"
],
"Hash": "588d091c35389f1f4a9d533c8d709b35"
},
"haven": {
"Package": "haven",
"Version": "2.5.4",
@ -1047,6 +1160,21 @@
],
"Hash": "04291cc45198225444a397606810ac37"
},
"httpuv": {
"Package": "httpuv",
"Version": "1.6.14",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"R6",
"Rcpp",
"later",
"promises",
"utils"
],
"Hash": "16abeb167dbf511f8cc0552efaf05bab"
},
"httr": {
"Package": "httr",
"Version": "1.4.7",
@ -1191,6 +1319,17 @@
],
"Hash": "b64ec208ac5bc1852b285f665d6368b3"
},
"later": {
"Package": "later",
"Version": "1.3.2",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"Rcpp",
"rlang"
],
"Hash": "a3e051d405326b8b0012377434c62b37"
},
"lattice": {
"Package": "lattice",
"Version": "0.21-9",
@ -1206,6 +1345,16 @@
],
"Hash": "5558c61e0136e247252f5f952cdaad6a"
},
"lazyeval": {
"Package": "lazyeval",
"Version": "0.2.2",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R"
],
"Hash": "d908914ae53b04d4c0c0fd72ecc35370"
},
"lifecycle": {
"Package": "lifecycle",
"Version": "1.0.4",
@ -1219,31 +1368,18 @@
],
"Hash": "b8552d117e1b808b09a832f589b79035"
},
"lme4": {
"Package": "lme4",
"Version": "1.1-35.1",
"lmtest": {
"Package": "lmtest",
"Version": "0.9-40",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"MASS",
"Matrix",
"R",
"Rcpp",
"RcppEigen",
"boot",
"graphics",
"grid",
"lattice",
"methods",
"minqa",
"nlme",
"nloptr",
"parallel",
"splines",
"stats",
"utils"
"zoo"
],
"Hash": "07fb0c5b727b15b0ce40feb641498e4e"
"Hash": "c6fafa6cccb1e1dfe7f7d122efd6e6a7"
},
"lpSolveAPI": {
"Package": "lpSolveAPI",
@ -1313,16 +1449,6 @@
],
"Hash": "18e9c28c1d3ca1560ce30658b22ce104"
},
"minqa": {
"Package": "minqa",
"Version": "1.2.6",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"Rcpp"
],
"Hash": "f48238f8d4740426ca12f53f27d004dd"
},
"modelr": {
"Package": "modelr",
"Version": "0.1.11",
@ -1428,15 +1554,15 @@
],
"Hash": "8d1938040a05566f4f7a14af4feadd6b"
},
"nloptr": {
"Package": "nloptr",
"Version": "2.0.3",
"numDeriv": {
"Package": "numDeriv",
"Version": "2016.8-1.1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"testthat"
"R"
],
"Hash": "277c67a08f358f42b6a77826e4492f79"
"Hash": "df58958f293b166e4ab885ebcad90e02"
},
"openssl": {
"Package": "openssl",
@ -1516,21 +1642,6 @@
],
"Hash": "15da5a8412f317beeee6175fbc76f4bb"
},
"pkgbuild": {
"Package": "pkgbuild",
"Version": "1.4.3",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"R6",
"callr",
"cli",
"desc",
"processx"
],
"Hash": "c0143443203205e6a2760ce553dafc24"
},
"pkgconfig": {
"Package": "pkgconfig",
"Version": "2.0.3",
@ -1541,26 +1652,37 @@
],
"Hash": "01f28d4278f15c76cddbea05899c5d6f"
},
"pkgload": {
"Package": "pkgload",
"Version": "1.3.4",
"plotly": {
"Package": "plotly",
"Version": "4.10.4",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"cli",
"crayon",
"desc",
"fs",
"glue",
"methods",
"pkgbuild",
"RColorBrewer",
"base64enc",
"crosstalk",
"data.table",
"digest",
"dplyr",
"ggplot2",
"htmltools",
"htmlwidgets",
"httr",
"jsonlite",
"lazyeval",
"magrittr",
"promises",
"purrr",
"rlang",
"rprojroot",
"utils",
"withr"
"scales",
"tibble",
"tidyr",
"tools",
"vctrs",
"viridisLite"
],
"Hash": "876c618df5ae610be84356d5d7a5d124"
"Hash": "a1ac5c03ad5ad12b9d1597e00e23c3dd"
},
"png": {
"Package": "png",
@ -1572,13 +1694,6 @@
],
"Hash": "bd54ba8a0a5faded999a7aab6e46b374"
},
"praise": {
"Package": "praise",
"Version": "1.0.0",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "a555924add98c99d2f411e37e7d25e9f"
},
"prettyunits": {
"Package": "prettyunits",
"Version": "1.2.0",
@ -1616,6 +1731,34 @@
],
"Hash": "f4625e061cb2865f111b47ff163a5ca6"
},
"progressr": {
"Package": "progressr",
"Version": "0.14.0",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"digest",
"utils"
],
"Hash": "ac50c4ffa8f6a46580dd4d7813add3c4"
},
"promises": {
"Package": "promises",
"Version": "1.2.1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R6",
"Rcpp",
"fastmap",
"later",
"magrittr",
"rlang",
"stats"
],
"Hash": "0d8a15c9d000970ada1ab21405387dee"
},
"ps": {
"Package": "ps",
"Version": "1.7.6",
@ -1942,6 +2085,67 @@
],
"Hash": "3838071b66e0c566d55cc26bd6e27bf4"
},
"shiny": {
"Package": "shiny",
"Version": "1.8.0",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"R6",
"bslib",
"cachem",
"commonmark",
"crayon",
"ellipsis",
"fastmap",
"fontawesome",
"glue",
"grDevices",
"htmltools",
"httpuv",
"jsonlite",
"later",
"lifecycle",
"methods",
"mime",
"promises",
"rlang",
"sourcetools",
"tools",
"utils",
"withr",
"xtable"
],
"Hash": "3a1f41807d648a908e3c7f0334bf85e6"
},
"shinybusy": {
"Package": "shinybusy",
"Version": "0.3.2",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"htmltools",
"htmlwidgets",
"jsonlite",
"shiny"
],
"Hash": "cfb1edf0a7b334747556da1598ef67b1"
},
"shinycssloaders": {
"Package": "shinycssloaders",
"Version": "1.0.0",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"digest",
"glue",
"grDevices",
"shiny"
],
"Hash": "f39bb3c44a9b496723ec7e86f9a771d8"
},
"sna": {
"Package": "sna",
"Version": "2.7-2",
@ -1955,6 +2159,27 @@
],
"Hash": "050b4098cef7fa1827c5c199559fde1f"
},
"sourcetools": {
"Package": "sourcetools",
"Version": "0.1.7-1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R"
],
"Hash": "5f5a7629f956619d519205ec475fe647"
},
"splus2R": {
"Package": "splus2R",
"Version": "1.3-4",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"methods"
],
"Hash": "429a251c74ad7106391056fa88f73f7f"
},
"statnet": {
"Package": "statnet",
"Version": "2019.6",
@ -2099,35 +2324,6 @@
],
"Hash": "49f1958da2a787ab5dfc833676a0b450"
},
"testthat": {
"Package": "testthat",
"Version": "3.2.1",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"R6",
"brio",
"callr",
"cli",
"desc",
"digest",
"evaluate",
"jsonlite",
"lifecycle",
"magrittr",
"methods",
"pkgload",
"praise",
"processx",
"ps",
"rlang",
"utils",
"waldo",
"withr"
],
"Hash": "4767a686ebe986e6cb01d075b3f09729"
},
"textshaping": {
"Package": "textshaping",
"Version": "0.3.7",
@ -2270,6 +2466,27 @@
],
"Hash": "f50614d2145ba1012b62ff75f2129d5b"
},
"tsibble": {
"Package": "tsibble",
"Version": "1.1.4",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"anytime",
"dplyr",
"ellipsis",
"generics",
"lifecycle",
"lubridate",
"methods",
"rlang",
"tibble",
"tidyselect",
"vctrs"
],
"Hash": "d5da786ac5a28f62ca2eb8255ad7b9f3"
},
"tsna": {
"Package": "tsna",
"Version": "0.3.5",
@ -2395,24 +2612,6 @@
],
"Hash": "390f9315bc0025be03012054103d227c"
},
"waldo": {
"Package": "waldo",
"Version": "0.5.2",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"cli",
"diffobj",
"fansi",
"glue",
"methods",
"rematch2",
"rlang",
"tibble"
],
"Hash": "c7d3fd6d29ab077cbac8f0e2751449e6"
},
"withr": {
"Package": "withr",
"Version": "3.0.0",
@ -2449,12 +2648,39 @@
],
"Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61"
},
"xtable": {
"Package": "xtable",
"Version": "1.8-4",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"stats",
"utils"
],
"Hash": "b8acdf8af494d9ec19ccb2481a9b11c2"
},
"yaml": {
"Package": "yaml",
"Version": "2.3.8",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "29240487a071f535f5e5d5a323b7afbd"
},
"zoo": {
"Package": "zoo",
"Version": "1.8-12",
"Source": "Repository",
"Repository": "CRAN",
"Requirements": [
"R",
"grDevices",
"graphics",
"lattice",
"stats",
"utils"
],
"Hash": "5c715954112b45499fb1dadc6ee6ee3e"
}
}
}
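The updated renv.lock above pins both the R packages (now including forecasting dependencies such as fable, fabletools, tsibble, and distributional) and a Python 3.9 virtualenv. As a hedged sketch of how a collaborator could rebuild this environment from the lockfile, assuming a standard renv setup in the project root (the calls below are the renv package's documented API, not anything defined in this repository):

# restore the R packages and the recorded Python environment from renv.lock
renv::restore()

# report any remaining drift between the installed library and the lockfile
renv::status()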

View File

@ -7,10 +7,12 @@ executing==2.0.1
ipykernel==6.29.0
ipython==8.20.0
jedi==0.19.1
joblib==1.3.2
jupyter_client==8.6.0
jupyter_core==5.7.1
matplotlib-inline==0.1.6
nest-asyncio==1.6.0
numpy==1.26.4
packaging==23.2
parso==0.8.3
pexpect==4.9.0
@ -23,9 +25,12 @@ pure-eval==0.2.2
Pygments==2.17.2
python-dateutil==2.8.2
pyzmq==25.1.2
scikit-learn==1.4.0
scipy==1.12.0
setuptools==69.0.3
six==1.16.0
stack-data==0.6.3
threadpoolctl==3.2.0
tornado==6.4
traitlets==5.14.1
wcwidth==0.2.13
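The Python pins above add the scientific stack used by the preprocessing code (numpy, scipy, and scikit-learn, plus its joblib and threadpoolctl dependencies). A minimal sketch, assuming reticulate is used to bind the virtualenv recorded in renv.lock, of checking from R that the newly pinned modules resolve before rendering; the virtualenv path is the one listed in the lockfile:

library(reticulate)

# point reticulate at the renv-managed Python 3.9 virtualenv from renv.lock
use_virtualenv("./renv/python/virtualenvs/renv-python-3.9", required = TRUE)

# confirm the newly pinned modules are importable
stopifnot(
  py_module_available("numpy"),
  py_module_available("scipy"),
  py_module_available("sklearn")
)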

145
template.tex Normal file
View File

@ -0,0 +1,145 @@
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage[submission]{aaai24} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
\usepackage{helvet} % DO NOT CHANGE THIS
\usepackage{courier} % DO NOT CHANGE THIS
\usepackage[hyphens]{url} % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm} % DO NOT CHANGE THIS
\usepackage{natbib} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in} % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in} % DO NOT CHANGE THIS
%
% These are recommended to typeset algorithms but not required. See the subsubsection on algorithms. Remove them if you don't have algorithms in your paper.
%\usepackage{algorithm}
%\usepackage{algorithmic}
%
% These are are recommended to typeset listings but not required. See the subsubsection on listing. Remove this block if you don't have listings in your paper.
%\usepackage{newfloat}
%\usepackage{listings}
%\DeclareCaptionStyle{ruled}{labelfont=normalfont,labelsep=colon,strut=off} % DO NOT CHANGE THIS
%\lstset{%
% basicstyle={\footnotesize\ttfamily},% footnotesize acceptable for monospace
% numbers=left,numberstyle=\footnotesize,xleftmargin=2em,% show line numbers, remove this entire line if you don't want the numbers.
% aboveskip=0pt,belowskip=0pt,%
% showstringspaces=false,tabsize=2,breaklines=true}
%\floatstyle{ruled}
%\newfloat{listing}{tb}{lst}{}
%\floatname{listing}{Listing}
%
% Keep the \pdfinfo as shown here. There's no need
% for you to add the /Title and /Author tags.
\pdfinfo{
/TemplateVersion (2024.1)
}
% DISALLOWED PACKAGES
% \usepackage{authblk} -- This package is specifically forbidden
% \usepackage{balance} -- This package is specifically forbidden
% \usepackage{color} (if used in text)
% \usepackage{CJK} -- This package is specifically forbidden
% \usepackage{float} -- This package is specifically forbidden
% \usepackage{flushend} -- This package is specifically forbidden
% \usepackage{fontenc} -- This package is specifically forbidden
% \usepackage{fullpage} -- This package is specifically forbidden
% \usepackage{geometry} -- This package is specifically forbidden
% \usepackage{grffile} -- This package is specifically forbidden
% \usepackage{hyperref} -- This package is specifically forbidden
% \usepackage{navigator} -- This package is specifically forbidden
% (or any other package that embeds links such as navigator or hyperref)
% \usepackage{indentfirst} -- This package is specifically forbidden
% \usepackage{layout} -- This package is specifically forbidden
% \usepackage{multicol} -- This package is specifically forbidden
% \usepackage{nameref} -- This package is specifically forbidden
% \usepackage{savetrees} -- This package is specifically forbidden
% \usepackage{setspace} -- This package is specifically forbidden
% \usepackage{stfloats} -- This package is specifically forbidden
% \usepackage{tabu} -- This package is specifically forbidden
% \usepackage{titlesec} -- This package is specifically forbidden
% \usepackage{tocbibind} -- This package is specifically forbidden
% \usepackage{ulem} -- This package is specifically forbidden
% \usepackage{wrapfig} -- This package is specifically forbidden
% DISALLOWED COMMANDS
% \nocopyright -- Your paper will not be published if you use this command
% \addtolength -- This command may not be used
% \balance -- This command may not be used
% \baselinestretch -- Your paper will not be published if you use this command
% \clearpage -- No page breaks of any kind may be used for the final version of your paper
% \columnsep -- This command may not be used
% \newpage -- No page breaks of any kind may be used for the final version of your paper
% \pagebreak -- No page breaks of any kind may be used for the final version of your paper
% \pagestyle -- This command may not be used
% \tiny -- This is not an acceptable font size.
% \vspace{- -- No negative value may be used in proximity of a caption, figure, table, section, subsection, subsubsection, or reference
% \vskip{- -- No negative value may be used to alter spacing above or below a caption, figure, table, section, subsection, subsubsection, or reference
\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numbers are desired.
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{siunitx}
\newcolumntype{d}{S[
input-open-uncertainty=,
input-close-uncertainty=,
parse-numbers = false,
table-align-text-pre=false,
table-align-text-post=false
]}
\def\tightlist{}
\def\phantomsection{}
\newcommand\hypertarget[2]{#2}
\newcommand\texorpdfstring[2]{#1}
\newcommand\bookmarksetup[1]{}
\newcommand\href[2]{#2} % hyperref is disallowed, so keep only the link text of \href{url}{text}
\usepackage{longtable}
%\renewenvironment{longtable}{\begin{center}\begin{tabular}}{\end{tabular}\end{center}}
%\def\endhead{}
%\renewcommand{\toprule}[2]{\hline}
%\renewcommand{\midrule}[2]{\hline}
%\renewcommand{\bottomrule}[2]{\hline}
% long table two column hack
\makeatletter
\let\oldlt\longtable
\let\endoldlt\endlongtable
\def\longtable{\@ifnextchar[\longtable@i \longtable@ii}
\def\longtable@i[#1]{\begin{figure}[htbp]
\begin{minipage}{0.5\textwidth}
\onecolumn
\oldlt[#1]
}
\def\longtable@ii{\begin{figure}[htbp]
\begin{minipage}{0.5\textwidth}
\onecolumn
\oldlt
}
\def\endlongtable{\endoldlt
\end{minipage}
\twocolumn
\end{figure}}
\makeatother
\title{$title$}
\begin{document}
\maketitle
$if(abstract)$
\begin{abstract}
$abstract$
\end{abstract}
$endif$
$body$
\bibliography{$bibliography$}
\end{document}
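template.tex is a pandoc-style template: the $title$, $abstract$, $body$, and $bibliography$ placeholders are substituted before the document is compiled with the AAAI-24 settings above. As a hedged sketch of driving that render from R (assuming the quarto R package and a working Quarto/LaTeX toolchain are installed):

library(quarto)

# render the manuscript project to PDF; pandoc fills $title$, $abstract$,
# $body$, and $bibliography$ in template.tex before LaTeX runs
quarto_render(output_format = "pdf")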