diff --git a/.gitignore b/.gitignore index 78b7d08..b07ec85 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ index_files/ *_cache/ *_files/ *.pdf.md +*.ttt +*.fff +*.revealjs.md # R stuff .Rproj.user diff --git a/_extensions/carlcolglazier/ic2s2/_extension.yml b/_extensions/carlcolglazier/ic2s2/_extension.yml new file mode 100644 index 0000000..53253cc --- /dev/null +++ b/_extensions/carlcolglazier/ic2s2/_extension.yml @@ -0,0 +1,20 @@ +title: IC2S2 Extended Abstract Format Template +author: Carl Colglazier +version: 2024.0.1 +contributes: + formats: + common: + knitr: + opts_chunk: + echo: false + pdf: + template: conf_template.tex + documentclass: article + papersize: a4paper + fontsize: 12pt + mainfont: "Times New Roman" + fig-pos: "htp" + reference-section-title: "References" + template-partials: + - "partials/before-body.tex" + - "partials/title.tex" diff --git a/_extensions/carlcolglazier/ic2s2/conf_template.tex b/_extensions/carlcolglazier/ic2s2/conf_template.tex new file mode 100644 index 0000000..e4d39df --- /dev/null +++ b/_extensions/carlcolglazier/ic2s2/conf_template.tex @@ -0,0 +1,91 @@ +\PassOptionsToPackage{unicode$for(hyperrefoptions)$,$hyperrefoptions$$endfor$}{hyperref} + +\documentclass[a4paper,12pt]{article} + +\usepackage[utf8]{inputenc} +\usepackage[english]{babel} +\usepackage{authblk} +\usepackage{graphicx} +\usepackage{mathptmx} +\usepackage[singlespacing]{setspace} +\usepackage[headheight=1in,margin=1in]{geometry} +\usepackage{fancyhdr} +\usepackage{lipsum} + +\usepackage[nolists,noheads,nomarkers]{endfloat} +\usepackage{hyperref} + +% +\usepackage{siunitx} + +\renewcommand{\headrulewidth}{0pt} +\setlength{\parindent}{0pt} +\pagestyle{fancy} + +\makeatletter +\def\@maketitle{% + \newpage + + \begin{center}% + \let \footnote \thanks + {\LARGE \@title \par}% + \end{center}% + \par + \vskip 0.1em} +\makeatother + +\fancyhf{} % clear all header and footer fields +\lhead{} % left header +\rhead{} % right header +\lfoot{} % left footer 
+\chead{% + 10$$^{th}$$ International Conference on Computational Social Science IC$$^{2}$$S$$^{2}$$\\ + July 17-20, 2024, Philadelphia, USA% +} + +$if(mainfont)$ +\ifPDFTeX +\else +\babelfont{rm}[$for(mainfontoptions)$$mainfontoptions$$sep$,$endfor$]{$mainfont$} +\fi +$endif$ + +$pandoc.tex()$ + +% Set up bibliography +$if(biblio-config)$ +$if(natbib)$ +\usepackage[$natbiboptions$]{natbib} +\bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$} +$endif$ +$if(biblatex)$ +\usepackage[$if(biblio-style)$style=$biblio-style$,$endif$$for(biblatexoptions)$$biblatexoptions$$sep$,$endfor$]{biblatex} +$for(bibliography)$ +\addbibresource{$bibliography$} +$endfor$ +$endif$ +$endif$ +$if(nocite-ids)$ +\nocite{$for(nocite-ids)$$it$$sep$, $endfor$} +$endif$ + +\title{$title$} + +\date{} + +\begin{document} +$title.tex()$ + +\begin{center} +\textit{Keywords: social media, decentralized online social networks, social network analysis, recommender systems, collaborative filtering} +\newline +\end{center} + +$before-body.tex()$ + +$body$ + +$before-bib.tex()$ + +$biblio.tex()$ +\end{document} \ No newline at end of file diff --git a/_extensions/carlcolglazier/ic2s2/partials/before-body.tex b/_extensions/carlcolglazier/ic2s2/partials/before-body.tex new file mode 100644 index 0000000..5f90938 --- /dev/null +++ b/_extensions/carlcolglazier/ic2s2/partials/before-body.tex @@ -0,0 +1 @@ +\section*{Extended Abstract} \ No newline at end of file diff --git a/_extensions/carlcolglazier/ic2s2/partials/title.tex b/_extensions/carlcolglazier/ic2s2/partials/title.tex new file mode 100644 index 0000000..fd26689 --- /dev/null +++ b/_extensions/carlcolglazier/ic2s2/partials/title.tex @@ -0,0 +1,9 @@ +\maketitle +\thispagestyle{fancy} + +$if(keywords)$ +\begin{center} + \textit{Keywords: $for(keywords)$$keywords$$sep$, $endfor$} + \newline +\end{center} +$endif$ diff --git a/_quarto.yml b/_quarto.yml index 60cb1f6..9a2a292 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -13,6 
+13,7 @@ manuscript: # - notebook: _tags.qmd # - notebook: _pull_pull.qmd - notebook: notebooks/_moved.qmd + - notebook: notebooks/arima.qmd # - notebook: Presentation.qmd environment: renv.lock format: diff --git a/article.qmd b/article.qmd index dacc2e8..58713f1 100644 --- a/article.qmd +++ b/article.qmd @@ -124,28 +124,28 @@ Onboarding newcomers is an important part of the lifecycle of online communities ## The Mastodon Migrations -Mastodon saw a surge in interest in 2022 and 2023, particularly after Elon Musk's Twitter acquisition. In particular, four events of interests drove measurable increases in new users to the network: the announcement of the acquisition (April 14, 2022), the closing of the acquisition (October 27, 2022), a day when Twitter suspended a number of prominent journalists (December 15, 2022), and a day when Twitter experienced an outage and started rate limiting accounts (July 1, 2023). Many Twitter accounts announced they were setting up Mastodon accounts and linked their new accounts to their followers, often using tags like #TwitterMigration [@heFlockingMastodonTracking2023] and driving interest in Mastodon in a process @cavaDriversSocialInfluence2023 found consistent with social influence theory. - -The series of migrations of new users into Mastodon in many ways reflect folk stories of "Eternal Septembers" on previous communication networks, where a large influx of newcomers challenged the existing norms [@driscollWeMisrememberEternal2023]. Many Mastodon servers do have specific norms which people coming from Twitter may find confusing, such as local norms around content warnings [@nicholsonMastodonRulesCharacterizing2023]. Variation amoung servers can also present a challenge for newcomers who may not even be aware of the specific rules, norms, or general topics of interest on the server they are joining [@diazUsingMastodonWay2022]. 
- -Some media outlets have framed reports on Mastodon [@hooverMastodonBumpNow2023] through what @zulliRethinkingSocialSocial2020 calls the "Killer Hype Cycle", whereby the media finds a new alterntive social media platform, declares it a potential killer of some established platform, and laters calls it a failure if it does not displace the existing platform. Such framing fails to take systems like the Fediverse seriously for their own merits: completely replacing existing commercial systems is not the only way to measure success, nor does it account for the real value the Fediverse provides for its millions of active users. - -# Data - ```{r} #| label: fig-account-timeline #| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the annoucement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day where Twitter suspended a number of prominent journalist, and a day when Twitter experienced an outage and started rate limiting accounts." -#| fig-height: 3 +#| fig-height: 2.75 #| fig-width: 6.75 #| fig-env: figure* -#| fig-pos: htb! +#| fig-pos: tb! library(here) source(here("code/helpers.R")) account_timeline_plot() ``` -**Mastodon Profiles**: We collected accounts using data previously collected from posts on public Mastodon timelines from October 2020 to January 2024. We then queried for up-to-date information on those accounts including their most recent status and if the account had moved. This gave us a total of N accounts. Note that because we got updated information on each account, we include only accounts on servers which still exist and which returned records for the account. 
+Mastodon saw a surge in interest in 2022 and 2023, particularly after Elon Musk's Twitter acquisition. In particular, four events of interest drove measurable increases in new users to the network: the announcement of the acquisition (April 14, 2022), the closing of the acquisition (October 27, 2022), a day when Twitter suspended a number of prominent journalists (December 15, 2022), and a day when Twitter experienced an outage and started rate limiting accounts (July 1, 2023). Many Twitter accounts announced they were setting up Mastodon accounts and linked their new accounts to their followers, often using tags like #TwitterMigration [@heFlockingMastodonTracking2023] and driving interest in Mastodon in a process @cavaDriversSocialInfluence2023 found consistent with social influence theory. + +The series of migrations of new users into Mastodon in many ways reflects folk stories of "Eternal Septembers" on previous communication networks, where a large influx of newcomers challenged the existing norms [@driscollWeMisrememberEternal2023]. Many Mastodon servers do have specific norms which people coming from Twitter may find confusing, such as local norms around content warnings [@nicholsonMastodonRulesCharacterizing2023]. Variation among servers can also present a challenge for newcomers who may not even be aware of the specific rules, norms, or general topics of interest on the server they are joining [@diazUsingMastodonWay2022]. + +Some media outlets have framed reports on Mastodon [@hooverMastodonBumpNow2023] through what @zulliRethinkingSocialSocial2020 calls the "Killer Hype Cycle", whereby the media finds a new alternative social media platform, declares it a potential killer of some established platform, and later calls it a failure if it does not displace the existing platform. 
Such framing fails to take systems like the Fediverse seriously for their own merits: completely replacing existing commercial systems is not the only way to measure success, nor does it account for the real value the Fediverse provides for its millions of active users. + +# Data + +**Mastodon Profiles**: We collected accounts using data previously collected from posts on public Mastodon timelines from October 2020 to August 2023. We then queried for up-to-date information on those accounts including their most recent status and if the account had moved as of February 2024. This gave us a total of N accounts. Note that because we got updated information on each account, we include only accounts on servers which still exist and which returned records for the account. **Moved Profiles**: We found a subset of N accounts which had moved from one server to another. @@ -153,85 +153,9 @@ account_timeline_plot() # Analysis and Results -## Competition Among Servers in Attracting Newcomers - -_How does mastodon.social factor into the aggregate Mastodon onboarding process?_ - -::: {#fig-mastodon-online-signup-disabled width=50% .content-visible when-format="html"} - - -![](images/mastodon-social-signups-2020-11-01.png){fig-env="figure" width=6cm height=6cm} - -The main page of mastodon.social as viewed by a logged out web browser on November 1, 2020. The sign-up form is blurred out and instead there is a message suggesting to either sign up on mastodon.online or see a list of servers accepting new accounts at joinmastodon.org. - -::: - -Throughout its history, Mastodon's flagship server, mastodon.social, has allowed and disallowed open sign-ups at various times. When the website did not allow sign-ups, it displayed a message redirecting those interested in signing up for an account to mastodon.social or alternatively to a list of potential servers at joinmastodon.com. 
- -We found three main periods during which mastodon.social did not accept new signups by first noting the times in @fig-account-timeline where the proportion of new accounts on mastodon.social drops to zero. We then used the Internet Archive to verify that signups were disabled during these periods. - -1. An extended period of through the end of October 2020. - -2. A temporary issue when the email host limited the server in mid-2022. - -3. Two periods in late 2022 and early 2023. - -We construct an interrupted time series using an autoregressive integrated moving average (ARIMA) model for sign-ups on mastodon.social, the servers linked in joinmastodon.org, and all other servers. For the first period, we also include mastodon.online since mastodon.social linked to it directly during that time. - -::: {.content-visible when-format="html"} - -$$ -\begin{aligned} -y_t &= \beta_0 + \beta_1 \text{open}_t + \beta_2 \text{day}_t + \beta_3 (\text{open} \times \text{day})_t \\ -&\quad + \beta_4 \sin\left(\frac{2\pi t}{7}\right) + \beta_5 \cos\left(\frac{2\pi t}{7}\right) \\ -&\quad + \beta_6 \sin\left(\frac{4\pi t}{7}\right) + \beta_7 \cos\left(\frac{4\pi t}{7}\right) \\ -&\quad + \phi_1 y_{t-1} + \phi_2 y_{t-2} + \epsilon_t -\end{aligned} -$$ - -where $y_t$ is the number of new accounts on a server at time $t$, $\text{open}_t$ is a binary variable indicating if the server is open to new sign-ups, $\text{day}_t$ is an increasing integer represnting the date, and $\epsilon_t$ is a white noise error term. We use the sine and cosine terms to account for weekly seasonality. 
- - -| Period | Setting | Significant | -|------------|:----------------|:----| -| 2020-2021 | mastodon.online | Yes | -| | JoinMastodon | No | -| | Other | No | -| Mid 2022 | JoinMastodon | No | -| | Other | No | -| Early 2022 | JoinMastodon | No | -| | Other | No | - -: Results from ARIMA models for the number of new accounts on mastodon.social, mastodon.online, servers linked in joinmastodon.org, and all other servers. - -::: - -::: {.content-visible when-format="pdf+icwsm} - -```{=latex} -\begin{table}[!ht] - \centering - \begin{tabular}{|l|l|l|} - \hline - Period & Setting & Significant \\ \hline - 2020-2021 & mastodon.online & Yes \\ \hline - ~ & JoinMastodon & No \\ \hline - ~ & Other & No \\ \hline - Mid 2022 & JoinMastodon & No \\ \hline - ~ & Other & No \\ \hline - Early 2022 & JoinMastodon & No \\ \hline - ~ & Other & No \\ \hline - \end{tabular} -\end{table} -``` - -::: - - - ## Survival Model -_Are accounts on mastodon.social less likely to remain active than accounts on other servers?_ +*Are accounts on suggested general servers less likely to remain active than accounts on other servers?* ```{r, cache.extra = tools::md5sum("code/survival.R")} #| cache: true @@ -244,16 +168,45 @@ _Are accounts on mastodon.social less likely to remain active than accounts on o library(here) source(here("code/survival.R")) -plot_survival +plot_km ``` -Using accounts created during May 2023, we create Kaplan–Meier estimator for the probability that an account will remain active based on whether the account is on mastodon.social or otherwise if it is on a server in the Join Mastodon list. An account is considered active if it posted a status on or after December 1, 2023 and all accounts which posted after that point are considered censored. 
+```{r} +#| label: table-coxme +library(ehahelper) +library(broom) -The results suggest that accounts on mastodon.social are less likely to remain active than accounts on other servers, but there is no significant difference between accounts on servers in the Join Mastodon list and other servers. +cxme_table <- tidy(cxme) %>% + mutate(conf.low = exp(conf.low), conf.high=exp(conf.high)) %>% + mutate(term = case_when( + term == "factor(group)1" ~ "Join Mastodon", + term == "factor(group)2" ~ "General Servers", + term == "small_serverTRUE" ~ "Small Server", + TRUE ~ term + )) %>% + mutate(exp.coef = paste("(", round(conf.low, 2), ", ", round(conf.high, 2), ")", sep="")) %>% + select(term, estimate, exp.coef , p.value) +``` + +::: {#tbl-cxme .column-body} +```{r} +if (knitr::is_latex_output()) { + cxme_table %>% knitr::kable(format="latex", booktabs=TRUE, digits=3) +} else { + cxme_table %>% knitr::kable(digits = 3) +} +``` + +Coefficients for the Cox Proportional Hazard Model with Mixed Effects. The model includes a random effect for the server. +::: + +Using accounts created from May 1 to June 30, 2023, we create a Kaplan–Meier estimator for the probability that an account will remain active based on whether the account is on one of the largest general instances (`r paste(general_servers, collapse=", ")`) featured at the top of the Join Mastodon webpage or otherwise if it is on a server in the Join Mastodon list. Accounts are considered active if they have made at least one post after the censoring period `r active_period` days after account creation. + +We also construct a Mixed Effects Cox Proportional Hazard Model with coefficients for whether the account is on a small server (less than a hundred accounts), and whether the account's server is listed on Join Mastodon or is featured as one of the largest general instances. 
We again find that accounts on the largest general instances are less likely to remain active than accounts on other servers, while accounts created on smaller servers are more likely to remain active. ## Moved Accounts -_Do accounts tend to move to larger or smaller servers?_ +*Do accounts tend to move to larger or smaller servers?* Mastodon users can move their accounts to another server while retaining their connections (but not their posts) to other Mastodon accounts. This feature, built into the Mastodon software, offers data portability and helps avoid lock-in. @@ -307,15 +260,87 @@ if (knitr::is_latex_output()) { # Proposed Recommendation System -_How can we build an opt-in, low-resource recommendation system for finding Fediverse servers?_ +*How can we build an opt-in, low-resource recommendation system for finding Fediverse servers?* Tailored servers focused on a particular topic and community have advantages for onboarding newcomers; however, it may be difficult for new and existing Mastodon users to discover these communities. To address this gap, we propose a recommendation system for finding new servers. This system would be opt-in and low-resource, requiring only a small amount of data from each server. -First, we construct the ideal system based on observted data. That is, we use the data from all posts we collected from all servers to construct an ideal recommender. We then simulate various scenarios that limit both servers that report data and the number of tags they report. We use rank biased overlap (RBO) to then compare the outputs from these simulations to the baseline with more complete information from all tags on all servers. +First, we construct the ideal system based on observed data. That is, we use the data from all posts we collected from all servers to construct an ideal recommender. We then simulate various scenarios that limit both servers that report data and the number of tags they report. 
We use rank biased overlap (RBO) to then compare the outputs from these simulations to the baseline with more complete information from all tags on all servers. ## Recommendation System Design -We use a term frequency-inverse document frequency model to associate the top tags with each server. For the term frequency, we divide the count of the number of accounts which used the tag during the six-month period by the total number of known account-tag pairs on that server; for the inverse document frequency, we divide the total number of servers by count of servers reporting the tag. In this implimentation, we also apply filters such that tags must be used to at least five people on the server to be reported and the tag must be used by at least ten people and at least three servers in the entire known system. +We use Okapi BM25 to construct a term frequency-inverse document frequency (tf-idf) model to associate the top tags with each server using counts of tag-account pairs from each server for the term frequency and the number of servers that use each tag for the inverse document frequency. We then L2 normalize the vectors for each tag and calculate the cosine similarity between the tag vectors for each server. + +$$ +tf = \frac{f_{t,d} \cdot (k_1 + 1)}{f_{t,d} + k_1 \cdot (1 - b + b \cdot \frac{|d|}{avgdl})} +$$ + +where $f_{t,d}$ is the frequency of term $t$ (a tag) in document $d$ (a server), $|d|$ is the length of document $d$, $k_1$ and $b$ are tuning parameters, and $avgdl$ is the average document length. + +$$ +idf = \log \left(1 + \frac{N - n + 0.5}{n + 0.5}\right) +$$ + +where $N$ is the total number of documents (servers) and $n$ is the number of documents containing the term. 
+ +$$ +\text{similarity}(A, B) = \frac{A \cdot B}{\|A\| \|B\|} +$$ + +## Applications + +```{r} +#| eval: false +library(tidyverse) +library(igraph) +library(arrow) + +sim_servers <- "data/scratch/server_similarity.feather" %>% arrow::read_ipc_file() %>% rename("weight" = "Similarity") +#sim_net <- as.network(sim_servers) +g <- graph_from_data_frame(sim_servers, directed = FALSE) + +g_strength <- log(sort(strength(g))) +normalized_strength <- (g_strength - min(g_strength)) / (max(g_strength) - min(g_strength)) + +server_centrality <- enframe(normalized_strength, name="server", value="strength") +server_centrality %>% arrow::write_ipc_file("data/scratch/server_centrality.feather") +``` + +::: {#tbl-sim-servers} + +```{r} +#| label: table-sim-servers +library(tidyverse) +library(arrow) + +sim_servers <- "data/scratch/server_similarity.feather" %>% arrow::read_ipc_file() +server_of_interest <- "hci.social" +server_table <- sim_servers %>% + arrange(desc(Similarity)) %>% + filter(Source == server_of_interest | Target == server_of_interest) %>% + head(5) %>% + pivot_longer(cols=c(Source, Target)) %>% + filter(value != server_of_interest) %>% + select(value, Similarity) %>% + rename("Server" = "value") + +if (knitr::is_latex_output()) { + server_table %>% knitr::kable(format="latex", booktabs=TRUE, digits=3) +} else { + server_table %>% knitr::kable(digits = 3) +} +``` + +Top five servers most similar to hci.social + +::: + +### Server Discovery + +This system can empower users to find other servers of potential interest to them. For instance, we can build a system which recommends potential server matches to a new user. + +### Server Neighborhoods + +Mastodon provides two feeds in addition to a user's home timeline populated by accounts they follow: a local timeline with all public posts from their local server and a federated timeline which includes all posts from users followed by other users on their server. 
We suggest a third kind of timeline, a *neighborhood timeline*, which filters the federated timeline by topic. ## Rubustness to Limited Data @@ -341,21 +366,8 @@ simulations %>% theme_minimal() + theme(legend.position = "none") ``` - We simulated various scenarios that limit both servers that report data and the number of tags they report. We used rank biased overlap (RBO) to then compare the outputs from these simulations to the baseline with more complete information from all tags on all servers. @fig-simulations-rbo shows how the average agreement with the baseline scales linearly with the logarithm of the tag count. # Conclusion -Based on analysis of trace data from millions of new Fediverse accounts, we find evidence that suggests that servers matter and that users tend to move from larger servers to smaller servers. We then propose a recommendation system that can help new Fediverse users find servers with a high probability of being a good match based on their interests. Based on simulations, we demonstrate that such a tool can be effectively deployed in a federated manner, even with limited data on each local server. - -# References {#references} - -::: {.content-visible when-format="html"} - -# Appendix {#appendix .appendix} - -## Push and Pull Model - -{{< include notebooks/_push_pull.qmd >}} - -::: \ No newline at end of file +Based on analysis of trace data from millions of new Fediverse accounts, we find evidence that suggests that servers matter and that users tend to move from larger servers to smaller servers. We then propose a recommendation system that can help new Fediverse users find servers with a high probability of being a good match based on their interests. Based on simulations, we demonstrate that such a tool can be effectively deployed in a federated manner, even with limited data on each local server. 
\ No newline at end of file diff --git a/code/helpers.R b/code/helpers.R index 87528f8..48e8b46 100644 --- a/code/helpers.R +++ b/code/helpers.R @@ -70,7 +70,8 @@ accounts <- accounts_unfilt %>% mutate(suspended = replace_na(suspended, FALSE)) %>% # sanity check filter(created_at >= "2020-10-01") %>% - filter(created_at < "2024-01-01") %>% + #filter(created_at < "2024-01-01") %>% + filter(created_at < "2023-08-15") %>% # We don't want accounts that were created and then immediately stopped being active filter(statuses_count >= 1) %>% filter(last_status_at >= created_at) %>% @@ -83,7 +84,7 @@ acc_data <- accounts %>% mutate(created_month = format(created_at, "%Y-%m")) %>% mutate(created_week = floor_date(created_at, unit = "week")) %>% mutate(active_now = active) %>% - mutate(active = active_time >= 45) %>% + mutate(active = active_time >= 91) %>% mutate("Is mastodon.social" = server == "mastodon.social") %>% mutate(jm = server %in% jm$domain) %>% group_by(created_week) %>% @@ -95,7 +96,7 @@ acc_data <- accounts %>% active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)), Moved=sum(has_moved)/n(), count=n()) %>% - pivot_longer(cols=c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended" + pivot_longer(cols=c("JoinMastodon Server", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended" p1 <- acc_data %>% ggplot(aes(x=as.Date(created_week), group=1)) + @@ -126,5 +127,5 @@ p2 <- acc_data %>% scale_y_continuous(labels = scales::comma) + labs(y="Count", x="Created Week") + theme_bw_small_labels() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_x_date(labels=date_format("%Y-%U"), breaks = "8 week") -return(p1 + p2 + plot_layout(ncol = 1)) +return(p1 + p2 + plot_layout(ncol = 1, guides = "collect")) } \ No newline at end of file diff --git a/code/scratch/federated_design.py 
b/code/scratch/federated_design.py index 9800c7c..da77d3a 100644 --- a/code/scratch/federated_design.py +++ b/code/scratch/federated_design.py @@ -5,6 +5,7 @@ import numpy as np import textdistance from scipy.stats import kendalltau import rbo +import scipy def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_matrix: #tag_to_index = {tag: i for i, tag in enumerate(tfidf["tags"].unique().sort().to_list())} @@ -16,15 +17,14 @@ def built_tfidf_matrix(df: pl.DataFrame, tag_to_index, host_to_index) -> lil_mat m[tag_to_index[row["tags"]], host_to_index[row["host"]]] = row["tf_idf"] return m - - class TagData: def __init__(self, servers: set[str], n_tags: int, min_server_accounts: int = 1): + # TODO: minimum tags from server to be included? self.servers = servers self.n_tags = n_tags all_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather").filter( #all_tag_posts = read_tag_posts.filter( - pl.col("created_at") >= pl.date(2023, 2, 1) + pl.col("created_at") >= pl.date(2023, 5, 1) ).filter(pl.col("created_at") < pl.date(2023, 8, 1)).filter( pl.col("host").is_in(servers) ) @@ -35,6 +35,7 @@ class TagData: ).filter(pl.col("running_count") <= n_tags).drop("counter", "running_count").filter(pl.col("accounts") >= min_server_accounts) self._all_tag_posts_topn = all_tag_posts_topn self._server_accounts = all_tag_posts_topn.group_by("host").agg([ + pl.col("tags").len().alias("server_tag_count"), # The total number tags on the server pl.sum("accounts").alias("accounts_sum"), # The total number of account-tag pairs ])#.filter(pl.col("server_accounts") >= 10) #self._server_accounts = all_tag_posts.unique(["host", "acct"]).group_by("host").agg([ @@ -60,10 +61,25 @@ class TagData: ).with_columns( (pl.col("accounts") / pl.col("accounts_sum")).alias("tf") ) - n_servers = len(self._all_tag_posts_topn.unique("host")) - idf = most_seen_tags.with_columns((n_servers/pl.col("server_count")).alias("idf")) + num_servers = len(self._all_tag_posts_topn.unique("host")) 
+ idf = most_seen_tags.with_columns(((1 + num_servers)/(1 + pl.col("server_count"))).log().alias("idf")) tfidf = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True) return tfidf + def bm(self, n_server_accounts=5, n_servers=3, n_accounts=10): + k = 1.2 + b = 0.75 + most_seen_tags = self.most_seen_tags(n_servers, n_accounts) + server_accounts = self.server_accounts(n_server_accounts) + num_servers = len(self._all_tag_posts_topn.unique("host")) + D = server_accounts.rename({"accounts_sum": "D"}).with_columns((pl.col("D") / pl.col("D").mean()).alias("nd")) + tf = td._all_tag_posts_topn.join(D, on="host", how="inner").with_columns( + ((pl.col("accounts") * (k + 1))/(pl.col("accounts") + k*(1-b+b*pl.col("nd")))).alias("tf") + ) + idf = most_seen_tags.with_columns( + (1 + (num_servers - pl.col("server_count") + 0.5)/((pl.col("server_count") + 0.5))).log().alias("idf") + ) + bm = tf.join(idf, on="tags", how="inner").with_columns((pl.col("tf") * pl.col("idf")).alias("tf_idf")).sort("tf_idf", descending=True) + return bm # Constraint: What if we only consider the _top_ 100 tags from each server? 
@@ -84,10 +100,11 @@ def sampler(host_list, n_servers, n_tags, baseline, baseline_td: TagData): ).sample(n=n_servers-1)["host"].to_list()) server_is = [baseline_td.host_to_index[i] for i in server_samples] sampled_server_indices = np.array(server_is) - tagdata = TagData(server_samples, n_tags, min_server_accounts=5) - tfidf = tagdata.tfidf(n_server_accounts=5, n_servers=3, n_accounts=10)#n_server_accounts=0, n_servers=2, n_accounts=1) - m = built_tfidf_matrix(tfidf, baseline_td.tag_to_index, baseline_td.host_to_index) - host_sim = cosine_similarity(m.tocsr().T) + tagdata = TagData(server_samples, n_tags, min_server_accounts=2) + tfidf = tagdata.bm(n_server_accounts=5, n_servers=3, n_accounts=10)#n_server_accounts=0, n_servers=2, n_accounts=1) + full_mat = built_tfidf_matrix(tfidf, baseline_td.tag_to_index, baseline_td.host_to_index).T + m = (full_mat / scipy.sparse.linalg.norm(full_mat, ord=2, axis=0)) # good one + host_sim = cosine_similarity(m) rs = [] for serv in server_samples: comp_server_index = baseline_td.host_to_index[serv] @@ -102,11 +119,14 @@ def sampler(host_list, n_servers, n_tags, baseline, baseline_td: TagData): def run_simulations(): #read_tag_posts = pl.read_ipc("data/scratch/all_tag_posts.feather") server_samples = set(pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction = 1.0)["host"].to_list()) - td = TagData(server_samples, 1_000_000, min_server_accounts=5) - tfidf = td.tfidf() + #td = TagData(server_samples, 1_000_000, min_server_accounts=2) + #tfidf = td.bm(n_server_accounts=5, n_servers=3, n_accounts=10) + td = TagData(server_samples, 256, min_server_accounts=2) + tfidf = td.bm(n_server_accounts=0, n_servers=2, n_accounts=10) baseline_host_to_index = td.host_to_index - full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index) - baseline_similarlity = cosine_similarity(full_mat.tocsr().T) + full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index).T + m = (full_mat / 
scipy.sparse.linalg.norm(full_mat, ord=2, axis=0)) # good one + baseline_similarlity = cosine_similarity(m) #np.array(list(td.host_to_index.keys()))[np.argsort(-baseline_similarlity[td.host_to_index["hci.social"]])][0:10] #np.array(list(td.host_to_index.keys()))[np.argsort(-baseline_similarlity[td.host_to_index["urbanists.social"]])][0:10] host_list = pl.scan_ipc( @@ -114,7 +134,7 @@ def run_simulations(): ).select("host").unique().collect() runs = [] for server_sizes in [256, 128, 64, 32]: # - for tag_counts in [4096, 2048, 1028, 512, 256, 128, 64, 32, 16, 8, 4]: + for tag_counts in [256, 128, 64, 32, 16, 8]: for run in range(128): print(server_sizes, tag_counts, run) s = sampler(host_list, server_sizes, tag_counts, baseline_similarlity, td) @@ -130,6 +150,7 @@ jm_td = td = TagData(jm_servers, 32, min_server_accounts=5) jm_tfidf = jm_td.tfidf(n_server_accounts=5, n_servers=3, n_accounts=10) mat = built_tfidf_matrix(jm_tfidf, jm_td.tag_to_index, jm_td.host_to_index) similarlity = cosine_similarity(mat.tocsr().T) +# Export server similarity tag_sm = cosine_similarity(mat.tocsr()) tag_index_included = (np.sum(tag_sm, axis=0) > 0) included_tag_strings = np.array(list(jm_td.tag_to_index.keys()))[tag_index_included] @@ -147,4 +168,50 @@ similar_servers = cosine_similarity(np.array(example_indices).reshape(-1,1).T, m np.array(list(jm_td.host_to_index.keys()))[np.argsort(-similar_servers[0])][0:10] #np.array(list(jm_td.host_to_index.keys()))[np.argsort(-similarlity[jm_td.host_to_index["historians.social"]])][0:10] -#np.array(list(jm_td.host_to_index.keys()))[np.where(np.sum(mat, axis=0) < 0.01)[1]] \ No newline at end of file +#np.array(list(jm_td.host_to_index.keys()))[np.where(np.sum(mat, axis=0) < 0.01)[1]] + +server_samples = set(pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction = 1.0)["host"].to_list()) + +td = TagData(server_samples, 256, min_server_accounts=2) +tfidf = td.bm(n_server_accounts=0, n_servers=2, 
n_accounts=10)#.filter(pl.col("accounts") / pl.col("D") > 0.0001) +baseline_host_to_index = td.host_to_index +full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index).T +#m = (full_mat.T / scipy.sparse.linalg.norm(full_mat.T, ord=2, axis=0)).T + +m = (full_mat / scipy.sparse.linalg.norm(full_mat, ord=2, axis=0)) # good one + +baseline_similarlity = cosine_similarity(m) +l = [] +for i in range(np.shape(baseline_similarlity)[0] - 1): + #s_index = min(i, np.shape(baseline_similarlity)[0] - 1) + l.append( + pl.DataFrame({ + "Source": list(td.host_to_index.keys())[i], + "Target": list(td.host_to_index.keys())[i+1:], + "Similarity": baseline_similarlity[i][i+1:] + }) + ) + +similarity_df = pl.concat(l).filter(pl.col("Similarity") > 0.0) +similarity_df.write_ipc("data/scratch/server_similarity.feather") + +server = "hci.social" +similarity_df.filter((pl.col("Source") == server) | (pl.col("Target") == server)).sort("Similarity", descending=True)[0:10] +tfidf.filter(pl.col("host") == server)[0:10] + +tfidf.filter(pl.col("tags") == "aoir2023") + +m = (full_mat.T / scipy.sparse.linalg.norm(full_mat.T, ord=2, axis=0)) # good one +tag_similarity = cosine_similarity(m) +l = [] +for i in range(np.shape(tag_similarity)[0] - 1): + #s_index = min(i, np.shape(baseline_similarlity)[0] - 1) + l.append( + pl.DataFrame({ + "Source": list(td.tag_to_index.keys())[i], + "Target": list(td.tag_to_index.keys())[i+1:], + "Similarity": tag_similarity[i][i+1:] + }) + ) + +tag_similarity_df = pl.concat(l).filter(pl.col("Similarity") > 0.0) \ No newline at end of file diff --git a/code/survival.R b/code/survival.R index a38c149..6135b9c 100644 --- a/code/survival.R +++ b/code/survival.R @@ -1,10 +1,15 @@ library(here) library(survival) +library(tidyverse) library(ggsurvfit) +library(coxme) +library(jsonlite) source(here("code/helpers.R")) options(arrow.skip_nul = TRUE) +active_period <- 91 + a <- load_accounts() %>% filter(!has_moved) %>% filter(locked == FALSE) %>% @@ -13,43 +18,73 
@@ a <- load_accounts() %>% filter(created_at > "2023-04-30") %>% filter(created_at <= "2023-06-30") %>% filter(created_at < last_status_at) %>% - mutate(jm = server %in% arrow::read_feather(here("data/scratch/joinmastodon.feather"))$domain) %>% - mutate(active = last_status_at >= as.Date("2023-12-01")) %>% - #mutate(last_status_at = ifelse(active, lubridate::ymd_hms("2023-09-01 00:00:00", tz = "UTC"), last_status_at)) %>% - mutate(active_time = difftime(ifelse(active, lubridate::ymd_hms("2023-12-01 00:00:00", tz = "UTC"), last_status_at), created_at, units="days")) %>% + mutate(jm = server %in% as_tibble(jsonlite::fromJSON(here("data/joinmastodon-2023-08-25.json")))$domain) %>% + mutate(active_time = difftime(last_status_at, created_at, units="days")) %>% + mutate(active_time_censored = ifelse(active_time > active_period, active_period, active_time)) %>% #mutate(active_time = difftime(last_status_at, created_at, units="days")) %>% - mutate(status = ifelse(active, 0, 1))# %>% filter(followers_count > 0) %>% filter(following_count > 0) + #0=alive, 1=dead. 
+ mutate(status = ifelse(active_time > active_period, 0, 1))# %>% filter(followers_count > 0) %>% filter(following_count > 0) + +a %>% select(status, created_at, last_status_at, active_time_censored) + +activity <- arrow::read_feather( + "data/scratch/activity.feather", + col_select = c("server", "logins") + ) %>% + arrange(desc(logins)) server_summary <- a %>% group_by(server) %>% summarize(cohort_size = n(), .groups = "drop") +general_servers <- c( + "mastodon.social", + "mstdn.social", + "mastodon.world", + "mas.to", + "mastodon.online", + "mastodonapp.uk", + "universeodon.com", + "masto.ai", + "c.im", + "social.vivaldi.net", + "mstdn.party", + "ohai.social" +) # > 100 cohort size + strength > .75 + sel_a <- a %>% mutate(is_ms = server == "mastodon.social") %>% + mutate(general = server %in% general_servers) %>% ungroup() %>% - inner_join(server_summary, by = "server") %>% filter(!noindex) #%>% filter(user_count > 100) + inner_join(server_summary, by = "server") %>% + mutate(small_server = user_count <= 100) %>% + mutate(large_server = user_count >= 1000) %>% + #filter(jm) %>% + mutate(group = as.integer(jm) + as.integer(general)) %>% + #mutate(huge_server = user_count >= 3162.28) %>% + #filter(statuses_count >= 10) %>% + filter(!noindex)# %>% filter(cohort_size >= 2) #%>% filter(user_count > 100) -cx <- sel_a %>% - coxph(Surv(active_time, status) ~ is_ms + jm, data = ., x=TRUE, robust = T, cluster=server) +cx <- sel_a %>% coxph(Surv(active_time_censored, status) ~ general, data = ., x=TRUE, robust = T, cluster=server) +cxme <- sel_a %>% coxme(Surv(active_time_censored, status) ~ factor(group) + small_server + (1|server), data = .) 
cz <- cox.zph(cx) -plot_survival <- sel_a %>% - #filter(followers_count > 0) %>% - #filter(following_count > 0) %>% +plot_km <- sel_a %>% mutate(id = paste(username, server, sep = "@")) %>% survfit2( - Surv(active_time, status) ~ jm + is_ms, # is_jm + Surv(active_time_censored, status) ~ general, data = ., id = id, cluster = server, robust = TRUE ) %>% ggsurvfit() + + add_confidence_interval() + labs( y = "Overall survival probability", x = "Time (days)", ) + - scale_fill_discrete(name = "Group", labels = c("Not in JM", "JM", "mastodon.social")) + - scale_color_discrete(name = "Group", labels = c("Not in JM", "JM", "mastodon.social")) + + scale_fill_discrete(name = "Group", labels = c("Other JoinMastodon", "General")) + + scale_color_discrete(name = "Group", labels = c("Other JoinMastodon", "General")) + theme_bw_small_labels() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="bottom") diff --git a/extended-abstract.qmd b/extended-abstract.qmd new file mode 100644 index 0000000..ffccef4 --- /dev/null +++ b/extended-abstract.qmd @@ -0,0 +1,154 @@ +--- +title: "Do Servers Matter on Mastodon? Data-driven Design for Decentralized Social Media" +author: Carl Colglazier +bibliography: references.bib +format: + ic2s2-pdf: default +execute: + echo: false + error: false + warning: false + message: false + cache: true +knitr: + opts_knit: + verbose: true +--- + +```{r, cache.extra = tools::md5sum("code/helpers.R")} +#| label: fig-account-timeline +#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. 
The dashed vertical lines in the bottom panel represent the announcement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day when Twitter suspended a number of prominent journalists, and a day when Twitter experienced an outage and started rate limiting accounts." +#| fig-height: 2.75 +#| fig-width: 6.75 +#| fig-env: figure* +#| fig-pos: tb! + +library(here) +source(here("code/helpers.R")) +account_timeline_plot() +``` + +Following Twitter's 2022 acquisition, Mastodon---an open-source, decentralized social network and microblogging community---saw an increase in activity and attention as a potential Twitter alternative [@heFlockingMastodonTracking2023; @cavaDriversSocialInfluence2023]. While millions of people set up new accounts and significantly increased the size of the network (@fig-account-timeline), many of these newcomers and potential newcomers found the process confusing and many accounts did not remain active. Unlike centralized social media platforms, Mastodon is a network of independent servers with their own rules and norms [@nicholsonMastodonRulesCharacterizing2023]. Servers can communicate with each other using the shared ActivityPub protocols and accounts can move between Mastodon servers, but the local experience can vary widely from server to server. + +Although attracting and retaining newcomers is a key challenge for online communities [@krautBuildingSuccessfulOnline2011 p. 182], Mastodon's onboarding process has not always been straightforward. Variation among servers can also present a challenge for newcomers who may not even be aware of the specific rules, norms, or general topics of interest on the server they are joining [@diazUsingMastodonWay2022]. Further, many Mastodon servers have specific norms which people coming from Twitter may find confusing, such as local norms around content warnings [@nicholsonMastodonRulesCharacterizing2023]. 
Various guides and resources for people trying to join Mastodon offered mixed advice on choosing a server. Some suggest that the most important thing is to simply join any server and work from there [@krasnoffMastodon101How2022; @silberlingBeginnerGuideMastodon2023], while others have created tools and guides to help people find potential servers of interest by size and location [@thekinrarMastodonInstances; @kingMastodonMe2024]. + +Mastodon's approach to onboarding has also changed over time. In much of 2020 and early 2021, the Mastodon developers closed signups to their flagship server and linked to an alternative server, which saw increased sign-ups during this period. They also linked to a list of servers on the Join Mastodon webpage [@mastodonggmbhServers], where all servers are pre-approved and follow the Mastodon Server Covenant, which guarantees certain content moderation standards and data protections. Starting in 2023, the Mastodon developers shifted toward making the flagship server the default when people sign up on the official Mastodon Android and iOS apps [@rochkoNewOnboardingExperience2023; @rothItGettingEasier2023]. + +We first ask the question: *Does server choice matter for Mastodon newcomers?* Toward this question, we used profile data from over a million Mastodon accounts collected from public timelines and profile directories between October 1, 2020 and August 15, 2023. With a subset of these accounts created from May 1 to June 30, 2023, we create a Kaplan–Meier estimator for account activity in the 91 days after creation (@fig-survival). We find that accounts on the 12 largest general instances featured at the top of the Join Mastodon webpage (which includes the flagship server) are less likely to remain active than accounts created on other Join Mastodon servers. + +To corroborate this model, we also use data from thousands of accounts which moved between Mastodon servers, taking advantage of the data portability of the platform. 
Conceiving of these moved accounts as edges within a weighted directional network where nodes represent servers, edges represent accounts, and weights represent the number of accounts that moved between servers, we construct an exponential family random graph model (ERGM) with terms for server size, open registrations, and language match between servers. We find that accounts are more likely to move from larger servers to smaller servers. + +```{=html} + +``` +Based on these findings, we suggest a need for better ways for potential newcomers to find servers and propose a viable way to create server and tag recommendations on Mastodon, which could both help newcomers find servers that match their interests and help established accounts discover "neighborhoods" of related servers. One challenge in building such a system is the decentralized nature of the network. A single, central actor which collects data from servers and then distributes recommendations would be antithetical to the decentralized nature of Mastodon. Instead, we propose a system where servers can report the top hashtags by the number of unique accounts on the server using them during the last three months. Such a system would be opt-in and require few additional server resources since tags already have their own database table. + +In our proposal, after collecting these top tags on each server, each server then uses Okapi BM25 to construct a term frequency-inverse document frequency (TF-IDF) matrix to associate the top tags with each server in their known network. We suggest first filtering to only consider tags used by a minimum number of accounts on a server and only consider tags used on a minimum number of servers. The counts of tag-account pairs from each server make up the term frequency and the number of servers that use each tag makes up the inverse document frequency. 
The system can then apply L2 normalization to the vectors for each tag and calculate the cosine similarity between the tag vectors for each server. To find similarity between tags, the system could also calculate the cosine similarity between the server vectors. + +To determine the viability of the recommendation system, we simulated various scenarios that limit both servers that report data and the number of tags they report. We used rank biased overlap (RBO) to then compare the outputs from these simulations to the baseline with more complete information from all tags on all servers. @fig-simulations-rbo shows how the average agreement with the baseline scales linearly with the logarithm of the tag count. + +Thus based on analysis of trace data from millions of new Mastodon accounts, we find evidence that suggests that servers matter and that users tend to move from larger servers to smaller servers. We then propose a recommendation system that can help new Mastodon users find servers with a high probability of being a good match based on their interests. Based on simulations, we demonstrate that such a tool can be effectively deployed in a federated manner, even with limited data on each local server. + +```{r, cache.extra = tools::md5sum("code/survival.R")} +#| cache: true +#| label: fig-survival +#| fig-cap: "Survival probabilities for accounts created during May and June 2023 on servers featured on Join Mastodon. Groups represent whether the account is on one of the 12 largest and most prominently featured servers or another Join Mastodon server." 
+library(here) +source(here("code/survival.R")) +plot_km +``` + +::: {#tbl-ergm-table} +```{r} +#| label: table-ergm-table +#| echo: false +#| warning: false +#| message: false +#| error: false + +library(here) +library(modelsummary) +library(kableExtra) +library(purrr) +library(stringr) +load(file = here("data/scratch/ergm-model-early.rda")) +load(file = here("data/scratch/ergm-model-late.rda")) + +x <- modelsummary( + list("Coef." = model.early, "Std.Error" = model.early, "Coef." = model.late, "Std.Error" = model.late), + estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"), + statistic = NULL, + gof_omit = ".*", + coef_rename = c( + "sum" = "Sum", + "nonzero" = "Nonzero", + "diff.sum0.h-t.accounts" = "Smaller server", + "nodeocov.sum.accounts" = "Server size\n(outgoing)", + "nodeifactor.sum.registrations.TRUE" = "Open registrations\n(incoming)", + "nodematch.sum.language" = "Languages match" + ), + align="lrrrr", + stars = c('*' = .05, '**' = 0.01, '***' = .001), + output = "latex_tabular" + #output = "markdown", + #table.envir='table*', + #table.env="table*" + ) %>% add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2)) + +x +``` + +Exponential family random graph models for account movement between Mastodon servers. Accounts in Model A were created in May 2022 and moved to another account at some later point. Accounts in Model B were created at some earlier point and moved after October 2023. 
+::: + +::: {#tbl-sim-servers} +```{r} +#| label: table-sim-servers +library(tidyverse) +library(arrow) + +sim_servers <- "data/scratch/server_similarity.feather" %>% arrow::read_ipc_file() +server_of_interest <- "hci.social" +server_table <- sim_servers %>% + arrange(desc(Similarity)) %>% + filter(Source == server_of_interest | Target == server_of_interest) %>% + head(5) %>% + pivot_longer(cols=c(Source, Target)) %>% + filter(value != server_of_interest) %>% + select(value, Similarity) %>% + rename("Server" = "value") + +if (knitr::is_latex_output()) { + server_table %>% knitr::kable(format="latex", booktabs=TRUE, digits=3) +} else { + server_table %>% knitr::kable(digits = 3) +} +``` + +Top five servers most similar to hci.social, a Mastodon server focused on human-computer interaction research. Each of these servers relate to computer science, academia, or technology. +::: + +```{r} +#| label: fig-simulations-rbo +#| fig-env: figure* +#| cache: true +#| fig-width: 6.75 +#| fig-height: 3 +#| fig-pos: tb +#| fig-cap: "Simulated rank biased overlap between simulated server similarity ranks varied by the number of tags reported by each server and the number of servers that report data. The baseline uses 256 tags." 
+library(tidyverse) +library(arrow) +simulations <- arrow::read_ipc_file("data/scratch/simulation_rbo.feather") + +simulations %>% + group_by(servers, tags, run) %>% summarize(rbo=mean(rbo), .groups="drop") %>% + mutate(ltags = as.integer(log2(tags))) %>% + ggplot(aes(x = factor(ltags), y = rbo, fill = factor(ltags))) + + geom_boxplot() + + facet_wrap(~servers, nrow=1) + + #scale_y_continuous(limits = c(0, 1)) + + labs(x = "Tags (log2)", y = "RBO", title = "Rank Biased Overlap with Baseline Rankings by Number of Servers") + + theme_minimal() + theme(legend.position = "none") +``` \ No newline at end of file diff --git a/notebooks/_moved.qmd b/notebooks/_moved.qmd index 9eb4beb..1d2dca6 100644 --- a/notebooks/_moved.qmd +++ b/notebooks/_moved.qmd @@ -215,4 +215,43 @@ modelsummary( stars = c('*' = .05, '**' = 0.01, '***' = .001), ) %>% add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2)) -``` \ No newline at end of file +``` + +```{r} +#| eval: false +jm2023 <- as_tibble(fromJSON(here("data/joinmastodon-2023-08-25.json"))) + +move_counts.stable <- inner_join(moved_accounts, accounts, by=c("moved_server"="server", "moved_acct"="username")) %>% + filter(server %in% is_mastodon$server) %>% + filter(moved_server %in% is_mastodon$server) %>% + filter(created_at >= as.Date("2023-05-01")) %>% + filter(created_at < as.Date("2023-08-01")) %>% + filter(server != moved_server) %>% + group_by(server, moved_server) %>% summarize(count = n()) %>% arrange(desc(count)) %>% + ungroup() %>% + filter(server %in% jm2023$domain) %>% + filter(moved_server %in% jm2023$domain) + +run_network2 <- function(network) { + model <- + ergm( + network ~ sum + nonzero + + #diff("last_week_users", dir="h-t", pow = 0, form = "sum") + + nodeocovar(center=TRUE,transform="sqrt") + + diff("accounts", dir="h-t", pow = 0, form = "sum") + # Do people move to smaller servers? + nodeocov("accounts", form = "sum") + # Do servers with more accounts have more outflow? 
+ nodeifactor("registrations", form = "sum") + # Do servers with open registration get more inflow? + nodematch("language", form = "sum"), + response = "count", + reference = ~ Binomial(5), + control = control.ergm(MCMLE.maxit = 100, MCMC.effectiveSize = 50) + ) + return(model) +} + + +move_counts.stable +edgeNet.stable <- build_network(move_counts.stable, metadata, activity_data) +model.stable <- run_network2(edgeNet.stable) +save(model.stable, file = here("data/scratch/ergm-model-stable.rda")) +``` diff --git a/notebooks/_push_pull.qmd b/notebooks/_push_pull.qmd index 504b2c6..163f3cd 100644 --- a/notebooks/_push_pull.qmd +++ b/notebooks/_push_pull.qmd @@ -9,10 +9,11 @@ library(tsibble) library(fable) library(lmtest) library(jsonlite) +library(here) -source("code/helpers.R") +source(here("code/helpers.R")) accounts <- load_accounts() -jm <- arrow::read_feather("data/scratch/joinmastodon.feather") +jm <- arrow::read_feather(here("data/scratch/joinmastodon.feather")) ``` ```{r} @@ -21,7 +22,7 @@ server_list <- c( "mastodon.social", "mastodon.online" ) -early.jm_servers <- as_tibble(fromJSON("data/joinmastodon-2020-09-18.json"))$domain +early.jm_servers <- as_tibble(fromJSON(here("data/joinmastodon-2020-09-18.json")))$domain early.day_counts <- accounts %>% filter(created_at < "2021-09-01") %>% @@ -97,7 +98,7 @@ early.table <- fit %>% tidy %>% ```{r} #| label: prep-break-two-raw-counts -email.jm_servers <- as_tibble(fromJSON("data/joinmastodon-2023-08-25.json"))$domain +email.jm_servers <- as_tibble(fromJSON(here("data/joinmastodon-2023-08-25.json")))$domain email.day_counts <- accounts %>% filter(created_at > "2022-07-01") %>% @@ -180,11 +181,11 @@ email.table <- fit %>% tidy %>% ```{r} #| label: prep-break-three-raw-counts -late.jm_servers <- as_tibble(fromJSON("data/joinmastodon-2023-08-25.json"))$domain +late.jm_servers <- as_tibble(fromJSON(here("data/joinmastodon-2023-08-25.json")))$domain last.day_counts <- accounts %>% filter(created_at > "2022-12-01") %>% - 
filter(created_at < "2023-06-01") %>% + filter(created_at < "2023-05-01") %>% mutate(created_day = as.Date(floor_date(created_at, unit = "day"))) %>% mutate(server_code = ifelse(server %in% late.jm_servers, "joinmastodon", "other")) %>% mutate(server_code = ifelse(server == "mastodon.social", "mastodon.social", server_code)) %>% diff --git a/notebooks/arima.qmd b/notebooks/arima.qmd new file mode 100644 index 0000000..48477fa --- /dev/null +++ b/notebooks/arima.qmd @@ -0,0 +1,55 @@ +--- +title: "Relationship between the flagship server and others over time" +resource-path: + - "../" +--- + +## Competition Among Servers in Attracting Newcomers + +_How does mastodon.social factor into the aggregate Mastodon onboarding process?_ + +![](images/mastodon-social-signups-2020-11-01.png){fig-env="figure" width=6cm height=6cm} + +: The main page of mastodon.social as viewed by a logged-out web browser on November 1, 2020. The sign-up form is blurred out and instead there is a message suggesting to either sign up on mastodon.online or see a list of servers accepting new accounts at joinmastodon.org. + +Throughout its history, Mastodon's flagship server, mastodon.social, has allowed and disallowed open sign-ups at various times. When the website did not allow sign-ups, it displayed a message redirecting those interested in signing up for an account to mastodon.online or alternatively to a list of potential servers at joinmastodon.org. + +We found three main periods during which mastodon.social did not accept new signups by first noting the times in @fig-account-timeline where the proportion of new accounts on mastodon.social drops to zero. We then used the Internet Archive to verify that signups were disabled during these periods. + +1. An extended period through the end of October 2020. + +2. A temporary issue when the email host limited the server in mid-2022. + +3. Two periods in late 2022 and early 2023. 
+ +We construct an interrupted time series using an autoregressive integrated moving average (ARIMA) model for sign-ups on mastodon.social, the servers linked in joinmastodon.org, and all other servers. For the first period, we also include mastodon.online since mastodon.social linked to it directly during that time. + +$$ +\begin{aligned} +y_t &= \beta_0 + \beta_1 \text{open}_t + \beta_2 \text{day}_t + \beta_3 (\text{open} \times \text{day})_t \\ +&\quad + \beta_4 \sin\left(\frac{2\pi t}{7}\right) + \beta_5 \cos\left(\frac{2\pi t}{7}\right) \\ +&\quad + \beta_6 \sin\left(\frac{4\pi t}{7}\right) + \beta_7 \cos\left(\frac{4\pi t}{7}\right) \\ +&\quad + \phi_1 y_{t-1} + \phi_2 y_{t-2} + \epsilon_t +\end{aligned} +$$ + +where $y_t$ is the number of new accounts on a server at time $t$, $\text{open}_t$ is a binary variable indicating if the server is open to new sign-ups, $\text{day}_t$ is an increasing integer representing the date, and $\epsilon_t$ is a white noise error term. We use the sine and cosine terms to account for weekly seasonality. + + +| Period | Setting | Significant | +|------------|:----------------|:----| +| 2020-2021 | mastodon.online | Yes | +| | JoinMastodon | No | +| | Other | No | +| Mid 2022 | JoinMastodon | No | +| | Other | No | +| Early 2023 | JoinMastodon | No | +| | Other | No | + +: Results from ARIMA models for the number of new accounts on mastodon.social, mastodon.online, servers linked in joinmastodon.org, and all other servers. 
+ +# Appendix {#appendix .appendix} + +## Push and Pull Model + +{{< include _push_pull.qmd >}} diff --git a/references.bib b/references.bib index 35bf750..96f1591 100644 --- a/references.bib +++ b/references.bib @@ -6,8 +6,8 @@ year = {2009}, series = {{{CHI}} '09}, pages = {945--954}, - publisher = {{ACM}}, - address = {{New York, NY, USA}}, + publisher = {ACM}, + address = {New York, NY, USA}, doi = {10.1145/1518701.1518847}, urldate = {2017-08-02}, abstract = {Social networking sites (SNS) are only as good as the content their users share. Therefore, designers of SNS seek to improve the overall user experience by encouraging members to contribute more content. However, user motivations for contribution in SNS are not well understood. This is particularly true for newcomers, who may not recognize the value of contribution. Using server log data from approximately 140,000 newcomers in Facebook, we predict long-term sharing based on the experiences the newcomers have in their first two weeks. We test four mechanisms: social learning, singling out, feedback, and distribution. In particular, we find support for social learning: newcomers who see their friends contributing go on to share more content themselves. For newcomers who are initially inclined to contribute, receiving feedback and having a wide audience are also predictors of increased sharing. On the other hand, singling out appears to affect only those newcomers who are not initially inclined to share. 
The paper concludes with design implications for motivating newcomer sharing in online communities.}, @@ -77,8 +77,8 @@ month = oct, series = {{{IMC}} '23}, pages = {111--123}, - publisher = {{Association for Computing Machinery}}, - address = {{New York, NY, USA}}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, doi = {10.1145/3618257.3624819}, urldate = {2024-02-22}, abstract = {The acquisition of Twitter by Elon Musk has spurred controversy and uncertainty among Twitter users. The move raised both praise and concerns, particularly regarding Musk's views on free speech. As a result, a large number of Twitter users have looked for alternatives to Twitter. Mastodon, a decentralized micro-blogging social network, has attracted the attention of many users and the general media. In this paper, we analyze the migration of 136,009 users from Twitter to Mastodon. We inspect the impact that this has on the wider Mastodon ecosystem, particularly in terms of user-driven pressure towards centralization. We further explore factors that influence users to migrate, highlighting the effect of users' social networks. Finally, we inspect the behavior of individual users, showing how they utilize both Twitter and Mastodon in parallel. We find a clear difference in the topics discussed on the two platforms. This leads us to build classifiers to explore if migration is predictable. 
Through feature analysis, we find that the content of tweets as well as the number of URLs, the number of likes, and the length of tweets are effective metrics for the prediction of user migration.}, @@ -100,19 +100,53 @@ keywords = {communities,content moderation,elon musk,mastodon,platforms,social,social media,twitter} } +@misc{kingMastodonMe2024, + title = {Mastodon {{Near Me}}}, + author = {King, Jaz-Michael}, + year = {2024}, + month = jan, + journal = {jaz-michael king}, + urldate = {2024-03-04}, + abstract = {A map and data directory showcasing ActivityPub service providers, each specifically catering to a certain locality or offering support in a notable language.}, + langid = {english} +} + +@misc{krasnoffMastodon101How2022, + title = {Mastodon 101: How to Follow (and Unfollow) Other Accounts}, + shorttitle = {Mastodon 101}, + author = {Krasnoff, Barbara}, + year = {2022}, + month = dec, + journal = {The Verge}, + urldate = {2024-03-04}, + abstract = {How to get started in Mastodon by following other people}, + howpublished = {https://www.theverge.com/23519279/mastodon-instance-follow-friend}, + langid = {english} +} + @book{krautBuildingSuccessfulOnline2011, ids = {kraut_building_2011,kraut_building_2011-1,kraut_building_2011-3}, title = {Building {{Successful Online Communities}}: {{Evidence-Based Social Design}}}, shorttitle = {Building {{Successful Online Communities}}}, author = {Kraut, Robert E. 
and Resnick, Paul and Kiesler, Sara}, year = {2011}, - publisher = {{MIT Press}}, - address = {{Cambridge, Mass}}, + publisher = {MIT Press}, + address = {Cambridge, Mass}, isbn = {978-0-262-01657-5}, lccn = {HM742 .K73 2011}, keywords = {Computer networks,internet,Online social networks,Planning,Social aspects,Social aspects Planning,Social psychology} } +@misc{mastodonggmbhServers, + title = {Servers}, + author = {{Mastodon gGmbH}}, + journal = {Join Mastodon}, + urldate = {2024-03-04}, + abstract = {Find where to sign up for the decentralized social network Mastodon.}, + howpublished = {https://joinmastodon.org/servers}, + langid = {english} +} + @article{newellUserMigrationOnline2021, title = {User {{Migration}} in {{Online Social Networks}}: {{A Case Study}} on {{Reddit During}} a {{Period}} of {{Community Unrest}}}, author = {Newell, Edward and Jurgens, David and Saleem, Haji Mohammad and Vala, Hardik and Sassine, Jad and Armstrong, Caitrin and Ruths, Derek}, @@ -134,15 +168,48 @@ month = oct, series = {{{CSCW}} '23 {{Companion}}}, pages = {86--90}, - publisher = {{Association for Computing Machinery}}, - address = {{New York, NY, USA}}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, doi = {10.1145/3584931.3606970}, urldate = {2024-01-16}, - abstract = {Federated social networking is an increasingly popular alternative to more traditional, centralized forms. Yet, this federated arrangement can lead to dramatically different experiences across the network. Using a sample of the most popular instances on the federated social network Mastodon, we characterize the types of rules present in this emerging space. We then compare these rules to those on Reddit, as an example of a different, less centralized space. Rules on Mastodon often pay particular attention to issues of harassment and hate {\textemdash} strongly reflecting the spirit of the Mastodon Covenant. 
We speculate that these rules may have emerged in response to problems of other platforms, and reflect a lack of support for instance maintainers. With this work, we call for the development of additional instance-level governance and technical scaffolding, and raise questions for future work into the development, values, and value tensions present in the broader federated social networking landscape.}, + abstract = {Federated social networking is an increasingly popular alternative to more traditional, centralized forms. Yet, this federated arrangement can lead to dramatically different experiences across the network. Using a sample of the most popular instances on the federated social network Mastodon, we characterize the types of rules present in this emerging space. We then compare these rules to those on Reddit, as an example of a different, less centralized space. Rules on Mastodon often pay particular attention to issues of harassment and hate --- strongly reflecting the spirit of the Mastodon Covenant. We speculate that these rules may have emerged in response to problems of other platforms, and reflect a lack of support for instance maintainers. With this work, we call for the development of additional instance-level governance and technical scaffolding, and raise questions for future work into the development, values, and value tensions present in the broader federated social networking landscape.}, isbn = {9798400701290}, keywords = {community rules,Mastodon,online communities} } +@misc{rochkoNewOnboardingExperience2023, + title = {A New Onboarding Experience on {{Mastodon}}}, + author = {Rochko, Eugen}, + year = {2023}, + month = may, + journal = {Mastodon Blog}, + urldate = {2024-03-04}, + abstract = {Today we're making signing up on Mastodon easier than ever before. We understand that deciding which Mastodon service provider to kick off your experience with can be confusing. 
We know this is a completely new concept for many people, since traditionally the platform and the service provider are one and the same. This choice is what makes Mastodon different from existing social networks, but it also presents a unique onboarding challenge.}, + howpublished = {https://blog.joinmastodon.org/2023/05/a-new-onboarding-experience-on-mastodon/} +} + +@misc{rothItGettingEasier2023, + title = {It's Getting Easier to Make an Account on {{Mastodon}}}, + author = {Roth, Emma}, + year = {2023}, + month = may, + journal = {The Verge}, + urldate = {2024-03-04}, + abstract = {The network lets you sign up for mastodon.social from the start.}, + howpublished = {https://www.theverge.com/2023/5/1/23707019/mastodon-account-creation-twitter-alternative}, + langid = {english} +} + +@misc{silberlingBeginnerGuideMastodon2023, + title = {A Beginner's Guide to {{Mastodon}}, the Open Source {{Twitter}} Alternative {\textbar} {{TechCrunch}}}, + author = {Silberling, Amanda}, + year = {2023}, + month = jul, + journal = {TechCrunch}, + urldate = {2024-03-04}, + howpublished = {https://techcrunch.com/2023/07/24/what-is-mastodon/} +} + @inproceedings{teblunthuisIdentifyingCompetitionMutualism2022, title = {Identifying Competition and Mutualism between Online Groups}, booktitle = {International {{AAAI Conference}} on {{Web}} and {{Social Media}} ({{ICWSM}} 2022)}, @@ -151,13 +218,21 @@ month = jun, volume = {16}, pages = {993--1004}, - publisher = {{AAAI}}, - address = {{Atlanta, Georgia, USA}}, + publisher = {AAAI}, + address = {Atlanta, Georgia, USA}, urldate = {2021-07-16}, abstract = {Platforms often host multiple online groups with highly overlapping topics and members. How can researchers and designers understand how interactions between related groups affect measures of group health? 
Inspired by population ecology, prior social computing research has studied competition and mutualism among related groups by correlating group size with degrees of overlap in content and membership. The resulting body of evidence is puzzling as overlaps seem sometimes to help and other times to hurt. We suggest that this confusion results from aggregating inter-group relationships into an overall environmental effect instead of focusing on networks of competition and mutualism among groups. We propose a theoretical framework based on community ecology and a method for inferring competitive and mutualistic interactions from time series participation data. We compare population and community ecology analyses of online community growth by analyzing clusters of subreddits with high user overlap but varying degrees of competition and mutualism.}, keywords = {Computer Science - Human-Computer Interaction,Computer Science - Social and Information Networks} } +@misc{thekinrarMastodonInstances, + title = {Mastodon Instances}, + author = {TheKinrar}, + journal = {instances.social}, + urldate = {2024-03-04}, + howpublished = {https://instances.social/} +} + @article{webberSimilarityMeasureIndefinite2010, title = {A Similarity Measure for Indefinite Rankings}, author = {Webber, William and Moffat, Alistair and Zobel, Justin}, @@ -184,7 +259,7 @@ volume = {22}, number = {7}, pages = {1188--1205}, - publisher = {{SAGE Publications}}, + publisher = {SAGE Publications}, issn = {1461-4448}, doi = {10.1177/1461444820912533}, urldate = {2022-03-13},