443 lines
14 KiB
Plaintext
443 lines
14 KiB
Plaintext
---
title: "Onboarding the Fediverse"
subtitle: "Building community discovery in decentralized online social networks"
author: "Carl Colglazier"
format:
  revealjs:
    theme: presentation.scss
    keep-md: true
knitr:
  opts_chunk:
    dev: "ragg_png"
    retina: 1
    dpi: 200
execute:
  freeze: auto
  cache: true
  echo: false
fig-width: 5
fig-height: 6
---
|
|
|
|
## Growth on the Fediverse
|
|
|
|
|
|
```{r}
#| label: fig-account-timeline
#| fig-height: 3
#| fig-width: 6.75

# Packages for this chunk. The attach order is kept unchanged because it
# decides which package wins when two of them export the same name.
library(arrow)
library(tidyverse)
library(lubridate)
library(scales)
library(here)

# Project-local helper script (not shown in this file); presumably where
# theme_bw_small_labels() comes from -- confirm.
source(here("code/helpers.R"))

# joinmastodon server listing; its `domain` column is used further down.
jm <- arrow::read_feather(
  here("data/scratch/joinmastodon.feather")
)

# Per-account move records.
moved_to <- arrow::read_feather(
  here("data/scratch/individual_moved_accounts.feather")
)

# Full account table, restricted at read time to the columns listed here.
accounts_unfilt <- arrow::read_feather(
  here("data/scratch/all_accounts.feather"),
  col_select = c(
    "server",
    "username",
    "created_at",
    "last_status_at",
    "statuses_count",
    "has_moved",
    "bot",
    "suspended",
    "following_count",
    "followers_count",
    "locked",
    "noindex",
    "group",
    "discoverable"
  )
)
# Build the analysis sample of accounts.
#
# Steps, in order:
#   * drop bot accounts and accounts with no recorded last status;
#   * treat a missing `suspended` flag as FALSE;
#   * keep accounts created from 2020-10-01 up to (not including) 2024-01-01;
#   * require at least one status and a last status not earlier than creation;
#   * flag accounts whose last status is on/after 2024-01-01 as `active` and
#     cap their `last_status_at` at that instant;
#   * `active_time` is the span from creation to (capped) last status, in days.
accounts <- accounts_unfilt %>%
  filter(!bot) %>%
  # TODO: what's going on here?
  filter(!is.na(last_status_at)) %>%
  mutate(suspended = replace_na(suspended, FALSE)) %>%
  # sanity check
  filter(created_at >= "2020-10-01") %>%
  filter(created_at < "2024-01-01") %>%
  # We don't want accounts that were created and then immediately stopped
  # being active
  filter(statuses_count >= 1) %>%
  filter(last_status_at >= created_at) %>%
  mutate(active = last_status_at >= "2024-01-01") %>%
  # if_else() instead of base ifelse(): ifelse() drops the date-time class and
  # would leave `last_status_at` as a bare number of seconds, which difftime()
  # below only accepts without an explicit origin on recent R versions.
  # NOTE(review): assumes `last_status_at` is stored as a date-time column --
  # confirm against the feather schema.
  mutate(
    last_status_at = if_else(
      active,
      lubridate::ymd_hms("2024-01-01 00:00:00", tz = "UTC"),
      last_status_at
    )
  ) %>%
  mutate(active_time = difftime(last_status_at, created_at, units = "days")) #%>%
  #filter(!has_moved)
# Weekly cohort summary in long format: one row per (creation week, measure).
# Each measure is a proportion of the accounts created in that week; `count`
# (the cohort size) and `Suspended` stay as ordinary columns next to it.
acc_data <- accounts %>%
  #filter(!has_moved) %>%
  mutate(
    created_month = format(created_at, "%Y-%m"),
    created_week = floor_date(created_at, unit = "week"),
    # keep the "last status on/after the cut-off" flag under a new name ...
    active_now = active,
    # ... and redefine `active` as "active_time of at least 45 (days)"
    active = active_time >= 45,
    `Is mastodon.social` = server == "mastodon.social",
    # `jm` on the right-hand side is still the joinmastodon data frame here
    jm = server %in% jm$domain
  ) %>%
  group_by(created_week) %>%
  summarize(
    `JoinMastodon Server` = sum(jm) / n(),
    `Is mastodon.social` = sum(`Is mastodon.social`) / n(),
    Suspended = sum(suspended) / n(),
    # NOTE(review): both "active" shares subtract every moved and every
    # suspended account from the numerator, which is exact only if all of
    # them count as active and none is both moved and suspended -- confirm
    # this is the intended definition.
    Active = (sum(active) - sum(has_moved) - sum(suspended)) /
      (n() - sum(has_moved) - sum(suspended)),
    active_now = (sum(active_now) - sum(has_moved) - sum(suspended)) /
      (n() - sum(has_moved) - sum(suspended)),
    Moved = sum(has_moved) / n(),
    count = n()
  ) %>%
  pivot_longer(
    cols = c(
      "JoinMastodon Server", "active_now", "Active", "Moved",
      "Is mastodon.social"
    ), # "Suspended"
    names_to = "Measure",
    values_to = "value"
  )
# Upper panel: one line (plus points) per measure, by account-creation week.
# The x axis title, labels and ticks are blanked out on this panel.
p1 <- ggplot(acc_data, aes(x = as.Date(created_week), group = 1)) +
  geom_line(aes(y = value, group = Measure, color = Measure)) +
  geom_point(aes(y = value, color = Measure), size = 0.7) +
  scale_y_continuous(limits = c(0, 1.0)) +
  labs(y = "Proportion") +
  scale_x_date(labels = date_format("%Y-%U"), breaks = "8 week") +
  theme_bw_small_labels() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Lower panel: number of accounts created per week, with dashed vertical
# rules at four fixed dates.
#
# The rules are drawn by a single geom_vline() whose positions are passed as
# a fixed parameter. Putting a constant inside aes() (as four separate layers
# did before) makes the layer inherit the plot data and draw the same rule
# once for every row.
#
# https://twitter.com/elonmusk/status/1675187969420828672
# NOTE(review): this link sat next to the 2022-12-15 rule; it looks like it
# belongs to the 2023-07-01 one -- confirm.
p2 <- acc_data %>%
  distinct(created_week, count) %>%
  ggplot(aes(x = as.Date(created_week), y = count)) +
  geom_bar(stat = "identity", fill = "black") +
  geom_vline(
    xintercept = as.numeric(as.Date(c(
      "2022-10-27",
      "2022-04-14",
      "2022-12-15",
      "2023-07-01"
    ))),
    linetype = "dashed", color = "black"
  ) +
  #scale_y_continuous(limits = c(0, max(acc_data$count) + 100000)) +
  scale_y_continuous(labels = scales::comma) +
  labs(y = "Count", x = "Created Week") +
  theme_bw_small_labels() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_date(labels = date_format("%Y-%U"), breaks = "8 week")

# Stack the two panels in a single column.
library(patchwork)
p1 + p2 + plot_layout(ncol = 1)
```
|
|
|
|
## The Million Account Elephant in the Room
|
|
|
|
::::: {.columns}
|
|
|
|
::: {.column width="40%"}
|
|

|
|
:::
|
|
|
|
:::: {.column width="60%"}
|
|
|
|
::: {.smaller}
|
|
|
|
Mastodon.social (MS), the flagship server from the Mastodon developers, has always been the largest Mastodon server.
|
|
|
|
The server has been closed to new open registrations many times throughout the years.
|
|
|
|
:::
|
|
|
|
::::
|
|
|
|
:::::
|
|
|
|
## Closure and Opening of MS (2022) {.tiny}
|
|
|
|
|
|
```{r}
#| fig-width: 9
library(jsonlite)
library(here)
library(tidyverse)
library(tsibble)
library(fable)

# Not referenced anywhere else in the visible part of this file.
server_list <- c(
  "mastodon.social",
  "mastodon.online"
)

# Domains found in the joinmastodon snapshot dated 2020-09-18.
early.jm_servers <- as_tibble(
  fromJSON(here("data/joinmastodon-2020-09-18.json"))
)$domain

# Daily sign-up counts for accounts created before 2021-09-01, with servers
# collapsed into four series: "mastodon.social", "mastodon.online",
# "joinmastodon" (in the snapshot above) and "other". Days without sign-ups
# inside a series are filled with 0, and `open` flags the days that fall in
# either of the two date windows defined below.
early.day_counts <- accounts %>%
  filter(created_at < "2021-09-01") %>%
  mutate(
    created_day = as.Date(floor_date(created_at, unit = "day")),
    server_code = ifelse(
      server %in% early.jm_servers, "joinmastodon", "other"
    ),
    server_code = ifelse(
      server == "mastodon.social", "mastodon.social", server_code
    ),
    server = ifelse(
      server == "mastodon.online", "mastodon.online", server_code
    )
  ) %>%
  count(created_day, server, name = "count") %>%
  as_tsibble(key = server, index = created_day) %>%
  fill_gaps(count = 0) %>%
  mutate(
    first_open = (created_day >= "2020-09-18") & (created_day < "2020-11-01"),
    #second_open = ((created_day > "2020-11-02") & (created_day < "2020-11-05")),
    third_open = created_day >= "2021-04-17",
    open = first_open | third_open
  )
# Daily sign-ups per server series (one facet each, log10 y axis), with a
# light-blue band behind every day flagged `open`.
early.data_plot <- ggplot(
  mutate(
    early.day_counts,
    created_week = as.Date(floor_date(created_day, unit = "week"))
  ),
  aes(x = created_day, y = count)
) +
  # one day-wide rectangle per `open` day
  geom_rect(
    data = filter(early.day_counts, open),
    aes(
      xmin = created_day - 0.5,
      xmax = created_day + 0.5,
      ymin = 0,
      ymax = Inf
    ),
    fill = "lightblue",
    alpha = 0.3
  ) +
  geom_bar(stat = "identity") +
  facet_wrap(~ server, ncol = 1, strip.position = "left") + #, scales="free_y") +
  scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
  scale_y_log10() +
  labs(
    title = "Open registration periods on mastodon.social (August 2020 - August 2021)",
    x = "Account Created Date",
    y = "Count"
  ) +
  theme_bw_small_labels()
# Modelling table for the early period: log1p-transformed daily counts plus a
# linear time index `day`.
#
# `day` is the number of calendar days since the first date in the table
# (starting at 1), so it advances by exactly one per day and is shared by all
# server series. Using row_number() here would count rows across every server
# at once, so the index would advance by roughly the number of series per
# calendar day and would differ between series on the same date.
model_data <- early.day_counts %>%
  mutate(count = log1p(count)) %>%
  ungroup() %>%
  arrange(created_day) %>%
  mutate(day = as.integer(created_day - min(created_day)) + 1L)

# Regression with ARIMA errors: the `open` indicator, a linear trend, their
# interaction, and weekly Fourier terms, with the error structure fixed by
# pdq(2,0,0) and PDQ(0,0,0, period = 7).
fit <- model_data %>%
  model(
    arima = ARIMA(
      count ~ open + day + open:day + fourier(period = 7, K = 2) +
        pdq(2, 0, 0) + PDQ(0, 0, 0, period = 7)
    )
  )
# Coefficient table for the early-period models: one row per term, and for
# each server series an estimate column followed by a formatted p-value.
early.table <- fit %>%
  tidy() %>%
  mutate(p.value = scales::pvalue(p.value)) %>%
  pivot_wider(
    names_from = server,
    values_from = c(estimate, std.error, statistic, p.value)
  ) %>%
  # keeping only these columns also drops `.model`
  select(
    term,
    estimate_mastodon.online, p.value_mastodon.online,
    estimate_mastodon.social, p.value_mastodon.social,
    estimate_joinmastodon, p.value_joinmastodon,
    estimate_other, p.value_other
  ) %>%
  #select(term, starts_with("estimate"), starts_with("p.value")) #%>%
  knitr::kable(
    col.names = c(
      "Term",
      "mastodon.online", "",
      "mastodon.social", "",
      "joinmastodon", "",
      "other", ""
    ),
    digits = 4,
    align = c("l", rep("r", 8))
  )

# Show the plot on this slide.
early.data_plot
```
|
|
## Closure and Opening of MS (2022) {.tiny}
|
|
|
|
```{r}
|
|
# Render the coefficient table assigned to `early.table` in the preceding chunk.
early.table
|
|
```
|
|
|
|
## Closure and Opening of MS (Early 2023) {.tiny}
|
|
|
|
```{r}
#| fig-width: 9

library(jsonlite)
library(here)
library(tidyverse)
library(tsibble)
library(fable)

# Domains found in the joinmastodon snapshot dated 2023-08-25.
email.jm_servers <- as_tibble(
  fromJSON(here("data/joinmastodon-2023-08-25.json"))
)$domain

# Daily sign-up counts for accounts created after 2022-07-01 and before
# 2022-10-26, collapsed into three series: "mastodon.social", "joinmastodon"
# (in the snapshot above) and "other". Days without sign-ups inside a series
# are filled with 0; `open` is TRUE before 2022-08-13 and after 2022-10-03.
email.day_counts <- accounts %>%
  filter(
    created_at > "2022-07-01",
    created_at < "2022-10-26"
  ) %>%
  mutate(
    created_day = as.Date(floor_date(created_at, unit = "day")),
    server_code = ifelse(
      server %in% email.jm_servers, "joinmastodon", "other"
    ),
    server = ifelse(
      server == "mastodon.social", "mastodon.social", server_code
    )
  ) %>%
  #mutate(server = server_code) %>%
  #filter(server != "other") %>%
  count(created_day, server, name = "count") %>%
  as_tsibble(key = server, index = created_day) %>%
  fill_gaps(count = 0) %>%
  mutate(
    open = (created_day < "2022-08-13") | (created_day > "2022-10-03")
  )
# Daily sign-ups per server series (one facet each), with a light-blue band
# behind every day flagged `open`.
email.data_plot <- ggplot(
  #filter(server != "other") %>%
  mutate(
    email.day_counts,
    created_week = as.Date(floor_date(created_day, unit = "week"))
  ),
  aes(x = created_day, y = count)
) +
  # one day-wide rectangle per `open` day
  geom_rect(
    data = filter(email.day_counts, open),
    aes(
      xmin = created_day - 0.5, xmax = created_day + 0.5,
      ymin = 0, ymax = Inf
    ),
    fill = "lightblue", alpha = 0.3
  ) +
  geom_bar(stat = "identity") +
  facet_wrap(~ server, ncol = 1, strip.position = "left") + #, scales="free_y") +
  scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
  labs(
    title = "Closure of mastodon.social (2022)",
    x = "Account Created Date",
    y = "Count"
  ) +
  theme_bw_small_labels()

# Show the plot on this slide.
email.data_plot
```
|
|
|
|
## Closure and Opening of MS (Early 2023) {.tiny}
|
|
|
|
```{r}
# Modelling table for the 2022 closure period: log1p-transformed daily counts
# plus a linear time index `day`.
#
# `day` is the number of calendar days since the first date in the table
# (starting at 1), so it advances by exactly one per day and is shared by all
# server series. Using row_number() here would count rows across every server
# at once, so the index would advance by roughly the number of series per
# calendar day and would differ between series on the same date.
model_data <- email.day_counts %>%
  mutate(count = log1p(count)) %>%
  ungroup() %>%
  arrange(created_day) %>%
  mutate(day = as.integer(created_day - min(created_day)) + 1L)

# Regression with ARIMA errors: the `open` indicator, a linear trend, their
# interaction, and weekly Fourier terms, with the error structure fixed by
# pdq(2,0,0) and PDQ(0,0,0, period = 7).
fit <- model_data %>%
  model(
    arima = ARIMA(
      count ~ open + day + open:day + fourier(period = 7, K = 2) +
        pdq(2, 0, 0) + PDQ(0, 0, 0, period = 7)
    )
  )
# Coefficient table for the closure-period models: one row per term, and for
# each server series an estimate column followed by a formatted p-value.
email.table <- fit %>%
  tidy() %>%
  mutate(p.value = scales::pvalue(p.value)) %>%
  pivot_wider(
    names_from = server,
    values_from = c(estimate, std.error, statistic, p.value)
  ) %>%
  # keeping only these columns also drops `.model`
  select(
    term,
    estimate_mastodon.social, p.value_mastodon.social,
    estimate_joinmastodon, p.value_joinmastodon,
    estimate_other, p.value_other
  ) %>%
  knitr::kable(
    col.names = c(
      "Term",
      "mastodon.social", "",
      "joinmastodon", "",
      "other", ""
    ),
    digits = 4,
    align = c("l", rep("r", 6))
  )

# Render the table on this slide.
email.table
```
|
|
|
|
## A Change in Strategy
|
|
|
|
Mastodon has shifted away from _discouraging_ newcomers from using mastodon.social to using the flagship server as the default.
|
|
|
|
. . .
|
|
|
|
Today, almost half of new Mastodon accounts join mastodon.social
|
|
|
|
<!--- ## Do some servers retain newcomers better than others? --->
|
|
|
|
## A Change in Strategy
|
|
|
|

|
|
|
|
## Moving Accounts on Mastodon
|
|
|
|
+ Accounts can move freely between Mastodon servers
|
|
+ Moved accounts retain their followers (but not their posts)
|
|
|
|
## Are people moving to larger or smaller servers? {.tiny}
|
|
|
|
```{r}
#| label: tbl-ergm
#| tbl-cap: ERGM model output
#| cache: true

# Saved model objects; presumably these files provide `model.early` and
# `model.late`, which are used below -- confirm.
load(file = here("data/scratch/ergm-model-early.rda"))
load(file = here("data/scratch/ergm-model-late.rda"))
#library(gt)
library(kableExtra)
library(modelsummary)

# Each model is listed twice so that its estimate and its (starred) standard
# error land in two neighbouring columns instead of stacked rows.
modelsummary(
  list(
    "Coef." = model.early,
    "Std.Error" = model.early,
    "Coef." = model.late,
    "Std.Error" = model.late
  ),
  estimate = c(
    "{estimate}",
    "{stars}{std.error}",
    "{estimate}",
    "{stars}{std.error}"
  ),
  statistic = NULL,
  # hide all goodness-of-fit rows
  gof_omit = ".*",
  # readable labels for the model terms
  coef_rename = c(
    "sum" = "(Sum)",
    "diff.sum0.h-t.accounts" = "Smaller server",
    "nodeocov.sum.accounts" = "Server size (outgoing)",
    "nodeifactor.sum.registrations.TRUE" = "Open registrations (incoming)",
    "nodematch.sum.language" = "Languages match"
  ),
  align = "lrrrr",
  stars = c("*" = .05, "**" = 0.01, "***" = .001),
  output = "kableExtra"
) %>%
  # group the four value columns under the two model labels
  add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2))
```
|
|
|
|
## The Local Timeline: Mastodon's Secret Killer Feature
|
|
|
|
While discovery is challenging in decentralized online social networks, joining the right server can make it easier.
|
|
|
|
If you join a server focused on a particular topic or community of interest, you get a timeline about that topic without having to follow anyone.
|
|
|
|
## Challenges in Building Recommendation Systems on DOSNs {.small}
|
|
|
|
1. **Tensions around centralization**: a single service providing recommendations for all servers probably won't work.
|
|
1. **Local control**: system should be opt-in, server admins should be able to filter servers they accept data from.
|
|
1. **Computing power**: needs to be able to run on servers with limited resources.
|
|
|
|
## Concept: Use Hashtags
|
|
|
|
Advantages:
|
|
|
|
1. Already have their own table in the database.
|
|
2. Clear opt-in toward public visibility
|
|
|
|
## Design
|
|
|
|
For the most popular tags by their local users, each server reports:
|
|
|
|
1. A list of top tags
|
|
2. The number of accounts using each tag in the last 6 months
|
|
3. The number of accounts using any tag on the server.
|
|
|
|
. . .
|
|
|
|
Weigh the model using term frequency-inverse document frequency (TF-IDF)
|
|
|
|
## Challenge
|
|
|
|
How many servers do we need?
|
|
|
|
How many tags do they need to report?
|
|
|
|
## Baseline comparison
|
|
|
|
+ Data from all servers with over 100 accounts using hashtags.
|
|
+ Use cosine similarity to find pairwise similarity between all servers.
|
|
+ Compare to simulations with limits on the number of servers and number of tags reported.
|
|
|
|
Comparison metric: rank biased overlap (RBO).
|
|
|
|
## Overlap with Baseline in Various Simulations
|
|
|
|
```{r}
#| label: fig-simulations-rbo
#| fig-width: 10

# Simulation results; the columns used below are `servers`, `tags`, `run`
# and `rbo`.
simulations <- arrow::read_ipc_file(
  here("data/scratch/simulation_rbo.feather")
)

# Mean RBO per (servers, tags, run), with the tag count expressed as an
# integer power of two for the x axis.
simulation_means <- simulations %>%
  group_by(servers, tags, run) %>%
  summarize(rbo = mean(rbo), .groups = "drop") %>%
  mutate(ltags = as.integer(log2(tags)))

# One box per tag level, one facet per number of servers.
ggplot(
  simulation_means,
  aes(x = factor(ltags), y = rbo, fill = factor(ltags))
) +
  geom_boxplot() +
  facet_wrap(~servers, nrow = 1) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    x = "Tags (log2)",
    y = "RBO",
    title = "Rank Biased Overlap with Baseline Rankings by Number of Servers"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
```
|
|
|
|
## Example Recommendation System
|
|
|
|
+ Use just servers from joinmastodon.org
|
|
+ Ask for preferences from a bag of common tags.
|
|
+ Suggest top servers according to similarity.
|
|
|
|
## User 1: education, science, academia
|
|
|
|
Top suggestions:
|
|
|
|
+ mathstodon.xyz
|
|
+ sciences.social
|
|
+ mastodon.education
|
|
+ hcommons.social
|
|
+ mas.to
|
|
|
|
## User 2: tech, linux, hacking
|
|
|
|
Top suggestions:
|
|
|
|
+ snabelen.no
|
|
+ social.anoxinon.de
|
|
+ peoplemaking.games
|
|
+ mastodon.gamedev.place
|
|
+ discuss.systems
|