---
title: "Onboarding the Fediverse"
subtitle: "Building community discovery in decentralized online social networks"
author: "Carl Colglazier"
format:
  revealjs:
    theme: presentation.scss
    keep-md: true
knitr:
  opts_chunk:
    dev: "ragg_png"
    retina: 1
    dpi: 200
execute:
  freeze: auto
  cache: true
  echo: false
fig-width: 5
fig-height: 6
---

## Growth on the Fediverse

```{r}
#| label: fig-account-timeline
#| fig-height: 3
#| fig-width: 6.75
library(arrow)
library(tidyverse)
library(lubridate)
library(scales)
library(here)
library(patchwork)

# NOTE(review): theme_bw_small_labels() is not defined in this document;
# presumably it comes from this helper file -- confirm.
source(here("code/helpers.R"))

jm <- arrow::read_feather(here("data/scratch/joinmastodon.feather"))
moved_to <- arrow::read_feather(here("data/scratch/individual_moved_accounts.feather"))
accounts_unfilt <- arrow::read_feather(
  here("data/scratch/all_accounts.feather"),
  col_select = c(
    "server", "username", "created_at", "last_status_at", "statuses_count",
    "has_moved", "bot", "suspended", "following_count", "followers_count",
    "locked", "noindex", "group", "discoverable"
  )
)

# Analysis population: non-bot accounts created between 2020-10-01 and
# 2024-01-01 that posted at least once. `active` flags accounts whose last
# status is on/after 2024-01-01; their last_status_at is then capped at that
# date so active_time is measured over the same window for everyone.
accounts <- accounts_unfilt %>%
  filter(!bot) %>%
  # TODO: what's going on here?
  filter(!is.na(last_status_at)) %>%
  mutate(suspended = replace_na(suspended, FALSE)) %>%
  # sanity check
  filter(created_at >= "2020-10-01") %>%
  filter(created_at < "2024-01-01") %>%
  # We don't want accounts that were created and then immediately stopped being active
  filter(statuses_count >= 1) %>%
  filter(last_status_at >= created_at) %>%
  mutate(active = last_status_at >= "2024-01-01") %>%
  # if_else() rather than base ifelse(): ifelse() drops the date-time class
  # and returns a bare number, which difftime() below then has to re-interpret
  # (and which fails outright on R versions where as.POSIXct() on a number
  # requires an origin).
  # NOTE(review): assumes last_status_at is read as a date-time -- confirm.
  mutate(
    last_status_at = if_else(
      active,
      lubridate::ymd_hms("2024-01-01 00:00:00", tz = "UTC"),
      last_status_at
    )
  ) %>%
  mutate(active_time = difftime(last_status_at, created_at, units = "days")) #%>%
  #filter(!has_moved)

# Weekly cohort summary, reshaped to one row per (week, measure) for plotting.
# From here on `active` means "active_time of at least 45 days"; the
# cutoff-based flag computed above is kept as `active_now`.
acc_data <- accounts %>%
  #filter(!has_moved) %>%
  mutate(created_week = floor_date(created_at, unit = "week")) %>%
  mutate(active_now = active) %>%
  mutate(active = active_time >= 45) %>%
  mutate("Is mastodon.social" = server == "mastodon.social") %>%
  mutate(jm = server %in% jm$domain) %>%
  group_by(created_week) %>%
  summarize(
    `JoinMastodon Server` = sum(jm) / n(),
    `Is mastodon.social` = sum(`Is mastodon.social`) / n(),
    Suspended = sum(suspended) / n(),
    # Moved and suspended accounts are subtracted from both the numerator and
    # the denominator of the two activity measures.
    Active = (sum(active) - sum(has_moved) - sum(suspended)) /
      (n() - sum(has_moved) - sum(suspended)),
    active_now = (sum(active_now) - sum(has_moved) - sum(suspended)) /
      (n() - sum(has_moved) - sum(suspended)),
    Moved = sum(has_moved) / n(),
    count = n()
  ) %>%
  pivot_longer(
    cols = c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"),
    names_to = "Measure",
    values_to = "value"
  ) # "Suspended"

# Top panel: weekly proportions, one line per measure. The x axis text is
# hidden because the bottom panel carries the week labels.
p1 <- acc_data %>%
  ggplot(aes(x = as.Date(created_week), group = 1)) +
  geom_line(aes(y = value, group = Measure, color = Measure)) +
  geom_point(aes(y = value, color = Measure), size = 0.7) +
  scale_y_continuous(limits = c(0, 1.0)) +
  labs(y = "Proportion") +
  scale_x_date(labels = date_format("%Y-%U"), date_breaks = "8 week") +
  theme_bw_small_labels() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Dates marked with dashed vertical lines in the bottom panel.
# The original source carried this reference next to the markers:
# https://twitter.com/elonmusk/status/1675187969420828672
# NOTE(review): confirm which of the dates it documents.
event_dates <- as.Date(c("2022-10-27", "2022-04-14", "2022-12-15", "2023-07-01"))

# Bottom panel: number of accounts created per week.
p2 <- acc_data %>%
  distinct(created_week, count) %>%
  ggplot(aes(x = as.Date(created_week), y = count)) +
  geom_col(fill = "black") +
  # Constant intercepts are passed as a parameter, not mapped through aes(),
  # so each marker is drawn once instead of once per data row.
  geom_vline(
    xintercept = as.numeric(event_dates),
    linetype = "dashed",
    color = "black"
  ) +
  #scale_y_continuous(limits = c(0, max(acc_data$count) + 100000)) +
  scale_y_continuous(labels = scales::comma) +
  labs(y = "Count", x = "Created Week") +
  theme_bw_small_labels() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_date(labels = date_format("%Y-%U"), date_breaks = "8 week")

p1 + p2 + plot_layout(ncol = 1)
```

## The Million Account Elephant in the Room

::::: {.columns}

::: {.column width="40%"}
![](images/mastodon-social-signups-2020-11-01.png)
:::

:::: {.column width="60%"}

::: {.smaller}
Mastodon.social (MS), the flagship server from the Mastodon developers, has always been the largest Mastodon server.

The server has been closed to new open registrations many times throughout the years.
:::
::::
:::::

## Closure and Opening of MS (2022) {.tiny}

```{r}
#| fig-width: 9
library(jsonlite)
library(here)
library(tidyverse)
library(tsibble)
library(fable)

server_list <- c(
  "mastodon.social",
  "mastodon.online"
)

# Add the modelling columns to a daily-count tsibble (key = server,
# index = created_day): counts are log1p-transformed and `day` is the number
# of days since the first date in the table, starting at 1.
#
# `day` is derived from the date itself. Numbering the rows of the whole
# table would count rows across every server's series, so the `day` and
# `open:day` coefficients would be scaled by the number of series and would
# not be comparable between tables built from different numbers of series.
prepare_registration_data <- function(day_counts) {
  day_counts %>%
    mutate(
      count = log1p(count),
      day = as.integer(created_day - min(created_day)) + 1L
    )
}

# Fit one ARIMA model per server: level and trend shifts while registration
# is `open`, weekly Fourier terms, and fixed pdq(2,0,0) errors with no
# seasonal ARIMA part.
fit_registration_arima <- function(model_data) {
  model_data %>%
    model(
      arima = ARIMA(
        count ~ open + day + open:day + fourier(period = 7, K = 2) +
          pdq(2, 0, 0) + PDQ(0, 0, 0, period = 7)
      )
    )
}

# Build the coefficient table: one row per model term and, for each entry of
# `servers` (in the order given), an estimate column followed by a formatted
# p-value column. Only the estimate column carries the server name as header.
registration_coef_table <- function(fit, servers) {
  value_cols <- paste0(c("estimate_", "p.value_"), rep(servers, each = 2))
  fit %>%
    tidy() %>%
    mutate(p.value = scales::pvalue(p.value)) %>%
    pivot_wider(
      names_from = server,
      values_from = c(estimate, std.error, statistic, p.value)
    ) %>%
    select(term, all_of(value_cols)) %>%
    #select(term, starts_with("estimate"), starts_with("p.value")) #%>%
    knitr::kable(
      col.names = c("Term", as.vector(rbind(servers, ""))),
      digits = 4,
      align = c("l", rep("r", length(value_cols)))
    )
}

early.jm_servers <- as_tibble(fromJSON(here("data/joinmastodon-2020-09-18.json")))$domain

# Daily sign-up counts before 2021-09-01 for four series: mastodon.social,
# mastodon.online, servers in the 2020-09-18 joinmastodon list, and all
# others. Days without sign-ups are filled in with a count of 0.
early.day_counts <- accounts %>%
  filter(created_at < "2021-09-01") %>%
  mutate(created_day = as.Date(floor_date(created_at, unit = "day"))) %>%
  mutate(server_code = ifelse(server %in% early.jm_servers, "joinmastodon", "other")) %>%
  mutate(server_code = ifelse(server == "mastodon.social", "mastodon.social", server_code)) %>%
  mutate(server = ifelse(server == "mastodon.online", "mastodon.online", server_code)) %>%
  group_by(created_day, server) %>%
  summarize(count = n(), .groups = "drop") %>%
  as_tsibble(., key = server, index = created_day) %>%
  fill_gaps(count = 0) %>%
  mutate(first_open = ((created_day >= "2020-09-18") & (created_day < "2020-11-01"))) %>%
  #mutate(second_open = ((created_day > "2020-11-02") & (created_day < "2020-11-05"))) %>%
  mutate(third_open = (created_day >= "2021-04-17")) %>%
  mutate(open = (first_open | third_open))

# Daily counts per series (log scale), with the days flagged `open` shaded.
early.data_plot <- early.day_counts %>%
  ggplot(aes(x = created_day, y = count)) +
  geom_rect(
    data = (early.day_counts %>% filter(open)),
    aes(
      xmin = created_day - 0.5,
      xmax = created_day + 0.5,
      ymin = 0,
      ymax = Inf
    ),
    fill = "lightblue",
    alpha = 0.3
  ) + # Adjust color and transparency as needed
  geom_col() +
  facet_wrap(~ server, ncol = 1, strip.position = "left") + #, scales="free_y") +
  scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
  scale_y_log10() +
  labs(
    title = "Open registration periods on mastodon.social (August 2020 - August 2021)",
    x = "Account Created Date",
    y = "Count"
  ) +
  theme_bw_small_labels()

model_data <- prepare_registration_data(early.day_counts)
fit <- fit_registration_arima(model_data)
early.table <- registration_coef_table(
  fit,
  c("mastodon.online", "mastodon.social", "joinmastodon", "other")
)

early.data_plot
```

## Closure and Opening of MS (2022) {.tiny}

```{r}
early.table
```

## Closure and Opening of MS (Early 2023) {.tiny}

```{r}
#| fig-width: 9
library(jsonlite)
library(here)
library(tidyverse)
library(tsibble)
library(fable)

email.jm_servers <- as_tibble(fromJSON(here("data/joinmastodon-2023-08-25.json")))$domain

# Daily sign-up counts between 2022-07-01 and 2022-10-26 for three series:
# mastodon.social, servers in the 2023-08-25 joinmastodon list, and all
# others. `open` is FALSE from 2022-08-13 through 2022-10-03.
email.day_counts <- accounts %>%
  filter(created_at > "2022-07-01") %>%
  filter(created_at < "2022-10-26") %>%
  mutate(created_day = as.Date(floor_date(created_at, unit = "day"))) %>%
  mutate(server_code = ifelse(server %in% email.jm_servers, "joinmastodon", "other")) %>%
  mutate(server = ifelse(server == "mastodon.social", "mastodon.social", server_code)) %>%
  #mutate(server = server_code) %>%
  #filter(server != "other") %>%
  group_by(created_day, server) %>%
  summarize(count = n(), .groups = "drop") %>%
  as_tsibble(., key = server, index = created_day) %>%
  fill_gaps(count = 0) %>%
  mutate(open = ((created_day < "2022-08-13") | (created_day > "2022-10-03")))

# Daily counts per series, with the days flagged `open` shaded.
email.data_plot <- email.day_counts %>%
  #filter(server != "other") %>%
  ggplot(aes(x = created_day, y = count)) +
  geom_rect(
    data = (email.day_counts %>% filter(open)),
    aes(
      xmin = created_day - 0.5,
      xmax = created_day + 0.5,
      ymin = 0,
      ymax = Inf
    ),
    fill = "lightblue",
    alpha = 0.3
  ) + # Adjust color and transparency as needed
  geom_col() +
  facet_wrap(~ server, ncol = 1, strip.position = "left") + #, scales="free_y") +
  scale_x_date(expand = c(0, 0), date_labels = "%B %Y") +
  labs(
    title = "Closure of mastodon.social (2022)",
    x = "Account Created Date",
    y = "Count"
  ) +
  theme_bw_small_labels()

email.data_plot
```

## Closure and Opening of MS (Early 2023) {.tiny}

```{r}
# Same model and table layout as for the earlier period, applied to the 2022
# counts (three series instead of four).
model_data <- prepare_registration_data(email.day_counts)
fit <- fit_registration_arima(model_data)
email.table <- registration_coef_table(
  fit,
  c("mastodon.social", "joinmastodon", "other")
)

email.table
```

## A Change in Strategy

Mastodon has shifted away from _discouraging_ newcomers from using mastodon.social to using the flagship server as the default.

. . .

Today, almost half of new Mastodon accounts join mastodon.social.

## A Change in Strategy

![](images/joinmastodon-screenshot.png)

## Moving Accounts on Mastodon

+ Accounts can move freely between Mastodon servers
+ Moved accounts retain their followers (but not their posts)

## Are people moving to larger or smaller servers? {.tiny}

```{r}
#| label: tbl-ergm
#| tbl-cap: ERGM model output
#| cache: true
# NOTE(review): model.early and model.late are not created in this document;
# presumably these two files provide them -- confirm.
load(file = here("data/scratch/ergm-model-early.rda"))
load(file = here("data/scratch/ergm-model-late.rda"))

#library(gt)
library(kableExtra)
library(modelsummary)

# Each model is listed twice so that its estimate and its standard error
# (prefixed with significance stars) land in two adjacent columns; the header
# row added at the end groups each pair under a model label.
modelsummary(
  list(
    "Coef." = model.early,
    "Std.Error" = model.early,
    "Coef." = model.late,
    "Std.Error" = model.late
  ),
  estimate = c("{estimate}", "{stars}{std.error}", "{estimate}", "{stars}{std.error}"),
  statistic = NULL,
  gof_omit = ".*",
  coef_rename = c(
    "sum" = "(Sum)",
    "diff.sum0.h-t.accounts" = "Smaller server",
    "nodeocov.sum.accounts" = "Server size (outgoing)",
    "nodeifactor.sum.registrations.TRUE" = "Open registrations (incoming)",
    "nodematch.sum.language" = "Languages match"
  ),
  align = "lrrrr",
  stars = c('*' = .05, '**' = 0.01, '***' = .001),
  output = "kableExtra"
) %>%
  add_header_above(c(" " = 1, "Model A" = 2, "Model B" = 2))
```

## The Local Timeline: Mastodon's Secret Killer Feature

While discovery is challenging in decentralized online social networks, joining the right server can make it easier.
If you join a server focused on a particular topic or community of interest, you get a timeline about that topic without having to follow anyone.

## Challenges in Building Recommendation Systems on DOSNs {.small}

1. **Tensions around centralization**: a single service providing recommendations for all servers probably won't work.
1. **Local control**: system should be opt-in, server admins should be able to filter servers they accept data from.
1. **Computing power**: needs to be able to run on servers with limited resources.

## Concept: Use Hashtags

Advantages:

1. Already have their own table in the database.
2. Clear opt-in toward public visibility

## Design

For the most popular tags by their local users, each server reports:

1. A list of top tags
2. The number of accounts using each tag in the last 6 months
3. The number of accounts using any tag on the server.

. . .

Weight the model using term frequency-inverse document frequency (TF-IDF)

## Challenge

How many servers do we need?
How many tags do they need to report?

## Baseline comparison

+ Data from all servers with over 100 accounts using hashtags.
+ Use cosine similarity to find pairwise similarity between all servers.
+ Compare to simulations with limits on the number of servers and number of tags reported.

Comparison metric: rank-biased overlap (RBO).

## Overlap with Baseline in Various Simulations

```{r}
#| label: fig-simulations-rbo
#| fig-width: 10
simulations <- arrow::read_ipc_file(here("data/scratch/simulation_rbo.feather"))

# Average RBO within each (servers, tags, run) combination, then draw one
# box per tag count (shown as its integer log2) in one panel per server count.
simulations %>%
  group_by(servers, tags, run) %>%
  summarize(rbo = mean(rbo), .groups = "drop") %>%
  mutate(ltags = as.integer(log2(tags))) %>%
  ggplot(aes(x = factor(ltags), y = rbo, fill = factor(ltags))) +
  geom_boxplot() +
  facet_wrap(~servers, nrow = 1) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    x = "Tags (log2)",
    y = "RBO",
    title = "Rank Biased Overlap with Baseline Rankings by Number of Servers"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
```

## Example Recommendation System

+ Use just servers from joinmastodon.org
+ Ask for preferences from a bag of common tags.
+ Suggest top servers according to similarity.

## User 1: education, science, academia

Top suggestions:

+ mathstodon.xyz
+ sciences.social
+ mastodon.education
+ hcommons.social
+ mas.to

## User 2: tech, linux, hacking

Top suggestions:

+ snabelen.no
+ social.anoxinon.de
+ peoplemaking.games
+ mastodon.gamedev.place
+ discuss.systems