# Test
```{r}
#| label: fig-moved-accounts
#| fig-height: 4
#| eval: false
moved_accounts <- arrow::read_feather("data/scratch/moved_accounts.feather")
popular_servers <- arrow::read_feather("data/scratch/popular_servers.feather")

# Accounts that moved away from each server
outflows <- moved_accounts %>%
  group_by(server) %>%
  summarize(out_count = sum(count))

# Accounts that moved onto each server
inflows <- moved_accounts %>%
  group_by(moved_server) %>%
  summarize(in_count = sum(count)) %>%
  rename(server = moved_server)

# Net movement per server; servers absent from one side moved 0 accounts
# in that direction, hence the replace_na() after the join.
server_movement_data <-
  left_join(outflows, inflows, by = "server") %>%
  replace_na(list(out_count = 0, in_count = 0)) %>%
  mutate(diff = in_count - out_count) %>%
  arrange(diff) %>%
  left_join(popular_servers, by = "server") %>%
  rename(user_count = count) %>%
  arrange(desc(user_count))

# Net inflow vs. server size (log-scaled x axis)
server_movement_data %>%
  ggplot(aes(x = user_count, y = diff)) +
  geom_point() +
  scale_x_log10() +
  theme_bw_small_labels()
```
If there was no relationship, we would expect these jumps to be random with respect to server size.
```{r}
popular_servers <-
  arrow::read_feather("data/scratch/popular_servers.feather")
moved_accounts <-
  arrow::read_feather("data/scratch/moved_accounts.feather") %>%
  # Remove loops (accounts "moving" to the server they already live on)
  filter(server != moved_server)
activity <-
  arrow::read_feather("data/scratch/activity.feather",
                      col_select = c("server", "logins")) %>%
  arrange(desc(logins))

# Log-scale server sizes; count >= 1 guards against log10(0) = -Inf
popular_and_large_servers <-
  popular_servers %>%
  filter(count >= 1) %>%
  mutate(count = log10(count))

jm <- arrow::read_feather("data/scratch/joinmastodon.feather")

# Keep only moves where both endpoints have a known size
ma <- moved_accounts %>%
  filter(server %in% popular_and_large_servers$server) %>%
  filter(moved_server %in% popular_and_large_servers$server)

# Construct network: directed edges between servers, one vertex per server
edgeNet <- network(ma, matrix.type = "edgelist")

# Vertex attribute: log10 user count, aligned to the network's vertex order.
# Build the lookup keyed on the network's own vertex names so the join
# preserves that order, then pull() the joined column (idiomatic dplyr
# extraction, replacing select() %>% unlist()).
edgeNet %v% "user_count" <-
  tibble(server = edgeNet %v% "vertex.names") %>%
  left_join(popular_and_large_servers, by = "server") %>%
  pull(count)

# Vertex attribute: whether the server is listed on joinmastodon.org.
# A direct %in% test gives the same logical vector as the previous
# as_tibble()/mutate()/select()/unlist() pipeline.
edgeNet %v% "in_jm" <- (edgeNet %v% "vertex.names") %in% jm$domain
```
We construct an exponential family random graph model (ERGM) where nodes represent servers and weighted directed edges represent the number of accounts that moved between servers.
$$
\begin{aligned}
\text{Sum}_{i,j} = \ & \beta_1 \left( \log_{10}(\text{user count}_j) - \log_{10}(\text{user count}_i) \right) + \\
& \beta_2 \left( \log_{10}(\text{user count}_i) + \log_{10}(\text{user count}_j) \right) + \\
& \beta_3 \, \mathbb{1}\!\left[ \text{in JM}_i \wedge \text{in JM}_j \right] + \\
& \beta_4 \, \mathbb{1}\!\left[ \neg\text{in JM}_i \wedge \neg\text{in JM}_j \right]
\end{aligned}
$$
```{r}
#| label: ergm-model
#| cache: true
# Valued ERGM on the account-movement network. Terms:
#   sum                       -- intercept for edge weights
#   diff("user_count")        -- difference in (log10) size between endpoints
#   nodecov("user_count")     -- overall (log10) size of the endpoints
#   nodematch("in_jm", diff)  -- both endpoints (not) listed on joinmastodon
model_formula <-
  edgeNet ~ sum +
    diff("user_count", pow = 1, form = "sum") +
    nodecov("user_count", form = "sum") +
    nodematch("in_jm", diff = TRUE, form = "sum")

m1 <- ergm(
  model_formula,
  response = "count",
  reference = ~ Binomial(3),
  control = control.ergm(parallel = 4, parallel.type = "PSOCK")
)

# Persist the fitted model so the reporting chunk can reload it
save(m1, file = "data/scratch/ergm-model.rda")
```
```{r}
#| label: tag-ergm-result
#| output: asis
# load() restores the saved objects (here: m1) into the current environment
# as a side effect; its return value is only a character vector of names,
# so capturing it in a variable was misleading and is dropped.
load("data/scratch/ergm-model.rda")

# NOTE: the trailing comma after coef_rename was removed -- in R it passes
# an empty (missing) argument through `...`, which can error at run time.
modelsummary(
  m1,
  escape = FALSE,
  coef_rename = c(
    "sum" = "$\\beta_0$ Intercept",
    "diff.sum.t-h.user_count" = "$\\beta_1$ User Count Difference",
    "nodecov.sum.user_count" = "$\\beta_2$ User Count (Node Covariate)",
    "nodematch.sum.in_jm.TRUE" = "$\\beta_3$ In JoinMastodon (Both True)",
    "nodematch.sum.in_jm.FALSE" = "$\\beta_4$ In JoinMastodon (Both False)"
  )
)
```
We find a strong preference for accounts to move from large servers to smaller servers.
```{python}
|
|
#| eval: false
|
|
#| include: false
|
|
import random
|
|
|
|
def simulate_account_moves(origin: str, servers: dict, n: int):
|
|
server_list = list(set(servers.keys()) - {origin})
|
|
weights = [servers[x] for x in server_list]
|
|
return pl.DataFrame({
|
|
"simulation": list(range(n)),
|
|
"server": [origin] * n,
|
|
"moved_server": random.choices(server_list, weights=weights, k=n)
|
|
})
|
|
|
|
simulations = pl.concat([simulate_account_moves(row["server"], {x["server"]: x["count"] for x in popular_servers.iter_rows(named=True)}, 1000) for row in maccounts.iter_rows(named=True)])
|
|
m_counts = maccounts.join(popular_servers, how="inner", on="server").rename({"count": "origin_count"}).join(popular_servers.rename({"server": "moved_server"}), how="inner", on="moved_server").rename({"count": "target_count"})
|
|
sim_counts = simulations.join(popular_servers, how="inner", on="server").rename({"count": "origin_count"}).join(popular_servers.rename({"server": "moved_server"}), how="inner", on="moved_server").rename({"count": "target_count"})
|
|
```
|