junior-sheer/notebooks/archive/_moved_old.qmd
2024-02-27 14:08:11 -06:00

125 lines
4.4 KiB
Plaintext

# Test
```{r}
#| label: fig-moved-accounts
#| fig-height: 4
#| eval: false
# Load the account-move records and the per-server account counts.
moved_accounts <- arrow::read_feather("data/scratch/moved_accounts.feather")
popular_servers <- arrow::read_feather("data/scratch/popular_servers.feather")

# Total `count` over rows where the server is the origin (`server`).
outbound_moves <- moved_accounts %>%
  group_by(server) %>%
  summarize(out_count = sum(count))

# Total `count` over rows where the server is the destination (`moved_server`),
# re-keyed to `server` so both tables join on the same column.
inbound_moves <- moved_accounts %>%
  group_by(moved_server) %>%
  summarize(in_count = sum(count)) %>%
  rename(server = moved_server)

# Use a full join so servers that appear on only one side of a move are kept:
# with a left join, servers that only *receive* accounts were silently dropped
# and the `out_count = 0` replacement below could never apply.
server_movement_data <-
  full_join(outbound_moves, inbound_moves, by = "server") %>%
  replace_na(list(out_count = 0, in_count = 0)) %>%
  # Net flow: positive means more accounts arrived than left.
  mutate(diff = in_count - out_count) %>%
  left_join(popular_servers, by = "server") %>%
  rename(user_count = count) %>%
  # Largest servers first; ties are ordered by ascending net flow.
  arrange(desc(user_count), diff)

# Net flow against server size (log-scaled x axis).
server_movement_data %>%
  ggplot(aes(x = user_count, y = diff)) +
  geom_point() +
  scale_x_log10() +
  theme_bw_small_labels()
```
If there were no relationship between server size and account movement, we would expect these moves to be random with respect to server size.
```{r}
# ---- Load inputs ----
popular_servers <- arrow::read_feather("data/scratch/popular_servers.feather")

# Account moves, without self-loops (rows whose origin equals destination).
moved_accounts <- filter(
  arrow::read_feather("data/scratch/moved_accounts.feather"),
  server != moved_server
)

# Per-server login activity, highest first.
activity <- arrow::read_feather(
  "data/scratch/activity.feather",
  col_select = c("server", "logins")
) %>%
  arrange(desc(logins))

# Servers with at least one account; `count` is replaced by log10(count).
popular_and_large_servers <- popular_servers %>%
  filter(count >= 1) %>%
  mutate(count = log10(count))

jm <- arrow::read_feather("data/scratch/joinmastodon.feather")

# Keep only moves whose origin and destination are both in the server list.
kept_servers <- popular_and_large_servers$server
ma <- moved_accounts %>%
  filter(server %in% kept_servers, moved_server %in% kept_servers)

# ---- Construct network ----
edgeNet <- network(ma, matrix.type = "edgelist")

# Vertex attribute: log10 account count, looked up by vertex name.
vertex_servers <- as_tibble(edgeNet %v% "vertex.names") %>%
  rename(server = value)
edgeNet %v% "user_count" <- vertex_servers %>%
  left_join(popular_and_large_servers, by = "server") %>%
  select(count) %>%
  unlist()

# Vertex attribute: whether the vertex name appears in `jm$domain`.
edgeNet %v% "in_jm" <- as_tibble(edgeNet %v% "vertex.names") %>%
  mutate(in_jm = value %in% jm$domain) %>%
  select(in_jm) %>%
  unlist()
```
We construct an exponential family random graph model (ERGM) where nodes represent servers and weighted directed edges represent the number of accounts that moved between servers.
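$$
\begin{aligned}
\text{Sum}_{i,j} = \ & \beta_0 + \\
& \beta_1 (\log_{10}(\text{user count}_j) - \log_{10}(\text{user count}_i)) + \\
& \beta_2 (\log_{10}(\text{user count}_i) + \log_{10}(\text{user count}_j)) + \\
& \beta_3 \, \mathbb{1}[i \text{ and } j \text{ both in JoinMastodon}] + \\
& \beta_4 \, \mathbb{1}[\text{neither } i \text{ nor } j \text{ in JoinMastodon}]
\end{aligned}
$$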
```{r}
#| label: ergm-model
#| cache: true
# Fit a valued ERGM on the account-move network. Edge values are taken from
# the "count" edge attribute; every term uses the "sum" form.
# NOTE(review): a Binomial(3) reference presumably restricts edge values to
# 0..3 -- confirm that `count` stays within that range.
m1 <- ergm(
  edgeNet ~ sum +
    diff("user_count", pow = 1, form = "sum") +
    nodecov("user_count", form = "sum") +
    nodematch("in_jm", diff = TRUE, form = "sum"),
  response = "count",
  reference = ~ Binomial(3),
  control = control.ergm(parallel = 4, parallel.type = "PSOCK")
)

# Persist the fitted model so later chunks can load it without refitting.
save(m1, file = "data/scratch/ergm-model.rda")
```
```{r}
#| label: tag-ergm-result
#| output: asis
# load() restores the saved objects into the current environment and returns
# their *names* (a character vector). `ergm_model` therefore holds names, not
# the fitted model; the model itself is available as `m1`.
ergm_model <- load("data/scratch/ergm-model.rda")
# Fail early if the file did not actually provide the model rendered below.
stopifnot("m1" %in% ergm_model)

# Render the coefficient table with LaTeX labels (escape = FALSE keeps the
# `$...$` math intact). The trailing empty argument of the original call has
# been removed.
modelsummary(
  m1,
  escape = FALSE,
  coef_rename = c(
    "sum" = "$\\beta_0$ Intercept",
    "diff.sum.t-h.user_count" = "$\\beta_1$ User Count Difference",
    "nodecov.sum.user_count" = "$\\beta_2$ User Count (Node Covariate)",
    "nodematch.sum.in_jm.TRUE" = "$\\beta_3$ In JoinMastodon (Both True)",
    "nodematch.sum.in_jm.FALSE" = "$\\beta_4$ In JoinMastodon (Both False)"
  )
)
```
We find a strong preference for accounts to move from large servers to smaller servers.
```{python}
#| eval: false
#| include: false
import random
def simulate_account_moves(origin: str, servers: dict, n: int):
    """Simulate `n` random moves away from `origin`.

    Destinations are drawn with replacement from every key of `servers`
    except `origin`, weighted by the corresponding `servers` values.

    Args:
        origin: Server the simulated accounts move away from.
        servers: Mapping of server name to sampling weight.
        n: Number of simulated moves.

    Returns:
        A `pl.DataFrame` with columns `simulation` (0..n-1), `server`
        (always `origin`) and `moved_server` (the sampled destination).
    """
    # Every known server other than the origin is a candidate destination.
    other_servers = set(servers.keys()) - {origin}
    candidates = list(other_servers)
    # Weights are listed in the same order as `candidates`.
    candidate_weights = [servers[name] for name in candidates]
    destinations = random.choices(candidates, weights=candidate_weights, k=n)
    return pl.DataFrame({
        "simulation": list(range(n)),
        "server": [origin] * n,
        "moved_server": destinations,
    })
# Build the server -> count lookup once; it is identical for every origin, so
# rebuilding it per row of `maccounts` was loop-invariant work.
server_counts = {
    row["server"]: row["count"]
    for row in popular_servers.iter_rows(named=True)
}


def _attach_server_counts(moves):
    """Inner-join `popular_servers` onto both ends of each move.

    The `count` joined via `server` is renamed to `origin_count`; the `count`
    joined via `moved_server` is renamed to `target_count`. Rows whose origin
    or destination is missing from `popular_servers` are dropped by the inner
    joins.
    """
    return (
        moves.join(popular_servers, how="inner", on="server")
        .rename({"count": "origin_count"})
        .join(
            popular_servers.rename({"server": "moved_server"}),
            how="inner",
            on="moved_server",
        )
        .rename({"count": "target_count"})
    )


# 1000 simulated destinations for every row of `maccounts`.
simulations = pl.concat([
    simulate_account_moves(row["server"], server_counts, 1000)
    for row in maccounts.iter_rows(named=True)
])

# Observed and simulated moves, each annotated with origin/target counts.
m_counts = _attach_server_counts(maccounts)
sim_counts = _attach_server_counts(simulations)
```