junior-sheer/codebase/R/survival.R
Carl Colglazier 60023b07d1 Refactor scripts and code.
Split manuscripts into their own directories / projects.
2025-05-26 20:08:57 -05:00

99 lines
3.3 KiB
R

library(here)
library(survival)
library(tidyverse)
library(ggsurvfit)
library(coxme)
library(jsonlite)
source(here("codebase/R/helpers.R"))
options(arrow.skip_nul = TRUE)
active_period <- 91
a <- load_accounts() %>%
filter(!has_moved) %>%
filter(locked == FALSE) %>%
anti_join(., arrow::read_feather(here("data/scratch/individual_moved_accounts.feather")), by=c("username"="moved_acct", "server"="moved_server")) %>%
inner_join(arrow::read_feather(here("data/scratch/metadata.feather")), by="server") %>%
filter(created_at > "2023-04-30") %>%
filter(created_at <= "2023-06-30") %>%
filter(created_at < last_status_at) %>%
mutate(jm = server %in% as_tibble(jsonlite::fromJSON(here("data/joinmastodon-2023-08-25.json")))$domain) %>%
mutate(active_time = difftime(last_status_at, created_at, units="days")) %>%
mutate(active_time_censored = ifelse(active_time > active_period, active_period, active_time)) %>%
#mutate(active_time = difftime(last_status_at, created_at, units="days")) %>%
#0=alive, 1=dead.
mutate(status = ifelse(active_time > active_period, 0, 1))# %>% filter(followers_count > 0) %>% filter(following_count > 0)
a %>% select(status, created_at, last_status_at, active_time_censored)
activity <- arrow::read_feather(
here("data/scratch/activity.feather"),
col_select = c("server", "logins")
) %>%
arrange(desc(logins))
server_summary <- a %>%
group_by(server) %>%
summarize(cohort_size = n(), .groups = "drop")
general_servers <- c(
"mastodon.social",
"mstdn.social",
"mastodon.world",
"mas.to",
"mastodon.online",
"mastodonapp.uk",
"universeodon.com",
"masto.ai",
"c.im",
"social.vivaldi.net",
"mstdn.party",
"ohai.social"
) # > 100 cohort size + strength > .75
server_centrality <- arrow::read_ipc_file(here("data/scratch/server_centrality.feather")) %>%
rename(generality = strength)
sel_a <- a %>%
mutate(is_ms = server == "mastodon.social") %>%
mutate(general = server %in% general_servers) %>%
ungroup() %>%
inner_join(server_summary, by = "server") %>%
left_join(server_centrality, by = "server") %>%
mutate(small_server = user_count <= 100) %>%
mutate(large_server = user_count >= 1000) %>%
#filter(jm) %>%
mutate(group = as.integer(jm) + as.integer(general)) %>%
#mutate(huge_server = user_count >= 3162.28) %>%
#filter(statuses_count >= 10) %>%
filter(!noindex)# %>% filter(cohort_size >= 2) #%>% filter(user_count > 100)
cx <- sel_a %>% coxph(Surv(active_time_censored, status) ~ general, data = ., x=TRUE, robust = T, cluster=server)
cxme <- sel_a %>% coxme(Surv(active_time_censored, status) ~ factor(group) + small_server + (1|server), data = .)
cz <- cox.zph(cx)
plot_km <- sel_a %>%
mutate(id = paste(username, server, sep = "@")) %>%
survfit2(
Surv(active_time_censored, status) ~ general,
data = ., id = id,
cluster = server,
robust = TRUE
) %>%
ggsurvfit() +
add_confidence_interval() +
labs(
y = "Overall survival probability",
x = "Time (days)",
) +
scale_fill_discrete(name = "Group", labels = c("Other JoinMastodon", "General")) +
scale_color_discrete(name = "Group", labels = c("Other JoinMastodon", "General")) +
theme_bw_small_labels() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="bottom")
# logistic regression on generality
logit <- sel_a %>%
glm(active ~ generality, data = ., family = "binomial")