junior-sheer/notebooks/archive/_server_survival.qmd
2024-02-27 14:08:11 -06:00

219 lines
7.2 KiB
Plaintext

```{r}
#| label: fig-active-accounts
#| eval: false
#library(betareg)
library(lme4)
# Per-server login counts from the activity snapshot.
activity <- arrow::read_feather(
  "data/scratch/activity.feather",
  col_select = c("server", "logins")
) %>%
  arrange(desc(logins)) %>%
  mutate(server_count = logins)
# Account-level flag: still active 45 days after creation.
account_data <- inner_join(accounts, activity, by="server") %>%
  mutate(active = active_time >= 45)
# Per-server summary used for the scatter layer.
a_data <- account_data %>%
  group_by(server) %>%
  summarize(active_prop = sum(active)/n(), active_count = sum(active), count=n()) %>%
  inner_join(., activity, by="server")
# Mixed-effects logistic regression: P(active at 45 days) vs. server login
# volume, with a random intercept per server.
a_model <- glmer(active ~ log1p(logins) + (1|server), data=account_data, family=binomial)
# BUG FIX: the prediction grid was previously built on the log1p scale and
# then fed back through the model's log1p() term, double-transforming it
# (log1p(log1p(x))). Build the grid on the raw login scale and only apply
# log1p for the plot's x coordinate.
logins_grid <- seq(min(account_data$logins), max(account_data$logins), length.out = 100)
a_pred <- predict(
  a_model,
  newdata = data.frame(logins = logins_grid, server = factor(1)),
  type = "response",
  re.form = NA)  # population-level prediction: random effects excluded
pred_data <- data.frame(logins = log1p(logins_grid), active_prop = a_pred)
a_data %>%
  mutate(logins = log1p(logins)) %>%
  ggplot(aes(y=active_prop, x=logins)) +
  geom_point(alpha=0.1) +
  geom_line(data = pred_data, aes(x = logins, y = active_prop), color = "red") +
  labs(
    y = "Active after 45 Days",
    x = "Logins (log1p)"  # axis plots log1p(logins), not raw account counts
  ) +
  scale_x_continuous(labels = scales::comma) +
  theme_bw_small_labels()
```
```{r}
#| eval: false
library(fable)
#library(fable.binary)
library(tsibble)
library(lubridate)
# Weekly cohorts per server: number of accounts created in each ISO week
# and how many of them stayed active.
weekly_cohorts <- account_data |>
  mutate(created_at = yearweek(created_at)) |>
  group_by(server, created_at) |>
  summarise(count = n(), active = sum(active), .groups = "drop_last")
# One time series per server, indexed by creation week.
ad_time <- as_tsibble(weekly_cohorts, key = "server", index = created_at)
```
```{r}
#| eval: false
# Fit a logistic model of weekly active counts with annual (Fourier)
# seasonality, one model per server key in the tsibble.
fit <- model(
  ad_time,
  logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
)
```
```{r}
#| eval: false
# Quick visual check: 100 sampled weekly observations for the flagship
# instance, plotted as a time series of active counts.
ms_weeks <- filter(ad_time, server == "mastodon.social")
ms_weeks |>
  sample_n(100) |>
  autoplot(active)
```
```{r}
#| label: fig-account-activity-prop
#| fig-cap: "Account Activity Over Time"
#| fig-height: 4
#| eval: false
study_period <- 45
last_day <- "2024-01-15"
# Server-size bins from instance metadata: floor(log10(user_count)),
# clamped into [2, 4] so the smallest and largest servers are merged.
metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
  arrange(desc(user_count)) %>%
  mutate(server_count = user_count) %>%
  mutate(server_count_bin = floor(log10(server_count))) %>%
  mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
  mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))
# Activity-based bins: floor(log10(logins)); bin 5 merged into 4, and
# zero-login servers marked with the sentinel bin -1.
activity <- arrow::read_feather(
  "data/scratch/activity.feather",
  col_select = c("server", "logins")
) %>%
  arrange(desc(logins)) %>%
  mutate(server_count = logins) %>%
  mutate(server_count_bin = floor(log10(server_count))) %>%
  mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
  mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
# Cohort: accounts created in the 2022-08-14 .. 2022-10-03 wave that have
# not moved and are not locked, and that posted at least once after creation.
a <- accounts %>%
  filter(!has_moved) %>%
  filter(locked == FALSE) %>%
  anti_join(., moved_to, by=c("username"="moved_acct", "server"="moved_server")) %>%
  filter(created_at >= "2022-08-14") %>%
  filter(created_at < "2022-10-03") %>%
  inner_join(activity, by="server") %>%
  filter(created_at < last_status_at) %>%
  mutate(active_time = as.integer(active_time)) %>%
  # NOTE(review): despite the name, this appears to be measured in days
  # (the survival plot's x-axis is labelled "Time (days)") — confirm
  # before relying on the "weeks" suffix.
  mutate(active_time_weeks = active_time) %>%
  # Survival event indicator: 1 = became inactive (event), 0 = censored.
  mutate(status = ifelse(active, 0, 1)) %>%
  mutate(jm = server %in% jm$domain) %>%
  # BUG FIX: was `following_count == 0`, the opposite of the name
  # (compare has_a_follower below).
  mutate(follows_someone = following_count > 0) %>%
  mutate(has_a_follower = followers_count > 0)
# Kaplan-Meier survival curves stratified by server-size bin.
survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>%
  ggsurvfit() +
  add_confidence_interval() +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    y = "Overall survival probability",
    x = "Time (days)"  # BUG FIX: trailing comma removed (empty argument errors in R)
  ) +
  theme_bw_small_labels() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
```{r}
#| eval: false
library(coxme)
# Restrict to joinmastodon-listed servers other than mastodon.social and
# derive per-server category covariates. rowwise() is needed because
# `categories` is a list-column.
sel_a <- a %>%
  mutate(is_ms = server == "mastodon.social") %>%
  filter(!is_ms) %>%
  filter(jm) %>%
  inner_join(., jm, by=c("server"="domain")) %>%
  rowwise() %>%
  mutate(is_regional = "regional" %in% categories) %>%
  mutate(is_general = ("general" %in% categories)) %>%
  # BUG FIX: was !(is_regional | is_regional); the second disjunct should
  # be is_general (i.e. "neither regional nor general").
  mutate(is_neither = !(is_regional | is_general)) %>%
  mutate(is_en = language == "en") %>%
  # Count of categories beyond the general/regional pair.
  mutate(n_categories = length(categories) - is_regional - is_general) %>%
  mutate(many_categories = n_categories > 0) %>%
  mutate(is_large = last_week_users >= 585) %>%
  ungroup()
# Attach the size of each server's cohort within this sample.
sel_a <- inner_join(
  sel_a,
  sel_a %>% group_by(server) %>% summarize(cohort_size = n()),
  by = "server"
)
# Cox proportional hazards: category indicators plus a general-by-size
# interaction on the (log-scaled) cohort size.
cx <- coxph(
  Surv(active_time_weeks, status) ~ many_categories + is_general + is_regional +
    is_general:log1p(cohort_size),
  data = sel_a, x = TRUE
)
# Check the proportional-hazards assumption.
cz <- cox.zph(cx)
cx
cz
```
```{r}
#| eval: false
# Exploratory random survival forests (randomForestSRC).
# NOTE(review): the first obj/pred/reg.smp.o fit below is overwritten by
# the second rfsrc() fit, so the train/test split results are discarded —
# confirm whether the first fit is still wanted.
options(rf.cores=2, mc.cores=2)
# Non-"general" servers only; rn tags rows for the train/test split.
for_data <- sel_a %>%
filter(!is_general) %>%
mutate(rn=row_number())
set.seed(123)
# 40% held-out test sample; training set is the complement by row number.
data_test <- for_data %>% slice_sample(n = floor(0.4 * nrow(for_data)))
data_train <- for_data %>% slice(-pull(data_test,rn))
# Fast approximate forest on server size alone.
obj <- rfsrc.fast(Surv(active_time_weeks, status) ~ server_count, data = data_train, ntree=100, forest=TRUE)
pred <- predict(obj, data_test)
# Subsampling (B bootstrap draws) for variable-importance confidence intervals.
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
# Refit on the full cohort with a general-by-size interaction; overwrites obj.
obj <- rfsrc(Surv(active_time_weeks, status) ~ is_neither + is_general*server_count, data = for_data, ntree=50, forest=TRUE, importance=TRUE)
#predictions <- predict(obj, newdata = newData)$predicted
#plot(get.tree(obj, 1))
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
plot.subsample(reg.smp.o)
```