219 lines
7.2 KiB
Plaintext
219 lines
7.2 KiB
Plaintext
|
|
```{r}
|
|
#| label: fig-active-accounts
|
|
#| eval: false
|
|
#library(betareg)
|
|
library(lme4)
|
|
activity <- arrow::read_feather(
|
|
"data/scratch/activity.feather",
|
|
col_select = c("server", "logins")
|
|
) %>%
|
|
arrange(desc(logins)) %>%
|
|
mutate(server_count = logins)
|
|
|
|
account_data <- inner_join(accounts, activity, by="server") %>%
|
|
mutate(active = active_time >= 45)
|
|
|
|
a_data <- account_data %>%
|
|
#mutate(active = active_time >= 45) %>%
|
|
group_by(server) %>%
|
|
summarize(active_prop = sum(active)/n(), active_count = sum(active), count=n()) %>%
|
|
inner_join(., activity, by="server")
|
|
|
|
a_model <- glmer(active ~ log1p(logins) + (1|server), data=account_data, family=binomial)
|
|
#betareg(active_prop ~ log10(count), data = a_data)
|
|
|
|
logins_seq <- seq(min(log1p(account_data$logins)), max(log1p(account_data$logins)), length.out = 100)
|
|
a_pred <- predict(
|
|
a_model,
|
|
newdata = data.frame(logins = logins_seq, server = factor(1)),
|
|
type = "response",
|
|
re.form = NA)
|
|
pred_data <- data.frame(logins = logins_seq, active_prop = a_pred)
|
|
|
|
a_data %>%
|
|
mutate(logins = log1p(logins)) %>%
|
|
ggplot(aes(y=active_prop, x=logins)) +
|
|
geom_point(alpha=0.1) +
|
|
# help here
|
|
#geom_line(aes(y = a_pred)) +
|
|
geom_line(data = pred_data, aes(x = logins, y = active_prop), color = "red") + # Use pred_data for line
|
|
labs(
|
|
y = "Active after 45 Days",
|
|
x = "Accounts"
|
|
) +
|
|
scale_x_continuous(labels = scales::comma) +
|
|
#scale_y_log10() +
|
|
theme_bw_small_labels()
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
library(fable)
|
|
#library(fable.binary)
|
|
library(tsibble)
|
|
library(lubridate)
|
|
|
|
ad_time <- account_data |>
|
|
mutate(created_at = yearweek(created_at)) |>
|
|
group_by(server, created_at) |>
|
|
summarize(count = n(), active = sum(active)) |>
|
|
as_tsibble(key="server", index=created_at)
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
fit <- ad_time |>
|
|
model(
|
|
logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
ad_time |>
|
|
filter(server == "mastodon.social") |>
|
|
sample_n(100) |>
|
|
autoplot(active)
|
|
```
|
|
|
|
|
|
```{r}
|
|
#| label: fig-account-activity-prop
|
|
#| fig-cap: "Account Activity Over Time"
|
|
#| fig-height: 4
|
|
#| eval: false
|
|
study_period <- 45
|
|
last_day <- "2024-01-15"
|
|
#formerly accounts_processed_recent
|
|
#server_counts <- arrow::read_feather(
|
|
# "data/scratch/accounts.feather",
|
|
# col_select=c("server", "username", "created_at", "bot")
|
|
# ) %>%
|
|
# filter(created_at <= "2023-03-01") %>%
|
|
# filter(!bot) %>%
|
|
# group_by(server) %>%
|
|
# summarize(server_count = n()) %>%
|
|
# arrange(desc(server_count)) %>%
|
|
# mutate(server_count_bin = floor(log10(server_count)))
|
|
|
|
metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
|
|
arrange(desc(user_count)) %>%
|
|
mutate(server_count = user_count) %>%
|
|
mutate(server_count_bin = floor(log10(server_count))) %>%
|
|
mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
|
|
mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))
|
|
|
|
activity <- arrow::read_feather(
|
|
"data/scratch/activity.feather",
|
|
col_select = c("server", "logins")
|
|
) %>%
|
|
arrange(desc(logins)) %>%
|
|
mutate(server_count = logins) %>%
|
|
mutate(server_count_bin = floor(log10(server_count))) %>%
|
|
# Merge 4 and 5
|
|
mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
|
|
# Merge below 2
|
|
#mutate(server_count_bin = ifelse((server_count_bin <= 2) & (server_count_bin >= 1), 2, server_count_bin)) %>%
|
|
mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))
|
|
|
|
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
|
|
|
|
a <- accounts %>%
|
|
filter(!has_moved) %>%
|
|
filter(locked == FALSE) %>%
|
|
anti_join(., moved_to, by=c("username"="moved_acct", "server"="moved_server")) %>%
|
|
filter(created_at >= "2022-08-14") %>%
|
|
filter(created_at < "2022-10-03") %>%
|
|
#filter(created_at >= "2023-10-15") %>%
|
|
#filter(created_at < "2023-11-15") %>%
|
|
inner_join(activity, by="server") %>%
|
|
filter(created_at < last_status_at) %>%
|
|
#mutate(large_server = server_count > 1000) %>%
|
|
mutate(active_time = as.integer(active_time)) %>%
|
|
mutate(active_time_weeks = active_time) %>%
|
|
mutate(status = ifelse(active, 0, 1)) %>%
|
|
mutate(jm = server %in% jm$domain) %>%
|
|
mutate(follows_someone = following_count == 0) %>%
|
|
mutate(has_a_follower = followers_count > 0)
|
|
#filter(server_count > 0)
|
|
|
|
|
|
survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>% # strata(server_count_bin)
|
|
ggsurvfit() +
|
|
add_confidence_interval() +
|
|
scale_y_continuous(limits = c(0, 1)) +
|
|
labs(
|
|
y = "Overall survival probability",
|
|
x = "Time (days)",
|
|
) +
|
|
#scale_x_continuous(
|
|
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
|
|
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
|
|
#) +
|
|
theme_bw_small_labels() +
|
|
theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
library(coxme)
|
|
|
|
sel_a <- a %>%
|
|
#filter(server != "mastodon.social") %>%
|
|
mutate(is_ms = server == "mastodon.social") %>%
|
|
filter(!is_ms) %>%
|
|
filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
|
|
#mutate(is_general = category=="general") %>%
|
|
rowwise() %>%
|
|
mutate(is_regional = "regional" %in% categories) %>%
|
|
mutate(is_general = ("general" %in% categories)) %>%
|
|
mutate(is_neither = !(is_regional | is_regional)) %>%
|
|
mutate(is_en = language == "en") %>%
|
|
mutate(n_categories = length(categories) - is_regional - is_general) %>%
|
|
mutate(many_categories = n_categories > 0) %>%
|
|
mutate(is_large = last_week_users >= 585) %>%
|
|
#mutate(follows_someone = followers_count > 0) %>% filter(server_count > 1) %>%
|
|
#filter(followers_count < 250) %>%
|
|
ungroup
|
|
|
|
sel_a <-
|
|
inner_join(sel_a, (sel_a %>% group_by(server) %>% summarize(cohort_size=n())), by="server")
|
|
#cx <- coxme(Surv(active_time_weeks, status) ~ is_large + is_general + approval_required + (1|server), data = sel_a, x=TRUE)
|
|
#cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general*is_regional + is_general:log1p(server_count), data = sel_a, x=TRUE)
|
|
#coxme(Surv(active_time_weeks, status) ~ is_regional + many_categories + is_general*log10(server_count) + (1|server), data = sel_a, x=TRUE)
|
|
|
|
# coxme(Surv(active_time_weeks, status) ~ many_categories + is_general + is_regional + is_general*log10(cohort_size) + (1|server), data = sel_a, x=TRUE)
|
|
|
|
cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general + is_regional + is_general:log1p(cohort_size), data = sel_a, x=TRUE) # log10(server_count)
|
|
cz <- cox.zph(cx)
|
|
#plot(cz)
|
|
cx
|
|
cz
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
options(rf.cores=2, mc.cores=2)
|
|
for_data <- sel_a %>%
|
|
filter(!is_general) %>%
|
|
mutate(rn=row_number())
|
|
|
|
set.seed(123)
|
|
|
|
data_test <- for_data %>% slice_sample(n = floor(0.4 * nrow(for_data)))
|
|
data_train <- for_data %>% slice(-pull(data_test,rn))
|
|
|
|
obj <- rfsrc.fast(Surv(active_time_weeks, status) ~ server_count, data = data_train, ntree=100, forest=TRUE)
|
|
|
|
pred <- predict(obj, data_test)
|
|
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
|
|
|
|
|
|
obj <- rfsrc(Surv(active_time_weeks, status) ~ is_neither + is_general*server_count, data = for_data, ntree=50, forest=TRUE, importance=TRUE)
|
|
#predictions <- predict(obj, newdata = newData)$predicted
|
|
#plot(get.tree(obj, 1))
|
|
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
|
|
plot.subsample(reg.smp.o)
|
|
```
|