```{r}
#| label: fig-active-accounts
#| eval: false
library(lme4)

activity <- arrow::read_feather(
  "data/scratch/activity.feather",
  col_select = c("server", "logins")
) %>%
  arrange(desc(logins)) %>%
  mutate(server_count = logins)

# Account-level data: an account is "active" if it survived >= 45 days.
account_data <- inner_join(accounts, activity, by = "server") %>%
  mutate(active = active_time >= 45)

# Server-level summary: proportion and count of active accounts per server.
a_data <- account_data %>%
  group_by(server) %>%
  summarize(
    active_prop = sum(active) / n(),
    active_count = sum(active),
    count = n()
  ) %>%
  inner_join(., activity, by = "server")

# Mixed-effects logistic regression: probability that an account stays
# active, as a function of (logged) server login count, with a per-server
# random intercept.
a_model <- glmer(
  active ~ log1p(logins) + (1 | server),
  data = account_data,
  family = binomial
)
# Alternative considered: betareg(active_prop ~ log10(count), data = a_data)

# Prediction grid on the log1p(logins) scale (matches the plot's x-axis).
logins_seq <- seq(
  min(log1p(account_data$logins)),
  max(log1p(account_data$logins)),
  length.out = 100
)
# BUG FIX: the model formula applies log1p() itself, so newdata must carry
# logins on the *raw* scale. Passing logins_seq directly double-transformed
# the predictor; expm1() undoes the log so predict() re-applies it once.
a_pred <- predict(
  a_model,
  newdata = data.frame(logins = expm1(logins_seq), server = factor(1)),
  type = "response",
  re.form = NA # population-level prediction; random effects ignored
)
pred_data <- data.frame(logins = logins_seq, active_prop = a_pred)

a_data %>%
  mutate(logins = log1p(logins)) %>%
  ggplot(aes(y = active_prop, x = logins)) +
  geom_point(alpha = 0.1) +
  geom_line(
    data = pred_data,
    aes(x = logins, y = active_prop),
    color = "red"
  ) +
  labs(
    y = "Active after 45 Days",
    x = "Accounts"
  ) +
  scale_x_continuous(labels = scales::comma) +
  theme_bw_small_labels()
```

```{r}
#| eval: false
library(fable)
library(tsibble)
library(lubridate)

# Weekly account-creation cohorts per server, as a tsibble keyed on server.
ad_time <- account_data |>
  mutate(created_at = yearweek(created_at)) |>
  group_by(server, created_at) |>
  summarize(count = n(), active = sum(active)) |>
  as_tsibble(key = "server", index = created_at)
```

```{r}
#| eval: false
# Seasonal logistic model of weekly active counts (annual Fourier terms).
fit <- ad_time |>
  model(
    logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
  )
```

```{r}
#| eval: false
ad_time |>
  filter(server == "mastodon.social") |>
  sample_n(100) |>
  autoplot(active)
```

```{r}
#| label: fig-account-activity-prop
#| fig-cap: "Account Activity Over Time"
#| fig-height: 4
#| eval: false
study_period <- 45
last_day <- "2024-01-15"

# Server metadata, binned by order of magnitude of user count
# (bins clamped to the range [2, 4]).
metadata <- arrow::read_feather(
  "data/scratch/metadata.feather",
  col_select = c("server", "user_count")
) %>%
  arrange(desc(user_count)) %>%
  mutate(server_count = user_count) %>%
  mutate(server_count_bin = floor(log10(server_count))) %>%
  mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
  mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))

activity <- arrow::read_feather(
  "data/scratch/activity.feather",
  col_select = c("server", "logins")
) %>%
  arrange(desc(logins)) %>%
  mutate(server_count = logins) %>%
  mutate(server_count_bin = floor(log10(server_count))) %>%
  # Merge bins 4 and 5
  mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
  # Servers with zero recorded logins get their own bin (-1)
  mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))

jm <- arrow::read_feather("data/scratch/joinmastodon.feather")

# Study cohort: unlocked accounts created 2022-08-14 .. 2022-10-03 that have
# not moved servers, joined to server login activity.
a <- accounts %>%
  filter(!has_moved) %>%
  filter(locked == FALSE) %>%
  anti_join(., moved_to, by = c("username" = "moved_acct", "server" = "moved_server")) %>%
  filter(created_at >= "2022-08-14") %>%
  filter(created_at < "2022-10-03") %>%
  inner_join(activity, by = "server") %>%
  filter(created_at < last_status_at) %>%
  mutate(active_time = as.integer(active_time)) %>%
  mutate(active_time_weeks = active_time) %>%
  # Survival coding: 1 = event (became inactive), 0 = censored (still active).
  mutate(status = ifelse(active, 0, 1)) %>%
  mutate(jm = server %in% jm$domain) %>%
  # BUG FIX: "follows someone" means following_count > 0 (was `== 0`,
  # the inverse; compare has_a_follower below).
  mutate(follows_someone = following_count > 0) %>%
  mutate(has_a_follower = followers_count > 0)

# Kaplan-Meier curves stratified by server-size bin.
survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>%
  ggsurvfit() +
  add_confidence_interval() +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    y = "Overall survival probability",
    x = "Time (days)",
  ) +
  theme_bw_small_labels() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r}
#| eval: false
library(coxme)

# Accounts on JoinMastodon-listed servers other than mastodon.social,
# annotated with server category / language / size covariates.
# rowwise() is needed because `categories` is a list-column.
sel_a <- a %>%
  mutate(is_ms = server == "mastodon.social") %>%
  filter(!is_ms) %>%
  filter(jm) %>%
  inner_join(., jm, by = c("server" = "domain")) %>%
  rowwise() %>%
  mutate(is_regional = "regional" %in% categories) %>%
  mutate(is_general = ("general" %in% categories)) %>%
  # BUG FIX: "neither" means not regional AND not general
  # (was `!(is_regional | is_regional)`, a copy-paste error that
  # ignored is_general entirely).
  mutate(is_neither = !(is_regional | is_general)) %>%
  mutate(is_en = language == "en") %>%
  mutate(n_categories = length(categories) - is_regional - is_general) %>%
  mutate(many_categories = n_categories > 0) %>%
  mutate(is_large = last_week_users >= 585) %>%
  filter(server_count > 1) %>%
  ungroup()

# Attach each server's cohort size (number of study accounts on it).
sel_a <- inner_join(
  sel_a,
  (sel_a %>% group_by(server) %>% summarize(cohort_size = n())),
  by = "server"
)

# Cox proportional-hazards model of time to inactivity.
cx <- coxph(
  Surv(active_time_weeks, status) ~ many_categories + is_general + is_regional +
    is_general:log1p(cohort_size),
  data = sel_a,
  x = TRUE
)
# Check the proportional-hazards assumption (Schoenfeld residuals).
cz <- cox.zph(cx)
cx
cz
```

```{r}
#| eval: false
options(rf.cores = 2, mc.cores = 2)

for_data <- sel_a %>%
  filter(!is_general) %>%
  mutate(rn = row_number())

# 60/40 train/test split, reproducible via fixed seed.
set.seed(123)
data_test <- for_data %>% slice_sample(n = floor(0.4 * nrow(for_data)))
data_train <- for_data %>% slice(-pull(data_test, rn))

# Fast random survival forest on the training split.
obj <- rfsrc.fast(
  Surv(active_time_weeks, status) ~ server_count,
  data = data_train,
  ntree = 100,
  forest = TRUE
)
pred <- predict(obj, data_test)
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)

# Full random survival forest with variable importance, fit on all data.
obj <- rfsrc(
  Surv(active_time_weeks, status) ~ is_neither + is_general * server_count,
  data = for_data,
  ntree = 50,
  forest = TRUE,
  importance = TRUE
)
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)
plot.subsample(reg.smp.o)
```