Iteration on random forest

This commit is contained in:
Carl Colglazier 2024-02-07 11:41:14 -05:00
parent 38692cba68
commit 6a83cc2c24

164
index.qmd
View File

@ -42,6 +42,8 @@ execute:
```{r}
#| label: r-setup
#| output: false
#| error: false
#| warning: false
library(reticulate)
library(tidyverse)
library(arrow)
@ -199,7 +201,7 @@ all_accounts.filter(pl.col("host").eq(pl.col("server"))).unique(["account", "ser
```{r}
#| eval: false
arrow::read_feather(
"data/scratch/accounts.feather",
"data/scratch/accounts_processed_profiles.feather",
col_select = c(
"server", "username", "created_at",
"last_status_at", "statuses_count",
@ -217,6 +219,7 @@ arrow::read_feather(
filter(statuses_count > 1) %>%
filter(!suspended) %>%
filter(!has_moved) %>%
filter(server == "mastodon.social") %>%
#filter(last_status_at >= created_at) %>%
mutate(created_month = format(created_at, "%Y-%m")) %>%
group_by(created_month) %>%
@ -230,19 +233,26 @@ arrow::read_feather(
```{r}
#| label: fig-account-timeline
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels show the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the announcement day of the Elon Musk Twitter acquisition, the acquisition closing day, and a day when Twitter experienced an outage and started rate limiting accounts."
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels show the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the announcement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day when Twitter suspended a number of prominent journalists, and a day when Twitter experienced an outage and started rate limiting accounts."
#| fig-height: 3
#| fig-width: 6.75
accounts_unfilt <- arrow::read_feather("data/scratch/all_accounts.feather", col_select=c("server", "username", "created_at", "last_status_at", "statuses_count", "has_moved", "bot", "suspended", "host")) %>%
filter(server == host)
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
accounts_unfilt <- arrow::read_feather(
"data/scratch/all_accounts.feather",
col_select=c(
"server", "username", "created_at", "last_status_at",
"statuses_count", "has_moved", "bot", "suspended",
"following_count", "followers_count"
))
accounts <- accounts_unfilt %>%
filter(!bot) %>%
# TODO: what's going on here?
filter(!is.na(last_status_at)) %>%
mutate(suspended = replace_na(suspended, FALSE)) %>%
# sanity check
filter(created_at >= "2021-01-01") %>%
filter(created_at < "2023-08-01") %>%
filter(created_at >= "2020-10-01") %>%
filter(created_at < "2024-01-01") %>%
# We don't want accounts that were created and then immediately stopped being active
filter(statuses_count >= 1) %>%
filter(last_status_at >= created_at) %>%
@ -254,14 +264,20 @@ acc_data <- accounts %>%
#filter(!has_moved) %>%
mutate(created_month = format(created_at, "%Y-%m")) %>%
mutate(created_week = floor_date(created_at, unit = "week")) %>%
mutate(active_now = active) %>%
mutate(active = active_time >= 45) %>%
mutate("Is mastodon.social" = server == "mastodon.social") %>%
mutate(jm = server %in% jm$domain) %>%
group_by(created_week) %>%
summarize(
`JoinMastodon Server` = sum(jm) / n(),
`Is mastodon.social` = sum(`Is mastodon.social`)/n(),
Suspended = sum(suspended)/n(),
Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
Moved=sum(has_moved)/n(),
count=n()) %>%
pivot_longer(cols=c("Active", "Moved"), names_to="Measure", values_to="value") # "Suspended"
pivot_longer(cols=c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"
theme_bw_small_labels <- function(base_size = 9) {
theme_bw(base_size = base_size) %+replace%
theme(
@ -274,6 +290,7 @@ theme_bw_small_labels <- function(base_size = 9) {
legend.text = element_text(size = base_size * 0.8)
)
}
p1 <- acc_data %>%
ggplot(aes(x=as.Date(created_week), group=1)) +
geom_line(aes(y=value, group=Measure, color=Measure)) +
@ -298,6 +315,9 @@ p2 <- acc_data %>%
aes(xintercept = as.numeric(as.Date("2022-04-14"))),
linetype="dashed", color = "black") +
# https://twitter.com/elonmusk/status/1675187969420828672
geom_vline(
aes(xintercept = as.numeric(as.Date("2022-12-15"))),
linetype="dashed", color = "black") +
geom_vline(
aes(xintercept = as.numeric(as.Date("2023-07-01"))),
linetype="dashed", color = "black") +
@ -320,6 +340,7 @@ p1 + p2 + plot_layout(ncol = 1)
```{r}
#| label: fig-active-accounts
#| eval: false
#library(betareg)
library(lme4)
activity <- arrow::read_feather(
@ -364,15 +385,23 @@ a_data %>%
#scale_y_log10() +
theme_bw_small_labels()
```
```{r}
#| eval: false
library(fable)
library(fable.binary)
#library(fable.binary)
library(tsibble)
library(lubridate)
ad_time <- account_data |>
as_tsibble(key=c("username", "server"), index=created_at)
mutate(created_at = yearweek(created_at)) |>
group_by(server, created_at) |>
summarize(count = n(), active = sum(active)) |>
as_tsibble(key="server", index=created_at)
```
```{r}
#| eval: false
fit <- ad_time |>
model(
logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
@ -380,7 +409,10 @@ fit <- ad_time |>
```
```{r}
#| eval: false
ad_time |>
filter(server == "mastodon.social") |>
sample_n(100) |>
autoplot(active)
```
@ -391,24 +423,25 @@ ad_time |>
#| fig-height: 4
#| eval: false
study_period <- 45
last_day <- "2024-01-15"
#formerly accounts_processed_recent
server_counts <- arrow::read_feather(
"data/scratch/accounts.feather",
col_select=c("server", "username", "created_at", "bot")
) %>%
filter(created_at <= "2023-03-01") %>%
filter(!bot) %>%
group_by(server) %>%
summarize(server_count = n()) %>%
arrange(desc(server_count)) %>%
mutate(server_count_bin = floor(log10(server_count)))
#server_counts <- arrow::read_feather(
# "data/scratch/accounts.feather",
# col_select=c("server", "username", "created_at", "bot")
# ) %>%
# filter(created_at <= "2023-03-01") %>%
# filter(!bot) %>%
# group_by(server) %>%
# summarize(server_count = n()) %>%
# arrange(desc(server_count)) %>%
# mutate(server_count_bin = floor(log10(server_count)))
metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
arrange(desc(user_count)) %>%
mutate(server_count = user_count) %>%
mutate(server_count_bin = floor(log10(server_count)))
mutate(server_count_bin = floor(log10(server_count))) %>%
mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))
activity <- arrow::read_feather(
"data/scratch/activity.feather",
@ -418,23 +451,29 @@ activity <- arrow::read_feather(
mutate(server_count = logins) %>%
mutate(server_count_bin = floor(log10(server_count))) %>%
# Merge 4 and 5
mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin))# %>%
# Merge 2 and 3
#mutate(server_count_bin = ifelse(server_count_bin == 3, 2, server_count_bin))
#mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
# Merge below 2
#mutate(server_count_bin = ifelse((server_count_bin <= 2) & (server_count_bin >= 1), 2, server_count_bin)) %>%
mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
a <- accounts %>%
filter(!has_moved) %>%
#filter(created_at >= "2023-06-01") %>%
#filter(created_at < "2023-08-01") %>%
filter(created_at >= "2023-10-15") %>%
filter(created_at < "2023-12-01") %>%
inner_join(activity, by="server") %>%
mutate(large_server = server_count > 1000) %>%
filter(created_at < last_status_at) %>%
#mutate(large_server = server_count > 1000) %>%
mutate(active_time = as.integer(active_time)) %>%
mutate(active_time_weeks = active_time) %>%
mutate(status = ifelse(active, 0, 1)) %>%
mutate(jm = server %in% jm$domain) %>%
filter(server_count > 0)
mutate(jm = server %in% jm$domain) #%>% filter(server_count > 0)
survfit2(Surv(active_time_weeks, status) ~ server_count_bin, data = a) %>%
survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>% # strata(server_count_bin)
ggsurvfit() +
add_confidence_interval() +
scale_y_continuous(limits = c(0, 1)) +
@ -442,29 +481,74 @@ survfit2(Surv(active_time_weeks, status) ~ server_count_bin, data = a) %>%
y = "Overall survival probability",
x = "Time (days)",
) +
scale_x_continuous(
breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
) +
#scale_x_continuous(
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
#) +
theme_bw_small_labels() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
```{r}
a %>% filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
mutate(is_general = category=="general") %>%
mutate(is_en = language == "en") %>%
mutate(is_large = last_week_users >= 585) %>% #filter(following_count < 10) %>%
survfit2(Surv(active_time_weeks, status) ~ is_general + is_large, data = .) %>% # strata(server_count_bin)
ggsurvfit(linetype_aes=TRUE, type = "survival") +
add_confidence_interval() +
scale_y_continuous(limits = c(0, 1)) +
labs(
y = "Overall survival probability",
x = "Time (days)",
) +
#facet_wrap(~strata, nrow = 3) +
#scale_x_continuous(
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
#) +
add_censor_mark() +
theme_bw_small_labels() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
```{r}
#| eval: false
sel_a <- a %>%
mutate(is_ms = server == "mastodon.social") %>%
filter(jm)
cx <- coxph(Surv(active_time_weeks, status) ~ log10(server_count), data = sel_a)
library(coxme)
sel_a <- a %>%
filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
#mutate(is_general = category=="general") %>%
rowwise() %>%
mutate(is_regional = "regional" %in% categories) %>%
mutate(is_general = "general" %in% categories) %>%
mutate(is_neither = !(is_regional | is_regional)) %>%
mutate(is_en = language == "en") %>%
rowwise() %>%
mutate(n_categories = length(categories) - is_regional - is_general) %>%
mutate(many_categories = n_categories > 0) %>%
mutate(is_large = last_week_users >= 585) %>%
mutate(follows_someone = followers_count > 0) %>% filter(server_count > 0) %>%
ungroup
#cx <- coxme(Surv(active_time_weeks, status) ~ is_large + is_general + approval_required + (1|server), data = sel_a, x=TRUE)
cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general*is_regional + is_general:log1p(server_count), data = sel_a, x=TRUE)
coxme(Surv(active_time_weeks, status) ~ is_neither + is_general:log1p(server_count) + (1|server), data = sel_a, x=TRUE)
cx <- coxph(Surv(active_time_weeks, status) ~ is_neither + many_categories + is_general:log10(server_count), data = sel_a, x=TRUE)
cz <- cox.zph(cx)
#plot(cz)
cx
cz
```
```{r}
#| eval: false
obj <- rfsrc(Surv(active_time_weeks, status) ~ server_count_bin + jm, data = (a %>% sample_n(1000)), ntree=5000)
plot(get.tree(obj, 0))
options(rf.cores=2, mc.cores=2)
for_data <- sel_a #%>% slice_sample(n=2500)
obj <- rfsrc.fast(Surv(active_time_weeks, status) ~ is_neither + is_general*server_count, data = for_data, ntree=100, forest=TRUE)
#predictions <- predict(obj, newdata = newData)$predicted
#plot(get.tree(obj, 1))
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
plot.subsample(reg.smp.o)
```
## Moved Accounts