Iteration on random forest
This commit is contained in:
parent
38692cba68
commit
6a83cc2c24
164
index.qmd
164
index.qmd
@ -42,6 +42,8 @@ execute:
|
||||
```{r}
|
||||
#| label: r-setup
|
||||
#| output: false
|
||||
#| error: false
|
||||
#| warning: false
|
||||
library(reticulate)
|
||||
library(tidyverse)
|
||||
library(arrow)
|
||||
@ -199,7 +201,7 @@ all_accounts.filter(pl.col("host").eq(pl.col("server"))).unique(["account", "ser
|
||||
```{r}
|
||||
#| eval: false
|
||||
arrow::read_feather(
|
||||
"data/scratch/accounts.feather",
|
||||
"data/scratch/accounts_processed_profiles.feather",
|
||||
col_select = c(
|
||||
"server", "username", "created_at",
|
||||
"last_status_at", "statuses_count",
|
||||
@ -217,6 +219,7 @@ arrow::read_feather(
|
||||
filter(statuses_count > 1) %>%
|
||||
filter(!suspended) %>%
|
||||
filter(!has_moved) %>%
|
||||
filter(server == "mastodon.social") %>%
|
||||
#filter(last_status_at >= created_at) %>%
|
||||
mutate(created_month = format(created_at, "%Y-%m")) %>%
|
||||
group_by(created_month) %>%
|
||||
@ -230,19 +233,26 @@ arrow::read_feather(
|
||||
|
||||
```{r}
|
||||
#| label: fig-account-timeline
|
||||
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the annoucement day of the Elon Musk Twitter acquisition, the acquisition closing day, and a day when Twitter experienced an outage and started rate limiting accounts."
|
||||
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the annoucement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day where Twitter suspended a number of prominent journalist, and a day when Twitter experienced an outage and started rate limiting accounts."
|
||||
#| fig-height: 3
|
||||
#| fig-width: 6.75
|
||||
accounts_unfilt <- arrow::read_feather("data/scratch/all_accounts.feather", col_select=c("server", "username", "created_at", "last_status_at", "statuses_count", "has_moved", "bot", "suspended", "host")) %>%
|
||||
filter(server == host)
|
||||
|
||||
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
|
||||
accounts_unfilt <- arrow::read_feather(
|
||||
"data/scratch/all_accounts.feather",
|
||||
col_select=c(
|
||||
"server", "username", "created_at", "last_status_at",
|
||||
"statuses_count", "has_moved", "bot", "suspended",
|
||||
"following_count", "followers_count"
|
||||
))
|
||||
accounts <- accounts_unfilt %>%
|
||||
filter(!bot) %>%
|
||||
# TODO: what's going on here?
|
||||
filter(!is.na(last_status_at)) %>%
|
||||
mutate(suspended = replace_na(suspended, FALSE)) %>%
|
||||
# sanity check
|
||||
filter(created_at >= "2021-01-01") %>%
|
||||
filter(created_at < "2023-08-01") %>%
|
||||
filter(created_at >= "2020-10-01") %>%
|
||||
filter(created_at < "2024-01-01") %>%
|
||||
# We don't want accounts that were created and then immediately stopped being active
|
||||
filter(statuses_count >= 1) %>%
|
||||
filter(last_status_at >= created_at) %>%
|
||||
@ -254,14 +264,20 @@ acc_data <- accounts %>%
|
||||
#filter(!has_moved) %>%
|
||||
mutate(created_month = format(created_at, "%Y-%m")) %>%
|
||||
mutate(created_week = floor_date(created_at, unit = "week")) %>%
|
||||
mutate(active_now = active) %>%
|
||||
mutate(active = active_time >= 45) %>%
|
||||
mutate("Is mastodon.social" = server == "mastodon.social") %>%
|
||||
mutate(jm = server %in% jm$domain) %>%
|
||||
group_by(created_week) %>%
|
||||
summarize(
|
||||
`JoinMastodon Server` = sum(jm) / n(),
|
||||
`Is mastodon.social` = sum(`Is mastodon.social`)/n(),
|
||||
Suspended = sum(suspended)/n(),
|
||||
Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
|
||||
active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
|
||||
Moved=sum(has_moved)/n(),
|
||||
count=n()) %>%
|
||||
pivot_longer(cols=c("Active", "Moved"), names_to="Measure", values_to="value") # "Suspended"
|
||||
pivot_longer(cols=c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"
|
||||
theme_bw_small_labels <- function(base_size = 9) {
|
||||
theme_bw(base_size = base_size) %+replace%
|
||||
theme(
|
||||
@ -274,6 +290,7 @@ theme_bw_small_labels <- function(base_size = 9) {
|
||||
legend.text = element_text(size = base_size * 0.8)
|
||||
)
|
||||
}
|
||||
|
||||
p1 <- acc_data %>%
|
||||
ggplot(aes(x=as.Date(created_week), group=1)) +
|
||||
geom_line(aes(y=value, group=Measure, color=Measure)) +
|
||||
@ -298,6 +315,9 @@ p2 <- acc_data %>%
|
||||
aes(xintercept = as.numeric(as.Date("2022-04-14"))),
|
||||
linetype="dashed", color = "black") +
|
||||
# https://twitter.com/elonmusk/status/1675187969420828672
|
||||
geom_vline(
|
||||
aes(xintercept = as.numeric(as.Date("2022-12-15"))),
|
||||
linetype="dashed", color = "black") +
|
||||
geom_vline(
|
||||
aes(xintercept = as.numeric(as.Date("2023-07-01"))),
|
||||
linetype="dashed", color = "black") +
|
||||
@ -320,6 +340,7 @@ p1 + p2 + plot_layout(ncol = 1)
|
||||
|
||||
```{r}
|
||||
#| label: fig-active-accounts
|
||||
#| eval: false
|
||||
#library(betareg)
|
||||
library(lme4)
|
||||
activity <- arrow::read_feather(
|
||||
@ -364,15 +385,23 @@ a_data %>%
|
||||
#scale_y_log10() +
|
||||
theme_bw_small_labels()
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: false
|
||||
library(fable)
|
||||
library(fable.binary)
|
||||
#library(fable.binary)
|
||||
library(tsibble)
|
||||
library(lubridate)
|
||||
|
||||
ad_time <- account_data |>
|
||||
as_tsibble(key=c("username", "server"), index=created_at)
|
||||
mutate(created_at = yearweek(created_at)) |>
|
||||
group_by(server, created_at) |>
|
||||
summarize(count = n(), active = sum(active)) |>
|
||||
as_tsibble(key="server", index=created_at)
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: false
|
||||
fit <- ad_time |>
|
||||
model(
|
||||
logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
|
||||
@ -380,7 +409,10 @@ fit <- ad_time |>
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: false
|
||||
ad_time |>
|
||||
filter(server == "mastodon.social") |>
|
||||
sample_n(100) |>
|
||||
autoplot(active)
|
||||
```
|
||||
|
||||
@ -391,24 +423,25 @@ ad_time |>
|
||||
#| fig-height: 4
|
||||
#| eval: false
|
||||
study_period <- 45
|
||||
last_day <- "2024-01-15"
|
||||
#formerly accounts_processed_recent
|
||||
|
||||
|
||||
server_counts <- arrow::read_feather(
|
||||
"data/scratch/accounts.feather",
|
||||
col_select=c("server", "username", "created_at", "bot")
|
||||
) %>%
|
||||
filter(created_at <= "2023-03-01") %>%
|
||||
filter(!bot) %>%
|
||||
group_by(server) %>%
|
||||
summarize(server_count = n()) %>%
|
||||
arrange(desc(server_count)) %>%
|
||||
mutate(server_count_bin = floor(log10(server_count)))
|
||||
#server_counts <- arrow::read_feather(
|
||||
# "data/scratch/accounts.feather",
|
||||
# col_select=c("server", "username", "created_at", "bot")
|
||||
# ) %>%
|
||||
# filter(created_at <= "2023-03-01") %>%
|
||||
# filter(!bot) %>%
|
||||
# group_by(server) %>%
|
||||
# summarize(server_count = n()) %>%
|
||||
# arrange(desc(server_count)) %>%
|
||||
# mutate(server_count_bin = floor(log10(server_count)))
|
||||
|
||||
metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
|
||||
arrange(desc(user_count)) %>%
|
||||
mutate(server_count = user_count) %>%
|
||||
mutate(server_count_bin = floor(log10(server_count)))
|
||||
mutate(server_count_bin = floor(log10(server_count))) %>%
|
||||
mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
|
||||
mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))
|
||||
|
||||
activity <- arrow::read_feather(
|
||||
"data/scratch/activity.feather",
|
||||
@ -418,23 +451,29 @@ activity <- arrow::read_feather(
|
||||
mutate(server_count = logins) %>%
|
||||
mutate(server_count_bin = floor(log10(server_count))) %>%
|
||||
# Merge 4 and 5
|
||||
mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin))# %>%
|
||||
# Merge 2 and 3
|
||||
#mutate(server_count_bin = ifelse(server_count_bin == 3, 2, server_count_bin))
|
||||
#mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
|
||||
# Merge below 2
|
||||
#mutate(server_count_bin = ifelse((server_count_bin <= 2) & (server_count_bin >= 1), 2, server_count_bin)) %>%
|
||||
mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))
|
||||
|
||||
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
|
||||
|
||||
a <- accounts %>%
|
||||
filter(!has_moved) %>%
|
||||
#filter(created_at >= "2023-06-01") %>%
|
||||
#filter(created_at < "2023-08-01") %>%
|
||||
filter(created_at >= "2023-10-15") %>%
|
||||
filter(created_at < "2023-12-01") %>%
|
||||
inner_join(activity, by="server") %>%
|
||||
mutate(large_server = server_count > 1000) %>%
|
||||
filter(created_at < last_status_at) %>%
|
||||
#mutate(large_server = server_count > 1000) %>%
|
||||
mutate(active_time = as.integer(active_time)) %>%
|
||||
mutate(active_time_weeks = active_time) %>%
|
||||
mutate(status = ifelse(active, 0, 1)) %>%
|
||||
mutate(jm = server %in% jm$domain) %>%
|
||||
filter(server_count > 0)
|
||||
mutate(jm = server %in% jm$domain) #%>% filter(server_count > 0)
|
||||
|
||||
|
||||
survfit2(Surv(active_time_weeks, status) ~ server_count_bin, data = a) %>%
|
||||
survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>% # strata(server_count_bin)
|
||||
ggsurvfit() +
|
||||
add_confidence_interval() +
|
||||
scale_y_continuous(limits = c(0, 1)) +
|
||||
@ -442,29 +481,74 @@ survfit2(Surv(active_time_weeks, status) ~ server_count_bin, data = a) %>%
|
||||
y = "Overall survival probability",
|
||||
x = "Time (days)",
|
||||
) +
|
||||
scale_x_continuous(
|
||||
breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
|
||||
labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
|
||||
) +
|
||||
#scale_x_continuous(
|
||||
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
|
||||
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
|
||||
#) +
|
||||
theme_bw_small_labels() +
|
||||
theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
||||
```
|
||||
|
||||
```{r}
|
||||
a %>% filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
|
||||
mutate(is_general = category=="general") %>%
|
||||
mutate(is_en = language == "en") %>%
|
||||
mutate(is_large = last_week_users >= 585) %>% #filter(following_count < 10) %>%
|
||||
survfit2(Surv(active_time_weeks, status) ~ is_general + is_large, data = .) %>% # strata(server_count_bin)
|
||||
ggsurvfit(linetype_aes=TRUE, type = "survival") +
|
||||
add_confidence_interval() +
|
||||
scale_y_continuous(limits = c(0, 1)) +
|
||||
labs(
|
||||
y = "Overall survival probability",
|
||||
x = "Time (days)",
|
||||
) +
|
||||
#facet_wrap(~strata, nrow = 3) +
|
||||
#scale_x_continuous(
|
||||
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
|
||||
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
|
||||
#) +
|
||||
add_censor_mark() +
|
||||
theme_bw_small_labels() +
|
||||
theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
#| eval: false
|
||||
sel_a <- a %>%
|
||||
mutate(is_ms = server == "mastodon.social") %>%
|
||||
filter(jm)
|
||||
cx <- coxph(Surv(active_time_weeks, status) ~ log10(server_count), data = sel_a)
|
||||
library(coxme)
|
||||
|
||||
sel_a <- a %>%
|
||||
filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
|
||||
#mutate(is_general = category=="general") %>%
|
||||
rowwise() %>%
|
||||
mutate(is_regional = "regional" %in% categories) %>%
|
||||
mutate(is_general = "general" %in% categories) %>%
|
||||
mutate(is_neither = !(is_regional | is_regional)) %>%
|
||||
mutate(is_en = language == "en") %>%
|
||||
rowwise() %>%
|
||||
mutate(n_categories = length(categories) - is_regional - is_general) %>%
|
||||
mutate(many_categories = n_categories > 0) %>%
|
||||
mutate(is_large = last_week_users >= 585) %>%
|
||||
mutate(follows_someone = followers_count > 0) %>% filter(server_count > 0) %>%
|
||||
ungroup
|
||||
#cx <- coxme(Surv(active_time_weeks, status) ~ is_large + is_general + approval_required + (1|server), data = sel_a, x=TRUE)
|
||||
cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general*is_regional + is_general:log1p(server_count), data = sel_a, x=TRUE)
|
||||
coxme(Surv(active_time_weeks, status) ~ is_neither + is_general:log1p(server_count) + (1|server), data = sel_a, x=TRUE)
|
||||
cx <- coxph(Surv(active_time_weeks, status) ~ is_neither + many_categories + is_general:log10(server_count), data = sel_a, x=TRUE)
|
||||
cz <- cox.zph(cx)
|
||||
#plot(cz)
|
||||
cx
|
||||
cz
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: false
|
||||
obj <- rfsrc(Surv(active_time_weeks, status) ~ server_count_bin + jm, data = (a %>% sample_n(1000)), ntree=5000)
|
||||
plot(get.tree(obj, 0))
|
||||
options(rf.cores=2, mc.cores=2)
|
||||
for_data <- sel_a #%>% slice_sample(n=2500)
|
||||
obj <- rfsrc.fast(Surv(active_time_weeks, status) ~ is_neither + is_general*server_count, data = for_data, ntree=100, forest=TRUE)
|
||||
#predictions <- predict(obj, newdata = newData)$predicted
|
||||
#plot(get.tree(obj, 1))
|
||||
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
|
||||
plot.subsample(reg.smp.o)
|
||||
```
|
||||
|
||||
## Moved Accounts
|
||||
|
Loading…
Reference in New Issue
Block a user