Iteration on random forest
This commit is contained in:
parent
38692cba68
commit
6a83cc2c24
164
index.qmd
164
index.qmd
@ -42,6 +42,8 @@ execute:
|
|||||||
```{r}
|
```{r}
|
||||||
#| label: r-setup
|
#| label: r-setup
|
||||||
#| output: false
|
#| output: false
|
||||||
|
#| error: false
|
||||||
|
#| warning: false
|
||||||
library(reticulate)
|
library(reticulate)
|
||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
library(arrow)
|
library(arrow)
|
||||||
@ -199,7 +201,7 @@ all_accounts.filter(pl.col("host").eq(pl.col("server"))).unique(["account", "ser
|
|||||||
```{r}
|
```{r}
|
||||||
#| eval: false
|
#| eval: false
|
||||||
arrow::read_feather(
|
arrow::read_feather(
|
||||||
"data/scratch/accounts.feather",
|
"data/scratch/accounts_processed_profiles.feather",
|
||||||
col_select = c(
|
col_select = c(
|
||||||
"server", "username", "created_at",
|
"server", "username", "created_at",
|
||||||
"last_status_at", "statuses_count",
|
"last_status_at", "statuses_count",
|
||||||
@ -217,6 +219,7 @@ arrow::read_feather(
|
|||||||
filter(statuses_count > 1) %>%
|
filter(statuses_count > 1) %>%
|
||||||
filter(!suspended) %>%
|
filter(!suspended) %>%
|
||||||
filter(!has_moved) %>%
|
filter(!has_moved) %>%
|
||||||
|
filter(server == "mastodon.social") %>%
|
||||||
#filter(last_status_at >= created_at) %>%
|
#filter(last_status_at >= created_at) %>%
|
||||||
mutate(created_month = format(created_at, "%Y-%m")) %>%
|
mutate(created_month = format(created_at, "%Y-%m")) %>%
|
||||||
group_by(created_month) %>%
|
group_by(created_month) %>%
|
||||||
@ -230,19 +233,26 @@ arrow::read_feather(
|
|||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
#| label: fig-account-timeline
|
#| label: fig-account-timeline
|
||||||
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the annoucement day of the Elon Musk Twitter acquisition, the acquisition closing day, and a day when Twitter experienced an outage and started rate limiting accounts."
|
#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the annoucement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day where Twitter suspended a number of prominent journalist, and a day when Twitter experienced an outage and started rate limiting accounts."
|
||||||
#| fig-height: 3
|
#| fig-height: 3
|
||||||
#| fig-width: 6.75
|
#| fig-width: 6.75
|
||||||
accounts_unfilt <- arrow::read_feather("data/scratch/all_accounts.feather", col_select=c("server", "username", "created_at", "last_status_at", "statuses_count", "has_moved", "bot", "suspended", "host")) %>%
|
|
||||||
filter(server == host)
|
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
|
||||||
|
accounts_unfilt <- arrow::read_feather(
|
||||||
|
"data/scratch/all_accounts.feather",
|
||||||
|
col_select=c(
|
||||||
|
"server", "username", "created_at", "last_status_at",
|
||||||
|
"statuses_count", "has_moved", "bot", "suspended",
|
||||||
|
"following_count", "followers_count"
|
||||||
|
))
|
||||||
accounts <- accounts_unfilt %>%
|
accounts <- accounts_unfilt %>%
|
||||||
filter(!bot) %>%
|
filter(!bot) %>%
|
||||||
# TODO: what's going on here?
|
# TODO: what's going on here?
|
||||||
filter(!is.na(last_status_at)) %>%
|
filter(!is.na(last_status_at)) %>%
|
||||||
mutate(suspended = replace_na(suspended, FALSE)) %>%
|
mutate(suspended = replace_na(suspended, FALSE)) %>%
|
||||||
# sanity check
|
# sanity check
|
||||||
filter(created_at >= "2021-01-01") %>%
|
filter(created_at >= "2020-10-01") %>%
|
||||||
filter(created_at < "2023-08-01") %>%
|
filter(created_at < "2024-01-01") %>%
|
||||||
# We don't want accounts that were created and then immediately stopped being active
|
# We don't want accounts that were created and then immediately stopped being active
|
||||||
filter(statuses_count >= 1) %>%
|
filter(statuses_count >= 1) %>%
|
||||||
filter(last_status_at >= created_at) %>%
|
filter(last_status_at >= created_at) %>%
|
||||||
@ -254,14 +264,20 @@ acc_data <- accounts %>%
|
|||||||
#filter(!has_moved) %>%
|
#filter(!has_moved) %>%
|
||||||
mutate(created_month = format(created_at, "%Y-%m")) %>%
|
mutate(created_month = format(created_at, "%Y-%m")) %>%
|
||||||
mutate(created_week = floor_date(created_at, unit = "week")) %>%
|
mutate(created_week = floor_date(created_at, unit = "week")) %>%
|
||||||
|
mutate(active_now = active) %>%
|
||||||
mutate(active = active_time >= 45) %>%
|
mutate(active = active_time >= 45) %>%
|
||||||
|
mutate("Is mastodon.social" = server == "mastodon.social") %>%
|
||||||
|
mutate(jm = server %in% jm$domain) %>%
|
||||||
group_by(created_week) %>%
|
group_by(created_week) %>%
|
||||||
summarize(
|
summarize(
|
||||||
|
`JoinMastodon Server` = sum(jm) / n(),
|
||||||
|
`Is mastodon.social` = sum(`Is mastodon.social`)/n(),
|
||||||
Suspended = sum(suspended)/n(),
|
Suspended = sum(suspended)/n(),
|
||||||
Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
|
Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
|
||||||
|
active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
|
||||||
Moved=sum(has_moved)/n(),
|
Moved=sum(has_moved)/n(),
|
||||||
count=n()) %>%
|
count=n()) %>%
|
||||||
pivot_longer(cols=c("Active", "Moved"), names_to="Measure", values_to="value") # "Suspended"
|
pivot_longer(cols=c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"
|
||||||
theme_bw_small_labels <- function(base_size = 9) {
|
theme_bw_small_labels <- function(base_size = 9) {
|
||||||
theme_bw(base_size = base_size) %+replace%
|
theme_bw(base_size = base_size) %+replace%
|
||||||
theme(
|
theme(
|
||||||
@ -274,6 +290,7 @@ theme_bw_small_labels <- function(base_size = 9) {
|
|||||||
legend.text = element_text(size = base_size * 0.8)
|
legend.text = element_text(size = base_size * 0.8)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
p1 <- acc_data %>%
|
p1 <- acc_data %>%
|
||||||
ggplot(aes(x=as.Date(created_week), group=1)) +
|
ggplot(aes(x=as.Date(created_week), group=1)) +
|
||||||
geom_line(aes(y=value, group=Measure, color=Measure)) +
|
geom_line(aes(y=value, group=Measure, color=Measure)) +
|
||||||
@ -298,6 +315,9 @@ p2 <- acc_data %>%
|
|||||||
aes(xintercept = as.numeric(as.Date("2022-04-14"))),
|
aes(xintercept = as.numeric(as.Date("2022-04-14"))),
|
||||||
linetype="dashed", color = "black") +
|
linetype="dashed", color = "black") +
|
||||||
# https://twitter.com/elonmusk/status/1675187969420828672
|
# https://twitter.com/elonmusk/status/1675187969420828672
|
||||||
|
geom_vline(
|
||||||
|
aes(xintercept = as.numeric(as.Date("2022-12-15"))),
|
||||||
|
linetype="dashed", color = "black") +
|
||||||
geom_vline(
|
geom_vline(
|
||||||
aes(xintercept = as.numeric(as.Date("2023-07-01"))),
|
aes(xintercept = as.numeric(as.Date("2023-07-01"))),
|
||||||
linetype="dashed", color = "black") +
|
linetype="dashed", color = "black") +
|
||||||
@ -320,6 +340,7 @@ p1 + p2 + plot_layout(ncol = 1)
|
|||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
#| label: fig-active-accounts
|
#| label: fig-active-accounts
|
||||||
|
#| eval: false
|
||||||
#library(betareg)
|
#library(betareg)
|
||||||
library(lme4)
|
library(lme4)
|
||||||
activity <- arrow::read_feather(
|
activity <- arrow::read_feather(
|
||||||
@ -364,15 +385,23 @@ a_data %>%
|
|||||||
#scale_y_log10() +
|
#scale_y_log10() +
|
||||||
theme_bw_small_labels()
|
theme_bw_small_labels()
|
||||||
```
|
```
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
|
#| eval: false
|
||||||
library(fable)
|
library(fable)
|
||||||
library(fable.binary)
|
#library(fable.binary)
|
||||||
|
library(tsibble)
|
||||||
|
library(lubridate)
|
||||||
|
|
||||||
ad_time <- account_data |>
|
ad_time <- account_data |>
|
||||||
as_tsibble(key=c("username", "server"), index=created_at)
|
mutate(created_at = yearweek(created_at)) |>
|
||||||
|
group_by(server, created_at) |>
|
||||||
|
summarize(count = n(), active = sum(active)) |>
|
||||||
|
as_tsibble(key="server", index=created_at)
|
||||||
```
|
```
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
|
#| eval: false
|
||||||
fit <- ad_time |>
|
fit <- ad_time |>
|
||||||
model(
|
model(
|
||||||
logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
|
logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
|
||||||
@ -380,7 +409,10 @@ fit <- ad_time |>
|
|||||||
```
|
```
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
|
#| eval: false
|
||||||
ad_time |>
|
ad_time |>
|
||||||
|
filter(server == "mastodon.social") |>
|
||||||
|
sample_n(100) |>
|
||||||
autoplot(active)
|
autoplot(active)
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -391,24 +423,25 @@ ad_time |>
|
|||||||
#| fig-height: 4
|
#| fig-height: 4
|
||||||
#| eval: false
|
#| eval: false
|
||||||
study_period <- 45
|
study_period <- 45
|
||||||
|
last_day <- "2024-01-15"
|
||||||
#formerly accounts_processed_recent
|
#formerly accounts_processed_recent
|
||||||
|
#server_counts <- arrow::read_feather(
|
||||||
|
# "data/scratch/accounts.feather",
|
||||||
server_counts <- arrow::read_feather(
|
# col_select=c("server", "username", "created_at", "bot")
|
||||||
"data/scratch/accounts.feather",
|
# ) %>%
|
||||||
col_select=c("server", "username", "created_at", "bot")
|
# filter(created_at <= "2023-03-01") %>%
|
||||||
) %>%
|
# filter(!bot) %>%
|
||||||
filter(created_at <= "2023-03-01") %>%
|
# group_by(server) %>%
|
||||||
filter(!bot) %>%
|
# summarize(server_count = n()) %>%
|
||||||
group_by(server) %>%
|
# arrange(desc(server_count)) %>%
|
||||||
summarize(server_count = n()) %>%
|
# mutate(server_count_bin = floor(log10(server_count)))
|
||||||
arrange(desc(server_count)) %>%
|
|
||||||
mutate(server_count_bin = floor(log10(server_count)))
|
|
||||||
|
|
||||||
metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
|
metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
|
||||||
arrange(desc(user_count)) %>%
|
arrange(desc(user_count)) %>%
|
||||||
mutate(server_count = user_count) %>%
|
mutate(server_count = user_count) %>%
|
||||||
mutate(server_count_bin = floor(log10(server_count)))
|
mutate(server_count_bin = floor(log10(server_count))) %>%
|
||||||
|
mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
|
||||||
|
mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))
|
||||||
|
|
||||||
activity <- arrow::read_feather(
|
activity <- arrow::read_feather(
|
||||||
"data/scratch/activity.feather",
|
"data/scratch/activity.feather",
|
||||||
@ -418,23 +451,29 @@ activity <- arrow::read_feather(
|
|||||||
mutate(server_count = logins) %>%
|
mutate(server_count = logins) %>%
|
||||||
mutate(server_count_bin = floor(log10(server_count))) %>%
|
mutate(server_count_bin = floor(log10(server_count))) %>%
|
||||||
# Merge 4 and 5
|
# Merge 4 and 5
|
||||||
mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin))# %>%
|
#mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
|
||||||
# Merge 2 and 3
|
# Merge below 2
|
||||||
#mutate(server_count_bin = ifelse(server_count_bin == 3, 2, server_count_bin))
|
#mutate(server_count_bin = ifelse((server_count_bin <= 2) & (server_count_bin >= 1), 2, server_count_bin)) %>%
|
||||||
|
mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))
|
||||||
|
|
||||||
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
|
jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
|
||||||
|
|
||||||
a <- accounts %>%
|
a <- accounts %>%
|
||||||
|
filter(!has_moved) %>%
|
||||||
|
#filter(created_at >= "2023-06-01") %>%
|
||||||
|
#filter(created_at < "2023-08-01") %>%
|
||||||
|
filter(created_at >= "2023-10-15") %>%
|
||||||
|
filter(created_at < "2023-12-01") %>%
|
||||||
inner_join(activity, by="server") %>%
|
inner_join(activity, by="server") %>%
|
||||||
mutate(large_server = server_count > 1000) %>%
|
filter(created_at < last_status_at) %>%
|
||||||
|
#mutate(large_server = server_count > 1000) %>%
|
||||||
mutate(active_time = as.integer(active_time)) %>%
|
mutate(active_time = as.integer(active_time)) %>%
|
||||||
mutate(active_time_weeks = active_time) %>%
|
mutate(active_time_weeks = active_time) %>%
|
||||||
mutate(status = ifelse(active, 0, 1)) %>%
|
mutate(status = ifelse(active, 0, 1)) %>%
|
||||||
mutate(jm = server %in% jm$domain) %>%
|
mutate(jm = server %in% jm$domain) #%>% filter(server_count > 0)
|
||||||
filter(server_count > 0)
|
|
||||||
|
|
||||||
|
|
||||||
survfit2(Surv(active_time_weeks, status) ~ server_count_bin, data = a) %>%
|
survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>% # strata(server_count_bin)
|
||||||
ggsurvfit() +
|
ggsurvfit() +
|
||||||
add_confidence_interval() +
|
add_confidence_interval() +
|
||||||
scale_y_continuous(limits = c(0, 1)) +
|
scale_y_continuous(limits = c(0, 1)) +
|
||||||
@ -442,29 +481,74 @@ survfit2(Surv(active_time_weeks, status) ~ server_count_bin, data = a) %>%
|
|||||||
y = "Overall survival probability",
|
y = "Overall survival probability",
|
||||||
x = "Time (days)",
|
x = "Time (days)",
|
||||||
) +
|
) +
|
||||||
scale_x_continuous(
|
#scale_x_continuous(
|
||||||
breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
|
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
|
||||||
labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
|
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
|
||||||
) +
|
#) +
|
||||||
theme_bw_small_labels() +
|
theme_bw_small_labels() +
|
||||||
theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
a %>% filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
|
||||||
|
mutate(is_general = category=="general") %>%
|
||||||
|
mutate(is_en = language == "en") %>%
|
||||||
|
mutate(is_large = last_week_users >= 585) %>% #filter(following_count < 10) %>%
|
||||||
|
survfit2(Surv(active_time_weeks, status) ~ is_general + is_large, data = .) %>% # strata(server_count_bin)
|
||||||
|
ggsurvfit(linetype_aes=TRUE, type = "survival") +
|
||||||
|
add_confidence_interval() +
|
||||||
|
scale_y_continuous(limits = c(0, 1)) +
|
||||||
|
labs(
|
||||||
|
y = "Overall survival probability",
|
||||||
|
x = "Time (days)",
|
||||||
|
) +
|
||||||
|
#facet_wrap(~strata, nrow = 3) +
|
||||||
|
#scale_x_continuous(
|
||||||
|
# breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
|
||||||
|
# labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
|
||||||
|
#) +
|
||||||
|
add_censor_mark() +
|
||||||
|
theme_bw_small_labels() +
|
||||||
|
theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
#| eval: false
|
#| eval: false
|
||||||
sel_a <- a %>%
|
library(coxme)
|
||||||
mutate(is_ms = server == "mastodon.social") %>%
|
|
||||||
filter(jm)
|
sel_a <- a %>%
|
||||||
cx <- coxph(Surv(active_time_weeks, status) ~ log10(server_count), data = sel_a)
|
filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
|
||||||
|
#mutate(is_general = category=="general") %>%
|
||||||
|
rowwise() %>%
|
||||||
|
mutate(is_regional = "regional" %in% categories) %>%
|
||||||
|
mutate(is_general = "general" %in% categories) %>%
|
||||||
|
mutate(is_neither = !(is_regional | is_regional)) %>%
|
||||||
|
mutate(is_en = language == "en") %>%
|
||||||
|
rowwise() %>%
|
||||||
|
mutate(n_categories = length(categories) - is_regional - is_general) %>%
|
||||||
|
mutate(many_categories = n_categories > 0) %>%
|
||||||
|
mutate(is_large = last_week_users >= 585) %>%
|
||||||
|
mutate(follows_someone = followers_count > 0) %>% filter(server_count > 0) %>%
|
||||||
|
ungroup
|
||||||
|
#cx <- coxme(Surv(active_time_weeks, status) ~ is_large + is_general + approval_required + (1|server), data = sel_a, x=TRUE)
|
||||||
|
cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general*is_regional + is_general:log1p(server_count), data = sel_a, x=TRUE)
|
||||||
|
coxme(Surv(active_time_weeks, status) ~ is_neither + is_general:log1p(server_count) + (1|server), data = sel_a, x=TRUE)
|
||||||
|
cx <- coxph(Surv(active_time_weeks, status) ~ is_neither + many_categories + is_general:log10(server_count), data = sel_a, x=TRUE)
|
||||||
cz <- cox.zph(cx)
|
cz <- cox.zph(cx)
|
||||||
#plot(cz)
|
#plot(cz)
|
||||||
cx
|
cz
|
||||||
```
|
```
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
#| eval: false
|
#| eval: false
|
||||||
obj <- rfsrc(Surv(active_time_weeks, status) ~ server_count_bin + jm, data = (a %>% sample_n(1000)), ntree=5000)
|
options(rf.cores=2, mc.cores=2)
|
||||||
plot(get.tree(obj, 0))
|
for_data <- sel_a #%>% slice_sample(n=2500)
|
||||||
|
obj <- rfsrc.fast(Surv(active_time_weeks, status) ~ is_neither + is_general*server_count, data = for_data, ntree=100, forest=TRUE)
|
||||||
|
#predictions <- predict(obj, newdata = newData)$predicted
|
||||||
|
#plot(get.tree(obj, 1))
|
||||||
|
reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
|
||||||
|
plot.subsample(reg.smp.o)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Moved Accounts
|
## Moved Accounts
|
||||||
|
Loading…
Reference in New Issue
Block a user