From 6a83cc2c244cd8771983fdca7c8872a493736286 Mon Sep 17 00:00:00 2001
From: Carl Colglazier
Date: Wed, 7 Feb 2024 11:41:14 -0500
Subject: [PATCH] Iteration on random forest

---
 index.qmd | 164 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 124 insertions(+), 40 deletions(-)

diff --git a/index.qmd b/index.qmd
index e41502f..9ee3693 100644
--- a/index.qmd
+++ b/index.qmd
@@ -42,6 +42,8 @@ execute:
 ```{r}
 #| label: r-setup
 #| output: false
+#| error: false
+#| warning: false
 library(reticulate)
 library(tidyverse)
 library(arrow)
@@ -199,7 +201,7 @@ all_accounts.filter(pl.col("host").eq(pl.col("server"))).unique(["account", "ser
 ```{r}
 #| eval: false
 arrow::read_feather(
-  "data/scratch/accounts.feather",
+  "data/scratch/accounts_processed_profiles.feather",
   col_select = c(
     "server", "username", "created_at", "last_status_at",
     "statuses_count",
@@ -217,6 +219,7 @@ arrow::read_feather(
   filter(statuses_count > 1) %>%
   filter(!suspended) %>%
   filter(!has_moved) %>%
+  filter(server == "mastodon.social") %>%
   #filter(last_status_at >= created_at) %>%
   mutate(created_month = format(created_at, "%Y-%m")) %>%
   group_by(created_month) %>%
@@ -230,19 +233,26 @@ arrow::read_feather(
 
 ```{r}
 #| label: fig-account-timeline
-#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels shows the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the annoucement day of the Elon Musk Twitter acquisition, the acquisition closing day, and a day when Twitter experienced an outage and started rate limiting accounts."
+#| fig-cap: "Accounts in the dataset created between January 2022 and March 2023. The top panels show the proportion of accounts still active 45 days after creation, the proportion of accounts that have moved, and the proportion of accounts that have been suspended. The bottom panel shows the count of accounts created each week. The dashed vertical lines in the bottom panel represent the announcement day of the Elon Musk Twitter acquisition, the acquisition closing day, a day when Twitter suspended a number of prominent journalists, and a day when Twitter experienced an outage and started rate limiting accounts."
 #| fig-height: 3
 #| fig-width: 6.75
-accounts_unfilt <- arrow::read_feather("data/scratch/all_accounts.feather", col_select=c("server", "username", "created_at", "last_status_at", "statuses_count", "has_moved", "bot", "suspended", "host")) %>%
-  filter(server == host)
+
+jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
+accounts_unfilt <- arrow::read_feather(
+  "data/scratch/all_accounts.feather",
+  col_select=c(
+    "server", "username", "created_at", "last_status_at",
+    "statuses_count", "has_moved", "bot", "suspended",
+    "following_count", "followers_count"
+  ))
 accounts <- accounts_unfilt %>%
   filter(!bot) %>%
   # TODO: what's going on here?
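+  # Keep accounts with a recorded last status, treat a missing `suspended`
+  # flag as FALSE, and keep only accounts created in the study window that
+  # posted at least once after creation.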
   filter(!is.na(last_status_at)) %>%
   mutate(suspended = replace_na(suspended, FALSE)) %>%
   # sanity check
-  filter(created_at >= "2021-01-01") %>%
-  filter(created_at < "2023-08-01") %>%
+  filter(created_at >= "2020-10-01") %>%
+  filter(created_at < "2024-01-01") %>%
   # We don't want accounts that were created and then immediately stopped being active
   filter(statuses_count >= 1) %>%
   filter(last_status_at >= created_at) %>%
@@ -254,14 +264,20 @@ acc_data <- accounts %>%
   #filter(!has_moved) %>%
   mutate(created_month = format(created_at, "%Y-%m")) %>%
   mutate(created_week = floor_date(created_at, unit = "week")) %>%
+  mutate(active_now = active) %>%
   mutate(active = active_time >= 45) %>%
+  mutate("Is mastodon.social" = server == "mastodon.social") %>%
+  mutate(jm = server %in% jm$domain) %>%
   group_by(created_week) %>%
   summarize(
+    `JoinMastodon Server` = sum(jm) / n(),
+    `Is mastodon.social` = sum(`Is mastodon.social`)/n(),
     Suspended = sum(suspended)/n(),
     Active = (sum(active)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
+    active_now = (sum(active_now)-sum(has_moved)-sum(suspended))/(n()-sum(has_moved)-sum(suspended)),
     Moved=sum(has_moved)/n(),
     count=n()) %>%
-  pivot_longer(cols=c("Active", "Moved"), names_to="Measure", values_to="value") # "Suspended"
+  pivot_longer(cols=c("JoinMastodon Server", "active_now", "Active", "Moved", "Is mastodon.social"), names_to="Measure", values_to="value") # "Suspended"
 theme_bw_small_labels <- function(base_size = 9) {
   theme_bw(base_size = base_size) %+replace%
     theme(
@@ -274,6 +290,7 @@ theme_bw_small_labels <- function(base_size = 9) {
       legend.text = element_text(size = base_size * 0.8)
     )
 }
+
 p1 <- acc_data %>%
   ggplot(aes(x=as.Date(created_week), group=1)) +
   geom_line(aes(y=value, group=Measure, color=Measure)) +
@@ -298,6 +315,9 @@ p2 <- acc_data %>%
     aes(xintercept = as.numeric(as.Date("2022-04-14"))),
     linetype="dashed", color = "black") +
+  geom_vline(
+    aes(xintercept = as.numeric(as.Date("2022-12-15"))),
+    linetype="dashed", color = "black") +
   # https://twitter.com/elonmusk/status/1675187969420828672
   geom_vline(
     aes(xintercept = as.numeric(as.Date("2023-07-01"))),
     linetype="dashed", color = "black") +
@@ -320,6 +340,7 @@ p1 + p2 + plot_layout(ncol = 1)
 
 ```{r}
 #| label: fig-active-accounts
+#| eval: false
 #library(betareg)
 library(lme4)
 activity <- arrow::read_feather(
@@ -364,15 +385,23 @@ a_data %>%
   #scale_y_log10() +
   theme_bw_small_labels()
 ```
+
 
 ```{r}
+#| eval: false
 library(fable)
-library(fable.binary)
+#library(fable.binary)
+library(tsibble)
+library(lubridate)
 ad_time <- account_data |>
-  as_tsibble(key=c("username", "server"), index=created_at)
+  mutate(created_at = yearweek(created_at)) |>
+  group_by(server, created_at) |>
+  summarize(count = n(), active = sum(active)) |>
+  as_tsibble(key="server", index=created_at)
 ```
 
 ```{r}
+#| eval: false
 fit <- ad_time |>
   model(
     logistic = LOGISTIC(active ~ fourier(K = 5, period = "year"))
@@ -380,7 +409,10 @@ fit <- ad_time |>
 ```
 
 ```{r}
+#| eval: false
 ad_time |>
+  filter(server == "mastodon.social") |>
+  sample_n(100) |>
   autoplot(active)
 ```
 
@@ -391,24 +423,25 @@ ad_time |>
 #| fig-height: 4
 #| eval: false
 study_period <- 45
+last_day <- "2024-01-15"
 #formerly accounts_processed_recent
-
-
-server_counts <- arrow::read_feather(
-  "data/scratch/accounts.feather",
-  col_select=c("server", "username", "created_at", "bot")
-  ) %>%
-  filter(created_at <= "2023-03-01") %>%
-  filter(!bot) %>%
-  group_by(server) %>%
-  summarize(server_count = n()) %>%
-  arrange(desc(server_count)) %>%
-  mutate(server_count_bin = floor(log10(server_count)))
+#server_counts <- arrow::read_feather(
+#  "data/scratch/accounts.feather",
+#  col_select=c("server", "username", "created_at", "bot")
+#  ) %>%
+#  filter(created_at <= "2023-03-01") %>%
+#  filter(!bot) %>%
+#  group_by(server) %>%
+#  summarize(server_count = n()) %>%
+#  arrange(desc(server_count)) %>%
+#  mutate(server_count_bin = floor(log10(server_count)))
 metadata <- arrow::read_feather("data/scratch/metadata.feather", col_select=c("server", "user_count")) %>%
   arrange(desc(user_count)) %>%
   mutate(server_count = user_count) %>%
-  mutate(server_count_bin = floor(log10(server_count)))
+  mutate(server_count_bin = floor(log10(server_count))) %>%
+  mutate(server_count_bin = ifelse(server_count_bin >= 4, 4, server_count_bin)) %>%
+  mutate(server_count_bin = ifelse(server_count_bin <= 2, 2, server_count_bin))
 
 activity <- arrow::read_feather(
   "data/scratch/activity.feather",
@@ -418,23 +451,29 @@ activity <- arrow::read_feather(
   mutate(server_count = logins) %>%
   mutate(server_count_bin = floor(log10(server_count))) %>%
   # Merge 4 and 5
-  mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin))# %>%
-  # Merge 2 and 3
-  #mutate(server_count_bin = ifelse(server_count_bin == 3, 2, server_count_bin))
+  #mutate(server_count_bin = ifelse(server_count_bin >= 5, 4, server_count_bin)) %>%
+  # Merge below 2
+  #mutate(server_count_bin = ifelse((server_count_bin <= 2) & (server_count_bin >= 1), 2, server_count_bin)) %>%
+  mutate(server_count_bin = ifelse(server_count == 0, -1, server_count_bin))
 
 jm <- arrow::read_feather("data/scratch/joinmastodon.feather")
 
 a <- accounts %>%
+  filter(!has_moved) %>%
+  #filter(created_at >= "2023-06-01") %>%
+  #filter(created_at < "2023-08-01") %>%
+  filter(created_at >= "2023-10-15") %>%
+  filter(created_at < "2023-12-01") %>%
   inner_join(activity, by="server") %>%
-  mutate(large_server = server_count > 1000) %>%
+  filter(created_at < last_status_at) %>%
+  #mutate(large_server = server_count > 1000) %>%
   mutate(active_time = as.integer(active_time)) %>%
   mutate(active_time_weeks = active_time) %>%
   mutate(status = ifelse(active, 0, 1)) %>%
-  mutate(jm = server %in% jm$domain) %>%
-  filter(server_count > 0)
+  mutate(jm = server %in% jm$domain) #%>% filter(server_count > 0)
 
-survfit2(Surv(active_time_weeks, status) ~ server_count_bin, data = a) %>%
+survfit2(Surv(active_time_weeks, status) ~ strata(server_count_bin) + 1, data = a) %>% # strata(server_count_bin)
   ggsurvfit() +
   add_confidence_interval() +
   scale_y_continuous(limits = c(0, 1)) +
@@ -442,29 +481,74 @@ survfit2(Surv(active_time_weeks, status) ~ server_count_bin, data = a) %>%
     y = "Overall survival probability",
     x = "Time (days)",
   ) +
-  scale_x_continuous(
-    breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
-    labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
-  ) +
+  #scale_x_continuous(
+  #  breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
+  #  labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
+  #) +
   theme_bw_small_labels() +
   theme(axis.text.x = element_text(angle = 45, hjust = 1))
 ```
+```{r}
+a %>% filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
+  mutate(is_general = category=="general") %>%
+  mutate(is_en = language == "en") %>%
+  mutate(is_large = last_week_users >= 585) %>% #filter(following_count < 10) %>%
+survfit2(Surv(active_time_weeks, status) ~ is_general + is_large, data = .) %>% # strata(server_count_bin)
+  ggsurvfit(linetype_aes=TRUE, type = "survival") +
+  add_confidence_interval() +
+  scale_y_continuous(limits = c(0, 1)) +
+  labs(
+    y = "Overall survival probability",
+    x = "Time (days)",
+  ) +
+  #facet_wrap(~strata, nrow = 3) +
+  #scale_x_continuous(
+  #  breaks = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4),
+  #  labels = seq(0, max(a$active_time_weeks, na.rm = TRUE), by = 4)
+  #) +
+  add_censor_mark() +
+  theme_bw_small_labels() +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+```
+
+
 ```{r}
 #| eval: false
-sel_a <- a %>%
-  mutate(is_ms = server == "mastodon.social") %>%
-  filter(jm)
-cx <- coxph(Surv(active_time_weeks, status) ~ log10(server_count), data = sel_a)
+library(coxme)
+
+sel_a <- a %>%
+  filter(jm) %>% inner_join(., jm, by=c("server"="domain")) %>%
+  #mutate(is_general = category=="general") %>%
+  rowwise() %>%
+  mutate(is_regional = "regional" %in% categories) %>%
+  mutate(is_general = "general" %in% categories) %>%
+  mutate(is_neither = !(is_regional | is_general)) %>%
+  mutate(is_en = language == "en") %>%
+  rowwise() %>%
+  mutate(n_categories = length(categories) - is_regional - is_general) %>%
+  mutate(many_categories = n_categories > 0) %>%
+  mutate(is_large = last_week_users >= 585) %>%
+  mutate(follows_someone = followers_count > 0) %>% filter(server_count > 0) %>%
+  ungroup()
+#cx <- coxme(Surv(active_time_weeks, status) ~ is_large + is_general + approval_required + (1|server), data = sel_a, x=TRUE)
+cx <- coxph(Surv(active_time_weeks, status) ~ many_categories + is_general*is_regional + is_general:log1p(server_count), data = sel_a, x=TRUE)
+coxme(Surv(active_time_weeks, status) ~ is_neither + is_general:log1p(server_count) + (1|server), data = sel_a, x=TRUE)
+cx <- coxph(Surv(active_time_weeks, status) ~ is_neither + many_categories + is_general:log10(server_count), data = sel_a, x=TRUE)
 cz <- cox.zph(cx)
 #plot(cz)
-cx
+cz
 ```
 
 ```{r}
 #| eval: false
-obj <- rfsrc(Surv(active_time_weeks, status) ~ server_count_bin + jm, data = (a %>% sample_n(1000)), ntree=5000)
-plot(get.tree(obj, 0))
+options(rf.cores=2, mc.cores=2)
+for_data <- sel_a #%>% slice_sample(n=2500)
+obj <- rfsrc.fast(Surv(active_time_weeks, status) ~ is_neither + is_general*server_count, data = for_data, ntree=100, forest=TRUE)
+#predictions <- predict(obj, newdata = newData)$predicted
+#plot(get.tree(obj, 1))
+reg.smp.o <- subsample(obj, B = 10, verbose = TRUE)#, subratio = .5)
+plot.subsample(reg.smp.o)
 ```
 
 ## Moved Accounts