import polars as pl import json def get_fullweek_status_count(x: str): try: data = json.loads(x) return int(data[1]["logins"]) except: return -1 def read_activity_file(f): return pl.read_ipc("data/activity.feather").filter( pl.col("data_string").str.starts_with('[{"week":') ).with_columns( pl.col("data_string").map_elements(lambda x: get_fullweek_status_count(x)).alias('logins') ).sort("logins") def read_metadata_file(f): return pl.read_ipc(f).filter( pl.col("data_string").str.starts_with('{"uri":') ).with_columns( # replace all null \0 with nothing pl.col("data_string").str.replace_all("\0", "").alias("data_string") ).with_columns( pl.col("data_string").str.json_decode().alias("data") ).with_columns( pl.col("data").struct.field("uri").alias("uri"), pl.col("data").struct.field("title").alias("title"), pl.col("data").struct.field("short_description").alias("short_description"), pl.col("data").struct.field("description").alias("description"), pl.col("data").struct.field("email").alias("email"), pl.col("data").struct.field("version").alias("version"), pl.col("data").struct.field("stats").struct.field("user_count").alias("user_count"), ) def read_accounts_file(f): df = pl.read_ipc(f).with_columns( pl.col("data_string").str.json_decode().alias("data") ).with_columns( pl.col("data").struct.field("id"), pl.col("data").struct.field("username"), pl.col("data").struct.field("acct"), pl.col("data").struct.field("display_name"), pl.col("data").struct.field("locked"), pl.col("data").struct.field("bot"), pl.col("data").struct.field("discoverable"), pl.col("data").struct.field("group"), pl.col("data").struct.field("created_at").str.to_datetime(), pl.col("data").struct.field("note"), pl.col("data").struct.field("url"), pl.col("data").struct.field("uri"), pl.col("data").struct.field("avatar"), pl.col("data").struct.field("avatar_static"), pl.col("data").struct.field("header"), pl.col("data").struct.field("header_static"), pl.col("data").struct.field("followers_count"), pl.col("data").struct.field("following_count"), pl.col("data").struct.field("statuses_count"), pl.col("data").struct.field("last_status_at"), pl.col("data").struct.field("noindex"), pl.col("data").struct.field("emojis"), pl.col("data").struct.field("roles"), pl.col("data").struct.field("fields"), pl.col("data").struct.field("suspended"), ).with_columns( pl.when( pl.col("last_status_at").str.len_chars() > 10).then( pl.col("last_status_at").str.strptime(pl.Datetime, format='%Y-%m-%dT%H:%M:%S%.fZ', strict=False).dt.replace_time_zone("UTC") ).otherwise( pl.col("last_status_at").str.strptime(pl.Datetime, format='%Y-%m-%d', strict=False).dt.replace_time_zone("UTC") ).alias("last_status_at") ) return df