import polars as pl import json def get_fullweek_status_count(x: str, attr): try: data = json.loads(x) return int(data[1][attr]) except: return -1 def read_activity_file(f): return pl.read_ipc(f).filter( pl.col("data_string").str.starts_with('[{"week":') ).with_columns( pl.col("data_string").map_elements(lambda x: get_fullweek_status_count(x, "week")).alias('week'), pl.col("data_string").map_elements(lambda x: get_fullweek_status_count(x, "logins")).alias('logins'), pl.col("data_string").map_elements(lambda x: get_fullweek_status_count(x, "statuses")).alias('statuses'), pl.col("data_string").map_elements(lambda x: get_fullweek_status_count(x, "registrations")).alias('registrations'), ).sort("logins") def read_metadata_file(f): return pl.read_ipc(f).filter( pl.col("data_string").str.starts_with('{"uri":') ).with_columns( # replace all null \0 with nothing pl.col("data_string").str.replace_all("\0", "").alias("data_string") ).with_columns( pl.col("data_string").str.json_decode().alias("data") ).with_columns( pl.col("data").struct.field("uri").alias("uri"), pl.col("data").struct.field("title").alias("title"), pl.col("data").struct.field("short_description").alias("short_description"), pl.col("data").struct.field("description").alias("description"), pl.col("data").struct.field("email").alias("email"), pl.col("data").struct.field("version").alias("version"), pl.col("data").struct.field("stats").struct.field("user_count").alias("user_count"), pl.col("data").struct.field("languages").alias("languages"), pl.col("data").struct.field("registrations").alias("registrations"), pl.col("data").struct.field("approval_required").alias("approval_required"), pl.col("data").struct.field("invites_enabled").alias("invites_enabled"), ) def read_accounts_file(f): return pl.read_ipc(f).filter( pl.col("data_string").str.contains('"pleroma":').not_() #).filter( #pl.col("data_string").str.starts_with('{"id"') ).with_columns( pl.col("data_string").str.json_decode().alias("data") ).with_columns( pl.col("data").struct.field("id"), pl.col("data").struct.field("username"), pl.col("data").struct.field("acct"), pl.col("data").struct.field("display_name"), pl.col("data").struct.field("locked"), pl.col("data").struct.field("bot"), pl.col("data").struct.field("discoverable"), pl.col("data").struct.field("group"), pl.col("data").struct.field("created_at").str.to_datetime(), pl.col("data").struct.field("note"), pl.col("data").struct.field("url"), pl.col("data").struct.field("uri"), pl.col("data").struct.field("avatar"), pl.col("data").struct.field("avatar_static"), pl.col("data").struct.field("header"), pl.col("data").struct.field("header_static"), pl.col("data").struct.field("followers_count"), pl.col("data").struct.field("following_count"), pl.col("data").struct.field("statuses_count"), pl.col("data").struct.field("last_status_at"), pl.col("data").struct.field("noindex"), pl.col("data_string").str.contains("""\"limited\": true""").alias("limited"), ).with_columns( pl.when( pl.col("last_status_at").str.len_chars() > 10).then( pl.col("last_status_at").str.strptime(pl.Datetime, format='%Y-%m-%dT%H:%M:%S%.fZ', strict=False).dt.replace_time_zone("UTC") ).otherwise( pl.col("last_status_at").str.strptime(pl.Datetime, format='%Y-%m-%d', strict=False).dt.replace_time_zone("UTC") ).alias("last_status_at") )