junior-sheer/code/load_accounts.py
Carl Colglazier fc929dbfed Init
2024-01-31 17:08:47 -06:00

73 lines
2.9 KiB
Python

import polars as pl
import json
def get_fullweek_status_count(x: str) -> int:
    """Return the ``logins`` value of the second entry in a JSON activity list.

    The input is expected to be a JSON array of objects; the element at
    index 1 is read and its ``"logins"`` value is converted to ``int``.
    NOTE(review): the function name mentions "status count" but the field
    read is ``"logins"`` -- confirm which one is intended.
    NOTE(review): index 1 is presumably the most recent *complete* week
    (index 0 being the week in progress) -- confirm against the data source.

    Args:
        x: JSON-encoded string.

    Returns:
        The integer login count, or ``-1`` when the string cannot be parsed,
        has no second entry, lacks a ``"logins"`` key, or holds a value that
        cannot be converted to an integer.
    """
    try:
        data = json.loads(x)
        return int(data[1]["logins"])
    except (ValueError, TypeError, KeyError, IndexError, OverflowError):
        # ValueError covers json.JSONDecodeError and non-numeric strings;
        # TypeError covers None / non-subscriptable payloads; OverflowError
        # covers int(float("inf")). Anything else (e.g. KeyboardInterrupt)
        # is deliberately allowed to propagate.
        return -1
def read_activity_file(f):
    """Load an activity IPC/Feather file and attach a ``logins`` column.

    Only rows whose ``data_string`` starts with ``[{"week":`` are kept.
    Each remaining ``data_string`` is passed to
    ``get_fullweek_status_count`` and the result is stored in a new
    ``logins`` column; the frame is then sorted by ``logins``.

    Args:
        f: Path (or file-like source accepted by ``pl.read_ipc``) of the
            activity file. Previously this argument was ignored and a
            hard-coded path was read instead.

    Returns:
        A polars DataFrame sorted by the ``logins`` column.
    """
    is_activity_row = pl.col("data_string").str.starts_with('[{"week":')
    # The helper always returns a Python int, so declare the dtype up front
    # instead of relying on inference (which is ambiguous on empty input).
    logins = (
        pl.col("data_string")
        .map_elements(get_fullweek_status_count, return_dtype=pl.Int64)
        .alias('logins')
    )
    return pl.read_ipc(f).filter(is_activity_row).with_columns(logins).sort("logins")
def read_metadata_file(f):
    """Load a metadata IPC/Feather file and unpack selected JSON fields.

    Rows whose ``data_string`` does not start with ``{"uri":`` are dropped.
    NUL characters are stripped from ``data_string`` before it is decoded
    into a struct column named ``data``. Selected top-level fields, plus
    ``stats.user_count``, are then copied into columns of their own.

    Args:
        f: Source passed straight to ``pl.read_ipc``.

    Returns:
        A polars DataFrame with the original columns, the decoded ``data``
        struct, and the extracted field columns.
    """
    top_level_fields = (
        "uri",
        "title",
        "short_description",
        "description",
        "email",
        "version",
    )

    raw = pl.read_ipc(f)
    metadata_rows = raw.filter(pl.col("data_string").str.starts_with('{"uri":'))

    # replace all null \0 with nothing
    cleaned = metadata_rows.with_columns(
        pl.col("data_string").str.replace_all("\0", "").alias("data_string")
    )
    decoded = cleaned.with_columns(
        pl.col("data_string").str.json_decode().alias("data")
    )

    payload = pl.col("data")
    extracted = [payload.struct.field(name).alias(name) for name in top_level_fields]
    # user_count lives one level down, inside the "stats" struct.
    extracted.append(
        payload.struct.field("stats").struct.field("user_count").alias("user_count")
    )
    return decoded.with_columns(extracted)
def read_accounts_file(f):
    """Load an accounts IPC/Feather file and unpack the account JSON.

    ``data_string`` is decoded into a struct column named ``data``; a fixed
    list of its fields is then copied into columns of the same name, with
    ``created_at`` converted to a datetime. Finally ``last_status_at`` is
    re-parsed: values longer than 10 characters are read with the
    ``%Y-%m-%dT%H:%M:%S%.fZ`` format, all others with ``%Y-%m-%d``; both
    parses are non-strict and the result is marked as UTC.

    Args:
        f: Source passed straight to ``pl.read_ipc``.

    Returns:
        A polars DataFrame with the decoded ``data`` struct and one column
        per extracted account field.
    """
    # Order matters: it determines the order of the appended columns.
    account_fields = (
        "id",
        "username",
        "acct",
        "display_name",
        "locked",
        "bot",
        "discoverable",
        "group",
        "created_at",
        "note",
        "url",
        "uri",
        "avatar",
        "avatar_static",
        "header",
        "header_static",
        "followers_count",
        "following_count",
        "statuses_count",
        "last_status_at",
        "noindex",
        "emojis",
        "roles",
        "fields",
        "suspended",
    )

    payload = pl.col("data")
    extracted = []
    for name in account_fields:
        column = payload.struct.field(name)
        if name == "created_at":
            # created_at is the only field converted while being extracted.
            column = column.str.to_datetime()
        extracted.append(column)

    decoded = pl.read_ipc(f).with_columns(
        pl.col("data_string").str.json_decode().alias("data")
    )
    unpacked = decoded.with_columns(extracted)

    last_status = pl.col("last_status_at")
    has_time_part = last_status.str.len_chars() > 10
    parsed_as_timestamp = last_status.str.strptime(
        pl.Datetime, format='%Y-%m-%dT%H:%M:%S%.fZ', strict=False
    ).dt.replace_time_zone("UTC")
    parsed_as_date = last_status.str.strptime(
        pl.Datetime, format='%Y-%m-%d', strict=False
    ).dt.replace_time_zone("UTC")

    df = unpacked.with_columns(
        pl.when(has_time_part)
        .then(parsed_as_timestamp)
        .otherwise(parsed_as_date)
        .alias("last_status_at")
    )
    return df