73 lines
2.9 KiB
Python
73 lines
2.9 KiB
Python
import polars as pl
|
|
import json
|
|
|
|
def get_fullweek_status_count(x: str) -> int:
    """Return the ``logins`` value of the second entry in a JSON array string.

    Args:
        x: JSON text expected to decode to a sequence whose element at
            index 1 is a mapping containing a ``"logins"`` key.

    Returns:
        The ``logins`` value converted with ``int()``, or ``-1`` when the
        text cannot be parsed or the value cannot be extracted/converted.
    """
    try:
        data = json.loads(x)
        return int(data[1]["logins"])
    except (ValueError, TypeError, LookupError, OverflowError, RecursionError):
        # ValueError: malformed JSON (JSONDecodeError) or non-numeric value.
        # TypeError: non-string input, or a decoded shape that is not
        #   indexable the way we expect.
        # LookupError: fewer than two entries (IndexError) or no "logins"
        #   key (KeyError).
        # OverflowError: value decoded as Infinity.
        # RecursionError: pathologically nested JSON.
        # Deliberately NOT a bare ``except``: KeyboardInterrupt/SystemExit
        # must still propagate.
        return -1
|
|
|
|
def read_activity_file(f):
    """Load activity records from an IPC file and extract a ``logins`` column.

    Args:
        f: Source passed straight to ``pl.read_ipc`` (previously this
            argument was ignored and ``"data/activity.feather"`` was always
            read, unlike the other readers in this module).

    Returns:
        A polars DataFrame containing only the rows whose ``data_string``
        starts with ``[{"week":``, with an added integer ``logins`` column
        computed by ``get_fullweek_status_count`` (``-1`` where extraction
        failed), sorted ascending by ``logins``.
    """
    payload = pl.col("data_string")
    return (
        pl.read_ipc(f)
        .filter(payload.str.starts_with('[{"week":'))
        .with_columns(
            # The helper always returns a Python int, so declare the dtype
            # up front instead of relying on per-call inference.
            payload.map_elements(
                get_fullweek_status_count, return_dtype=pl.Int64
            ).alias("logins")
        )
        .sort("logins")
    )
|
|
|
|
def read_metadata_file(f):
    """Load metadata records from an IPC file and flatten selected fields.

    Args:
        f: Source passed to ``pl.read_ipc``.

    Returns:
        A polars DataFrame restricted to rows whose ``data_string`` starts
        with ``{"uri":``. NUL characters are stripped from ``data_string``,
        the cleaned text is JSON-decoded into a ``data`` struct column, and
        the struct fields ``uri``, ``title``, ``short_description``,
        ``description``, ``email``, ``version`` and ``stats.user_count``
        (as ``user_count``) are added as columns.
    """
    raw = pl.col("data_string")
    decoded = pl.col("data")
    top_level_fields = (
        "uri",
        "title",
        "short_description",
        "description",
        "email",
        "version",
    )

    frame = pl.read_ipc(f)
    frame = frame.filter(raw.str.starts_with('{"uri":'))

    # Strip NUL characters before decoding the JSON text.
    frame = frame.with_columns(raw.str.replace_all("\0", "").alias("data_string"))
    frame = frame.with_columns(raw.str.json_decode().alias("data"))

    flattened = [
        decoded.struct.field(name).alias(name) for name in top_level_fields
    ]
    # user_count lives one level deeper, inside the "stats" struct.
    flattened.append(
        decoded.struct.field("stats").struct.field("user_count").alias("user_count")
    )
    return frame.with_columns(flattened)
|
|
|
|
def read_accounts_file(f):
    """Load account records from an IPC file and flatten the JSON payload.

    Args:
        f: Source passed to ``pl.read_ipc``.

    Returns:
        A polars DataFrame where ``data_string`` has been JSON-decoded into
        a ``data`` struct column and the account fields listed below have
        been pulled out of that struct. ``created_at`` is parsed with
        ``str.to_datetime()``; ``last_status_at`` is parsed (non-strictly)
        as a full timestamp when longer than 10 characters and as a plain
        date otherwise, then tagged with the UTC time zone.
    """
    account = pl.col("data")
    # Field order is significant: it fixes the resulting column order.
    fields_before_created_at = (
        "id",
        "username",
        "acct",
        "display_name",
        "locked",
        "bot",
        "discoverable",
        "group",
    )
    fields_after_created_at = (
        "note",
        "url",
        "uri",
        "avatar",
        "avatar_static",
        "header",
        "header_static",
        "followers_count",
        "following_count",
        "statuses_count",
        "last_status_at",
        "noindex",
        "emojis",
        "roles",
        "fields",
        "suspended",
    )

    extracted = [account.struct.field(name) for name in fields_before_created_at]
    extracted.append(account.struct.field("created_at").str.to_datetime())
    extracted.extend(
        account.struct.field(name) for name in fields_after_created_at
    )

    frame = pl.read_ipc(f)
    frame = frame.with_columns(
        pl.col("data_string").str.json_decode().alias("data")
    )
    frame = frame.with_columns(extracted)

    last_status = pl.col("last_status_at")
    # Values longer than 10 characters are treated as full timestamps,
    # everything else as a bare YYYY-MM-DD date.
    has_time_part = last_status.str.len_chars() > 10
    from_timestamp = last_status.str.strptime(
        pl.Datetime, format='%Y-%m-%dT%H:%M:%S%.fZ', strict=False
    ).dt.replace_time_zone("UTC")
    from_date = last_status.str.strptime(
        pl.Datetime, format='%Y-%m-%d', strict=False
    ).dt.replace_time_zone("UTC")

    df = frame.with_columns(
        pl.when(has_time_part)
        .then(from_timestamp)
        .otherwise(from_date)
        .alias("last_status_at")
    )
    return df