junior-sheer/codebase/load_accounts.py
Carl Colglazier 60023b07d1 Refactor scripts and code.
Split manuscripts into their own directories / projects.
2025-05-26 20:08:57 -05:00

81 lines
3.5 KiB
Python

import polars as pl
import json
def get_fullweek_status_count(x: str, attr: str) -> int:
    """Return one integer attribute from the second entry of a JSON array.

    ``x`` is decoded as JSON, the element at index 1 is selected, and the
    value stored under ``attr`` is converted with ``int``.
    NOTE(review): presumably index 0 is the current, still-incomplete week
    and index 1 the last full week -- confirm against the data source.

    Args:
        x: JSON text expected to decode to a sequence with at least two
            mapping entries.
        attr: Key to read from the second entry (e.g. ``"logins"``).

    Returns:
        The attribute as an ``int``, or ``-1`` when the text cannot be
        parsed, the entry or key is missing, or the value is not
        convertible to an integer.
    """
    try:
        data = json.loads(x)
        return int(data[1][attr])
    except (ValueError, TypeError, LookupError, OverflowError, RecursionError):
        # Best-effort extraction: malformed or incomplete records map to the
        # sentinel -1. Only data-related failures are caught so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        return -1
def read_activity_file(f):
    """Load activity records from an IPC file and extract weekly counts.

    Rows are kept only when ``data_string`` starts with ``[{"week":``. For
    each kept row the ``week``, ``logins``, ``statuses`` and
    ``registrations`` values are extracted with
    ``get_fullweek_status_count`` (which yields ``-1`` when extraction
    fails) and stored in columns of the same names. The result is sorted by
    ``logins``.

    Args:
        f: Source accepted by ``pl.read_ipc`` (path or file-like object).

    Returns:
        A polars DataFrame with the original columns plus the four integer
        count columns.
    """

    def count_column(attr):
        # The helper always returns a Python int, so the dtype is pinned
        # explicitly instead of being inferred from the data; this keeps the
        # schema stable even when the filtered frame has no rows.
        return (
            pl.col("data_string")
            .map_elements(
                lambda x: get_fullweek_status_count(x, attr),
                return_dtype=pl.Int64,
            )
            .alias(attr)
        )

    return (
        pl.read_ipc(f)
        .filter(pl.col("data_string").str.starts_with('[{"week":'))
        .with_columns(
            count_column("week"),
            count_column("logins"),
            count_column("statuses"),
            count_column("registrations"),
        )
        .sort("logins")
    )
def read_metadata_file(f):
    """Load metadata records from an IPC file and flatten selected fields.

    Rows are kept only when ``data_string`` starts with ``{"uri":``. NUL
    characters are stripped from the text, which is then decoded as JSON
    into a ``data`` struct column. Selected struct fields (plus
    ``stats.user_count``) are copied into columns of their own.

    Args:
        f: Source accepted by ``pl.read_ipc`` (path or file-like object).

    Returns:
        A polars DataFrame with the original columns, the decoded ``data``
        struct, and one column per extracted field.
    """
    raw_text = pl.col("data_string")
    decoded = pl.col("data")

    # Field names are split around ``user_count`` so the output column order
    # stays exactly: uri ... version, user_count, languages ... invites_enabled.
    fields_before_stats = (
        "uri",
        "title",
        "short_description",
        "description",
        "email",
        "version",
    )
    fields_after_stats = (
        "languages",
        "registrations",
        "approval_required",
        "invites_enabled",
    )

    extracted = [
        decoded.struct.field(name).alias(name) for name in fields_before_stats
    ]
    extracted.append(
        decoded.struct.field("stats").struct.field("user_count").alias("user_count")
    )
    extracted.extend(
        decoded.struct.field(name).alias(name) for name in fields_after_stats
    )

    frame = pl.read_ipc(f)
    frame = frame.filter(raw_text.str.starts_with('{"uri":'))
    # Remove every NUL character from the raw text before JSON decoding.
    frame = frame.with_columns(raw_text.str.replace_all("\0", "").alias("data_string"))
    frame = frame.with_columns(raw_text.str.json_decode().alias("data"))
    return frame.with_columns(*extracted)
def read_accounts_file(f):
    """Load account records from an IPC file and flatten them into columns.

    Rows whose ``data_string`` contains ``"pleroma":`` are dropped. The
    remaining text is decoded as JSON into a ``data`` struct column, the
    account fields are copied into columns of their own, ``created_at`` is
    parsed to a datetime, and a boolean ``limited`` column records whether
    the raw text contains ``"limited": true``. Finally ``last_status_at`` is
    parsed to a UTC datetime: values longer than 10 characters use the full
    timestamp format, all others the date-only format; values that do not
    match become null (``strict=False``).

    Args:
        f: Source accepted by ``pl.read_ipc`` (path or file-like object).

    Returns:
        A polars DataFrame with the original columns, the decoded ``data``
        struct, and the extracted account columns.
    """
    raw_text = pl.col("data_string")
    record = pl.col("data")

    def field(name):
        # Struct field lookup; the output column takes the field's name.
        return record.struct.field(name)

    # Names are split around ``created_at`` (the only field converted during
    # extraction) so the output column order matches the field order below.
    fields_before_created = (
        "id",
        "username",
        "acct",
        "display_name",
        "locked",
        "bot",
        "discoverable",
        "group",
    )
    fields_after_created = (
        "note",
        "url",
        "uri",
        "avatar",
        "avatar_static",
        "header",
        "header_static",
        "followers_count",
        "following_count",
        "statuses_count",
        "last_status_at",
        "noindex",
    )

    extracted = [field(name) for name in fields_before_created]
    extracted.append(field("created_at").str.to_datetime())
    extracted.extend(field(name) for name in fields_after_created)
    extracted.append(raw_text.str.contains('"limited": true').alias("limited"))

    last_status = pl.col("last_status_at")
    parsed_as_timestamp = last_status.str.strptime(
        pl.Datetime, format='%Y-%m-%dT%H:%M:%S%.fZ', strict=False
    ).dt.replace_time_zone("UTC")
    parsed_as_date = last_status.str.strptime(
        pl.Datetime, format='%Y-%m-%d', strict=False
    ).dt.replace_time_zone("UTC")
    normalized_last_status = (
        pl.when(last_status.str.len_chars() > 10)
        .then(parsed_as_timestamp)
        .otherwise(parsed_as_date)
        .alias("last_status_at")
    )

    # NOTE: an additional filter keeping only rows that start with '{"id"'
    # is intentionally not applied here.
    accounts = pl.read_ipc(f).filter(raw_text.str.contains('"pleroma":').not_())
    accounts = accounts.with_columns(raw_text.str.json_decode().alias("data"))
    accounts = accounts.with_columns(*extracted)
    return accounts.with_columns(normalized_last_status)