81 lines
3.5 KiB
Python
81 lines
3.5 KiB
Python
import polars as pl
|
|
import json
|
|
|
|
def get_fullweek_status_count(x: str, attr: str) -> int:
    """Return one counter from the second entry of a JSON activity array.

    ``x`` is parsed as JSON, the element at index 1 is selected, and the
    value stored under ``attr`` is converted with ``int``.

    NOTE(review): presumably index 0 is the week still in progress and
    index 1 is the most recent complete week -- confirm against the data
    source.

    Args:
        x: JSON text expected to decode to a sequence with at least two
            mapping entries.
        attr: Key to look up in the second entry (e.g. ``"logins"``).

    Returns:
        The value as an ``int``, or ``-1`` when ``x`` is not valid JSON, is
        not a string, has no second entry, lacks ``attr``, or holds a value
        that cannot be converted to ``int``.
    """
    try:
        data = json.loads(x)
        return int(data[1][attr])
    except (
        ValueError,  # invalid JSON (JSONDecodeError) or non-numeric value
        TypeError,  # x is None / not text, or the payload has the wrong shape
        KeyError,  # attr missing from the second entry
        IndexError,  # fewer than two entries
        OverflowError,  # int() of a non-finite float such as Infinity
        RecursionError,  # pathologically nested JSON
    ):
        # Only data problems map to the sentinel; KeyboardInterrupt and
        # SystemExit are deliberately allowed to propagate.
        return -1
|
|
|
|
def read_activity_file(f):
    """Load an activity IPC file and extract the per-week counters.

    Reads ``f`` with ``pl.read_ipc``, keeps only rows whose ``data_string``
    starts with ``[{"week":``, and adds the integer columns ``week``,
    ``logins``, ``statuses`` and ``registrations``. Each value comes from
    ``get_fullweek_status_count``, so rows that cannot be parsed carry ``-1``
    in the affected column.

    Args:
        f: Source accepted by ``pl.read_ipc`` (path or file-like object).

    Returns:
        The filtered frame with the four counter columns added, sorted by
        ``logins`` in ascending order.
    """
    counter_names = ("week", "logins", "statuses", "registrations")

    counter_exprs = [
        pl.col("data_string")
        .map_elements(
            # Bind `name` as a default argument: the lambda is only invoked
            # when the expression is evaluated, after this loop has finished.
            lambda x, name=name: get_fullweek_status_count(x, name),
            # Pin the dtype instead of relying on inference, which is
            # unreliable for empty input and warns on recent polars versions.
            return_dtype=pl.Int64,
        )
        .alias(name)
        for name in counter_names
    ]

    return (
        pl.read_ipc(f)
        .filter(pl.col("data_string").str.starts_with('[{"week":'))
        .with_columns(counter_exprs)
        .sort("logins")
    )
|
|
|
|
def read_metadata_file(f):
    """Load a metadata IPC file and expand its JSON payload into columns.

    Steps, in order:
      1. read ``f`` with ``pl.read_ipc``;
      2. keep rows whose ``data_string`` starts with ``{"uri":``;
      3. strip NUL characters from ``data_string``;
      4. decode ``data_string`` as JSON into a struct column ``data``;
      5. copy selected struct fields into top-level columns.

    ``user_count`` is taken from the nested ``stats`` struct; every other
    extracted column is a direct field of ``data`` with the same name.

    Args:
        f: Source accepted by ``pl.read_ipc`` (path or file-like object).

    Returns:
        The frame with ``data`` and the extracted columns added.
    """
    raw = pl.col("data_string")
    payload = pl.col("data")

    # Field order below is the output column order; keep it stable.
    fields_before_stats = (
        "uri",
        "title",
        "short_description",
        "description",
        "email",
        "version",
    )
    fields_after_stats = (
        "languages",
        "registrations",
        "approval_required",
        "invites_enabled",
    )

    extracted = [payload.struct.field(name).alias(name) for name in fields_before_stats]
    extracted.append(
        payload.struct.field("stats").struct.field("user_count").alias("user_count")
    )
    extracted.extend(
        payload.struct.field(name).alias(name) for name in fields_after_stats
    )

    frame = pl.read_ipc(f)
    frame = frame.filter(raw.str.starts_with('{"uri":'))
    # Remove embedded NUL characters before the JSON decode step.
    frame = frame.with_columns(raw.str.replace_all("\0", "").alias("data_string"))
    frame = frame.with_columns(raw.str.json_decode().alias("data"))
    return frame.with_columns(extracted)
|
|
|
|
def read_accounts_file(f):
    """Load an accounts IPC file and expand its JSON payload into columns.

    Reads ``f`` with ``pl.read_ipc``, drops rows whose ``data_string``
    contains ``"pleroma":``, decodes ``data_string`` as JSON into a struct
    column ``data``, and copies the account fields into top-level columns.
    ``created_at`` is converted with ``str.to_datetime``; ``limited`` is a
    boolean computed by a substring match on the raw JSON text.

    ``last_status_at`` is finally re-parsed into a UTC datetime: values
    longer than 10 characters use the full timestamp format, all others
    the date-only format. Unparseable values become null (``strict=False``).

    Args:
        f: Source accepted by ``pl.read_ipc`` (path or file-like object).

    Returns:
        The frame with ``data`` and the extracted columns added.
    """
    raw = pl.col("data_string")
    payload = pl.col("data")

    # Field order below is the output column order; `created_at` sits
    # between the two groups because it needs a datetime conversion.
    fields_before_created = (
        "id",
        "username",
        "acct",
        "display_name",
        "locked",
        "bot",
        "discoverable",
        "group",
    )
    fields_after_created = (
        "note",
        "url",
        "uri",
        "avatar",
        "avatar_static",
        "header",
        "header_static",
        "followers_count",
        "following_count",
        "statuses_count",
        "last_status_at",
        "noindex",
    )

    extracted = [payload.struct.field(name) for name in fields_before_created]
    extracted.append(payload.struct.field("created_at").str.to_datetime())
    extracted.extend(payload.struct.field(name) for name in fields_after_created)
    # NOTE(review): this match requires exactly one space after the colon;
    # confirm the stored JSON is serialized that way.
    extracted.append(raw.str.contains("""\"limited\": true""").alias("limited"))

    frame = pl.read_ipc(f)
    frame = frame.filter(raw.str.contains('"pleroma":').not_())
    frame = frame.with_columns(raw.str.json_decode().alias("data"))
    frame = frame.with_columns(extracted)

    last_status = pl.col("last_status_at")
    has_time_part = last_status.str.len_chars() > 10
    parsed_as_timestamp = last_status.str.strptime(
        pl.Datetime, format='%Y-%m-%dT%H:%M:%S%.fZ', strict=False
    ).dt.replace_time_zone("UTC")
    parsed_as_date = last_status.str.strptime(
        pl.Datetime, format='%Y-%m-%d', strict=False
    ).dt.replace_time_zone("UTC")

    return frame.with_columns(
        pl.when(has_time_part)
        .then(parsed_as_timestamp)
        .otherwise(parsed_as_date)
        .alias("last_status_at")
    )
|