junior-sheer/codebase/preprocess.py
Carl Colglazier 60023b07d1 Refactor scripts and code.
Split manuscripts into their own directories / projects.
2025-05-26 20:08:57 -05:00

100 lines
3.4 KiB
Python

from load_accounts import *
from urllib.parse import urlparse
import polars as pl
def run_preprocess():
    """Build the processed account/server datasets under data/scratch/.

    Reads the raw account-lookup, profile, and metadata feather files,
    derives per-account columns (``host``, ``has_moved``, ``suspended``),
    extracts account-migration ("moved") records, and writes the
    intermediate feather/CSV outputs consumed downstream (including by R).
    """
    accounts = read_accounts_file(
        "data/account_lookup_compressed.feather"
    ).unique(["account", "server"])
    # Write a parsed accounts file for R to use.
    a = accounts.with_columns(
        # Decode the URL's netloc through IDNA so internationalized
        # domain names are stored in their Unicode form.
        pl.col("url").map_elements(
            lambda x: urlparse(x).netloc.encode().decode('idna')
        ).alias("host"),
        # Cheap raw-string test for a "moved" object before paying for a
        # full JSON decode below. Raw string: `\{` is an invalid escape
        # in a regular string literal but intended here as regex-escaped '{'.
        pl.col("data_string").str.contains(r'"moved": \{').alias("has_moved"),
        pl.col("data").struct.field("suspended"),
    )
    a_save = a.drop(["data", "data_string"])
    a_save.select(
        sorted(a_save.columns)
    ).write_ipc("data/scratch/accounts.feather")
    # Re-decode the JSON now that we know these rows are all moved accounts.
    moved_accounts = a.filter(pl.col("has_moved")).with_columns(
        pl.col("data_string").str.json_decode().alias("data")
    ).with_columns(
        pl.col("data").struct.field("moved")
    ).drop_nulls("moved").with_columns(
        pl.col("moved").struct.field("acct").alias("moved_acct"),
    ).with_columns(
        # "user@host" targets carry their own server; a bare "user" means
        # the account moved within the same server.
        pl.when(
            pl.col("moved_acct").str.contains('@')
        ).then(
            pl.col("moved_acct").str.split('@').list.get(1)
        ).otherwise(
            pl.col("server")
        ).alias("moved_server"),
        pl.when(
            pl.col("moved_acct").str.contains('@')
        ).then(
            pl.col("moved_acct").str.split('@').list.get(0)
        ).otherwise(
            pl.col("moved_acct")
        ).alias("moved_acct")
    )
    popular_servers = a.group_by("server").count().sort("count", descending=True)
    common_moves = moved_accounts.group_by(
        ["server", "moved_server"]
    ).count().sort("count", descending=True)
    common_moves.write_ipc("data/scratch/moved_accounts.feather")
    # Source/Target naming matches the edge-list CSV format used downstream.
    common_moves.rename({
        "server": "Source",
        "moved_server": "Target",
    }).write_csv("data/scratch/moved_accounts.csv")
    maccounts = moved_accounts.select(["account", "server", "moved_server", "moved_acct"])
    maccounts.write_ipc("data/scratch/individual_moved_accounts.feather")
    popular_servers.write_ipc("data/scratch/popular_servers.feather")
    jm = pl.read_json("data/joinmastodon.json")
    jm.write_ipc("data/scratch/joinmastodon.feather")
    read_metadata_file("data/metadata-2024-01-31.feather").drop(
        ["data", "data_string"]
    ).write_ipc("data/scratch/metadata.feather")
    read_metadata_file("data/metadata_2023-10-01.feather").drop(
        ["data", "data_string"]
    ).write_ipc("data/scratch/metadata-2023-10-01.feather")
    # Locally crawled profiles: fetched directly from their home server, so
    # they are marked as not moved and not suspended by construction.
    profile_accounts = read_accounts_file("data/profiles_local.feather")
    p = profile_accounts.with_columns(
        pl.col("url").map_elements(lambda x: urlparse(x).netloc.encode().decode('idna')).alias("host"),
        pl.col("username").alias("account"),
        pl.lit(False).alias("has_moved"),
        pl.lit(False).alias("suspended"),
    ).drop(
        ["data", "data_string"]
    )
    p.select(sorted(p.columns)).write_ipc("data/scratch/accounts_processed_profiles.feather")
    all_accounts = pl.scan_ipc(
        [
            "data/scratch/accounts.feather",
            "data/scratch/accounts_processed_profiles.feather"
        ]).collect()
    # Keep only accounts whose URL host matches the server they were queried
    # from (drops remote/mirrored copies), then dedupe on (account, server).
    all_accounts.filter(
        pl.col("host").eq(pl.col("server"))
    ).unique(["account", "server"]).write_ipc("data/scratch/all_accounts.feather")
# Run the full preprocessing pipeline when executed as a script.
if __name__ == "__main__":
    run_preprocess()