# Evaluation of the Recommender System

In [None]:
import os
import sys
from pathlib import Path
import polars as pl
sys.path.append('recc/')
from recc import build_suggestion
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Build the recommender model, pointing it at the ../data/ directory.
# `rm` is the model handle used by the evaluation code in this notebook.
rm = build_suggestion.ReccModel("../data/")

In [None]:
# Display the model's `tfidf` table (the "tags", "host" and "idf" columns
# used elsewhere in this notebook are read from it).
rm.tfidf

In [None]:
# Distinct tags and distinct hosts present in the model's tfidf table.
# Each result is a single-column frame ("tags" / "host"); they are used to
# restrict evaluation data to tags and hosts the model knows about.
tags = rm.tfidf.select("tags").unique()
hosts = rm.tfidf.select("host").unique()

In [None]:
# Evaluation set: one row per (host, acct) with the list of distinct model
# tags that account used in posts created between 2023-08-01 and 2023-08-14.
# Only accounts with 2..25 such tags are kept, largest tag sets first.
#
# `tags` / `hosts` are single-column frames, so the column itself is passed
# to `is_in`, which expects an expression, Series or plain collection.
#
# NOTE(review): if `created_at` is a datetime column, the upper bound is
# presumably midnight at the start of Aug 14 — confirm that is intended.
test_data = (
    pl.scan_ipc("../data/scratch/all_tag_posts.feather")
    .filter(
        (pl.col("created_at") >= pl.date(2023, 8, 1))
        & (pl.col("created_at") <= pl.date(2023, 8, 14))
    )
    # One row per (post, tag) so tags can be filtered individually.
    .explode("tags")
    .filter(pl.col("tags").is_in(tags.get_column("tags")))
    .filter(pl.col("host").is_in(hosts.get_column("host")))
    # Count each tag at most once per account.
    .unique(subset=["host", "acct", "tags"])
    .select(["host", "acct", "tags"])
    .group_by(["host", "acct"])
    .agg(pl.col("tags"))
    .with_columns(pl.col("tags").list.len().alias("n_tags"))
    .filter((pl.col("n_tags") >= 2) & (pl.col("n_tags") <= 25))
    .sort("n_tags", descending=True)
    .collect()
)
test_data

In [None]:
class ReccSVD50:
    """Rank servers for a set of tags using the factors from ``rm.svd(k=50)``.

    Attributes set by the constructor:
        rm: the model handle; ``tag_names`` and ``server_names`` are read
            from it when recommending.
        u, s, v: the three values returned by ``rm.svd(k=50, norm_axis=1)``.
        pos_m: ``v.T @ diag(s)``; its rows are indexed by tag position.
    """

    def __init__(self, rm):
        self.rm = rm
        self.u, self.s, self.v = rm.svd(k=50, norm_axis=1)
        # Scale the columns of v.T by s so tag rows can be compared
        # against the rows of u.
        self.pos_m = self.v.T @ np.diag(self.s)

    def recommend(self, tags: list[str]):
        """Return ``rm.server_names`` ordered from best to worst match.

        Servers are ordered by the sum, over the given tags, of the cosine
        similarity between each tag's row of ``pos_m`` and the server's row
        of ``u``.

        Raises:
            ValueError: if a tag is not present in ``rm.tag_names``.
        """
        # Convert the tag-name array once per call instead of once per tag.
        known_tags = self.rm.tag_names.tolist()
        tag_indices = [known_tags.index(tag) for tag in tags]
        cs = cosine_similarity(self.pos_m[tag_indices], self.u)
        # Negate so argsort yields highest total similarity first.
        cs_ind = np.argsort(-np.sum(cs, axis=0))
        return self.rm.server_names[cs_ind]

    def score(self, test_data):
        """Return, for each row, the 0-based rank of its true host.

        ``test_data`` must provide ``iter_rows(named=True)`` yielding rows
        with a "tags" list and a "host" value.

        Raises:
            IndexError: if a row's host does not appear in the
                recommendation returned for that row.
        """
        ranks = []
        for row in test_data.iter_rows(named=True):
            recc = self.recommend(row["tags"])
            # Position of the first (expected: only) match of the host.
            rank = np.where(recc == row["host"])[0][0]
            ranks.append(rank)
        return ranks
    
class ReccRandom(ReccSVD50):
    """Baseline recommender: servers in a uniformly random order.

    Reuses ``score`` from the parent class. The parent constructor is not
    called, so the SVD factors are never computed for this baseline.
    """

    def __init__(self, rm):
        # Only the model handle is needed (for ``server_names``).
        self.rm = rm

    def recommend(self, tags: list[str]):
        """Return a shuffled copy of ``rm.server_names``; ``tags`` is ignored."""
        server_pool = self.rm.server_names
        return np.random.permutation(server_pool)
    
class ReccAlternate(ReccSVD50):
    """Recommender that interleaves the overall ranking with per-tag rankings.

    For each rank position it takes, in turn, the server at that position
    in the summed-similarity ordering and then in each individual tag's
    ordering, skipping servers that have already been emitted.
    """

    def recommend(self, tags: list[str]):
        """Return ``rm.server_names`` in round-robin interleaved order.

        Raises:
            ValueError: if a tag is not present in ``rm.tag_names``.
        """
        # Convert the tag-name array once per call instead of once per tag.
        known_tags = self.rm.tag_names.tolist()
        tag_indices = [known_tags.index(tag) for tag in tags]
        cs = cosine_similarity(self.pos_m[tag_indices], self.u)

        # First ordering: by summed similarity; then one ordering per tag.
        orders = [np.argsort(-np.sum(cs, axis=0))]
        orders.extend(np.argsort(-per_tag) for per_tag in cs)

        # Row i holds the i-th choice of every ordering, so flattening row
        # by row visits candidates in round-robin order.
        interleaved = np.column_stack(orders).ravel()
        # np.unique reports the first occurrence of each server index;
        # sorting those positions restores first-seen order without the
        # quadratic "x not in list" scan.
        _, first_seen = np.unique(interleaved, return_index=True)
        output = interleaved[np.sort(first_seen)]
        return self.rm.server_names[output]
        
# Rank of each evaluation account's true host under the random baseline,
# the plain SVD recommender and the alternating recommender, in that order.
score_rand, scores_svd50, scores_alt = (
    recommender(rm).score(test_data)
    for recommender in (ReccRandom, ReccSVD50, ReccAlternate)
)

In [None]:
class ReccIDF(ReccSVD50):
    """SVD recommender that weights each tag's similarities by its idf.

    Attributes:
        idf: dict mapping tag -> idf value, read from the "tags" and "idf"
            columns of ``rm.tfidf``.
    """

    def __init__(self, rm):
        super().__init__(rm)
        idf_rows = rm.tfidf.select(["tags", "idf"]).unique().iter_rows(named=True)
        self.idf = {row["tags"]: row["idf"] for row in idf_rows}

    def recommend(self, tags: list[str]):
        """Return ``rm.server_names`` ordered by idf-weighted similarity.

        Raises:
            KeyError: if a tag has no entry in ``self.idf``.
            ValueError: if a tag is not present in ``rm.tag_names``.
        """
        idf_vec = np.array([self.idf[tag] for tag in tags])
        # Convert the tag-name array once per call instead of once per tag.
        known_tags = self.rm.tag_names.tolist()
        tag_indices = [known_tags.index(tag) for tag in tags]
        cs = cosine_similarity(self.pos_m[tag_indices], self.u)
        # Scale each tag's row of similarities by that tag's idf, then sum
        # over tags; negate so argsort yields the best server first.
        weighted = np.multiply(cs, idf_vec[:, np.newaxis])
        cs_ind = np.argsort(-np.sum(weighted, axis=0))
        return self.rm.server_names[cs_ind]

# Rank of each evaluation account's true host under the idf-weighted recommender.
scores_idf = ReccIDF(rm).score(test_data)

In [None]:
# Mean 0-based rank of the true host, idf-weighted recommender (lower is better).
np.mean(scores_idf)

In [None]:
# Mean 0-based rank of the true host, plain SVD recommender (lower is better).
np.mean(scores_svd50)

In [None]:
# Attach every recommender's per-account rank to the evaluation rows
# (one column per recommender) and save the result for later analysis.
tdf = test_data.with_columns(
    pl.Series("rand", score_rand),
    pl.Series("svd", scores_svd50),
    pl.Series("alt", scores_alt),
    pl.Series("idf", scores_idf),
)
tdf.write_ipc("../data/scratch/svd50_eval.feather")

## Moved Accounts

Can we predict which server an account moved to from the tags it posted on its original server?

In [None]:
# Evaluation set for moved accounts: join each moved account to the tagged
# posts made from its original (account, server), then collect the distinct
# model tags per destination account. The destination server is renamed to
# "host" so the recommenders' `score` method can compare against it.
moved_accounts = pl.read_ipc("../data/scratch/individual_moved_accounts.feather")
maccount_tag_posts = (
    # Join lazily so the posts file is not fully materialised before the
    # join and filters are applied.
    moved_accounts.lazy()
    .join(
        pl.scan_ipc("../data/scratch/all_tag_posts.feather").rename({
            "acct": "account",
            "host": "server"
        }),
        on=["account", "server"],
        how='inner'
    )
    # One row per (post, tag) so tags can be filtered individually.
    .explode("tags")
    # `tags` / `hosts` are single-column frames; pass the column itself to
    # `is_in`, which expects an expression, Series or plain collection.
    .filter(pl.col("tags").is_in(tags.get_column("tags")))
    .filter(pl.col("moved_server").is_in(hosts.get_column("host")))
    # Count each tag at most once per destination account.
    .unique(subset=["moved_server", "moved_acct", "tags"])
    .select(["moved_server", "moved_acct", "tags"])
    .group_by(["moved_server", "moved_acct"])
    .agg(pl.col("tags"))
    .with_columns(pl.col("tags").list.len().alias("n_tags"))
    .filter((pl.col("n_tags") >= 2) & (pl.col("n_tags") <= 1000))
    .sort("n_tags", descending=True)
    .rename({"moved_server": "host"})
    .collect()
)
maccount_tag_posts

In [None]:
# Rank of each moved account's destination server under the random baseline,
# plain SVD, alternating and idf-weighted recommenders, in that order.
score_rand_moved, scores_svd50_moved, scores_alt_moved, scores_idf_moved = (
    recommender(rm).score(maccount_tag_posts)
    for recommender in (ReccRandom, ReccSVD50, ReccAlternate, ReccIDF)
)

In [None]:
# Mean 0-based rank of the destination server under the random baseline.
np.mean(score_rand_moved)

In [None]:
# Mean 0-based rank of the destination server under the plain SVD recommender.
np.mean(scores_svd50_moved)

In [None]:
# Median 0-based rank of the destination server under the plain SVD recommender.
np.median(scores_svd50_moved)

In [None]:
# NumPy has no `value_counts` function, so tally the ranks with a Counter
# instead. The result is a list of (rank, count) pairs, most frequent first.
from collections import Counter

Counter(int(rank) for rank in scores_svd50_moved).most_common()

In [None]:
# Distinct ranks (ascending) and how many moved accounts received each one.
np.unique(scores_svd50_moved, return_counts=True)

In [None]:
# Number of moved accounts that were scored (one rank per account row).
len(scores_svd50_moved)

In [None]:
# Number of moved accounts whose destination server ranked in the top 10
# (0-based ranks 0..9) under the plain SVD recommender.
np.sum(np.array(scores_svd50_moved) < 10)