{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation of the Recommender System\n",
    "\n",
    "For each account, rank the servers from the tags the account used and record the rank of the account's actual server (0 is best) under several recommender variants."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "import polars as pl\n",
    "sys.path.append('recc/')\n",
    "from recc import build_suggestion\n",
    "import numpy as np\n",
    "from sklearn.metrics.pairwise import cosine_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rm = build_suggestion.ReccModel(\"../data/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rm.tfidf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct tags and hosts present in the model's tf-idf table, as Series so that\n",
    "# `Expr.is_in` receives an explicit Series rather than a one-column DataFrame.\n",
    "tags = rm.tfidf.get_column(\"tags\").unique()\n",
    "hosts = rm.tfidf.get_column(\"host\").unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluation set: one row per (host, acct) with the distinct known tags used in posts\n",
    "# created between 2023-08-01 and 2023-08-14.\n",
    "test_data = (\n",
    "    pl.scan_ipc(\"../data/scratch/all_tag_posts.feather\")\n",
    "    .filter(pl.col(\"created_at\") >= pl.date(2023, 8, 1))\n",
    "    .filter(pl.col(\"created_at\") <= pl.date(2023, 8, 14))\n",
    "    .explode(\"tags\")\n",
    "    .filter(pl.col(\"tags\").is_in(tags))\n",
    "    .filter(pl.col(\"host\").is_in(hosts))\n",
    "    .unique([\"host\", \"acct\", \"tags\"])\n",
    "    .select([\"host\", \"acct\", \"tags\"])\n",
    "    .group_by([\"host\", \"acct\"])\n",
    "    .agg([pl.col(\"tags\")])\n",
    "    .with_columns(pl.col(\"tags\").list.len().alias(\"n_tags\"))\n",
    "    .collect()\n",
    "    # Keep accounts with between 2 and 25 distinct tags.\n",
    "    .filter(pl.col(\"n_tags\") >= 2)\n",
    "    .filter(pl.col(\"n_tags\") <= 25)\n",
    "    .sort(\"n_tags\", descending=True)\n",
    ")\n",
    "test_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ReccSVD50:\n",
    "    \"\"\"Rank servers by summed cosine similarity to the query tags, using `rm.svd(k=50)`.\"\"\"\n",
    "\n",
    "    def __init__(self, rm):\n",
    "        \"\"\"Factorise the model once and cache what `recommend` needs.\n",
    "\n",
    "        Reads `rm.svd`, `rm.tag_names` and (later) `rm.server_names`.\n",
    "        \"\"\"\n",
    "        self.rm = rm\n",
    "        self.u, self.s, self.v = rm.svd(k=50, norm_axis=1)\n",
    "        # `v.T` with column j scaled by `s[j]`; its rows are indexed by tag below.\n",
    "        self.pos_m = self.v.T @ np.diag(self.s)\n",
    "        # Tag name -> row index, built once instead of converting and scanning the\n",
    "        # whole tag array for every lookup. `setdefault` keeps the first occurrence,\n",
    "        # which is what `list.index` returned.\n",
    "        self._tag_index = {}\n",
    "        for i, name in enumerate(rm.tag_names.tolist()):\n",
    "            self._tag_index.setdefault(name, i)\n",
    "\n",
    "    def _similarities(self, tags: list[str]):\n",
    "        \"\"\"Cosine similarity of each query tag (rows) to every row of `u` (columns).\n",
    "\n",
    "        Raises `KeyError` if a tag is not in `rm.tag_names`.\n",
    "        \"\"\"\n",
    "        tag_indices = [self._tag_index[tag] for tag in tags]\n",
    "        return cosine_similarity(self.pos_m[tag_indices], self.u)\n",
    "\n",
    "    def recommend(self, tags: list[str]):\n",
    "        \"\"\"Return `rm.server_names` ordered from highest to lowest summed similarity.\"\"\"\n",
    "        cs = self._similarities(tags)\n",
    "        cs_ind = np.argsort(-np.sum(cs, axis=0))\n",
    "        return self.rm.server_names[cs_ind]\n",
    "\n",
    "    def score(self, test_data):\n",
    "        \"\"\"Return the 0-based rank of each row's `host` in its recommendation list.\n",
    "\n",
    "        `test_data` must provide `tags` (list of tag names) and `host` columns.\n",
    "        Raises `IndexError` if a host does not appear in the recommendations.\n",
    "        \"\"\"\n",
    "        ranks = []\n",
    "        for row in test_data.iter_rows(named=True):\n",
    "            recc = self.recommend(row[\"tags\"])\n",
    "            ranks.append(np.where(recc == row[\"host\"])[0][0])\n",
    "        return ranks\n",
    "\n",
    "\n",
    "class ReccRandom(ReccSVD50):\n",
    "    \"\"\"Baseline that returns the servers in a random order, ignoring the tags.\"\"\"\n",
    "\n",
    "    def __init__(self, rm, seed=0):\n",
    "        # Does not call `ReccSVD50.__init__`, so no SVD is computed for the baseline.\n",
    "        self.rm = rm\n",
    "        # Seeded generator so the baseline ranks are reproducible across runs.\n",
    "        self.rng = np.random.default_rng(seed)\n",
    "\n",
    "    def recommend(self, tags: list[str]):\n",
    "        \"\"\"Return a random permutation of `rm.server_names`.\"\"\"\n",
    "        return self.rng.permutation(self.rm.server_names)\n",
    "\n",
    "\n",
    "class ReccAlternate(ReccSVD50):\n",
    "    \"\"\"Interleave the summed-similarity ranking with one ranking per individual tag.\"\"\"\n",
    "\n",
    "    def recommend(self, tags: list[str]):\n",
    "        \"\"\"Return servers by taking position i from each ranking in turn, skipping repeats.\"\"\"\n",
    "        cs = self._similarities(tags)\n",
    "        # The combined ranking first, then one ranking per query tag.\n",
    "        orders = [np.argsort(-np.sum(cs, axis=0))]\n",
    "        orders.extend(np.argsort(-cs[i, :]) for i in range(len(tags)))\n",
    "        output = []\n",
    "        seen = set()  # constant-time membership instead of scanning `output`\n",
    "        for i in range(len(orders[0])):\n",
    "            for order in orders:\n",
    "                idx = int(order[i])\n",
    "                if idx not in seen:\n",
    "                    seen.add(idx)\n",
    "                    output.append(idx)\n",
    "        return self.rm.server_names[output]\n",
    "\n",
    "\n",
    "score_rand = ReccRandom(rm).score(test_data)\n",
    "scores_svd50 = ReccSVD50(rm).score(test_data)\n",
    "scores_alt = ReccAlternate(rm).score(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ReccIDF(ReccSVD50):\n",
    "    \"\"\"Like `ReccSVD50`, but each tag's similarity row is weighted by the tag's `idf`.\"\"\"\n",
    "\n",
    "    def __init__(self, rm):\n",
    "        super().__init__(rm)\n",
    "        # Tag -> idf, taken from the distinct (tags, idf) pairs of the tf-idf table.\n",
    "        self.idf = {\n",
    "            row[\"tags\"]: row[\"idf\"]\n",
    "            for row in rm.tfidf.select([\"tags\", \"idf\"]).unique().iter_rows(named=True)\n",
    "        }\n",
    "\n",
    "    def recommend(self, tags: list[str]):\n",
    "        \"\"\"Return `rm.server_names` ordered by idf-weighted summed similarity.\"\"\"\n",
    "        idf_vec = np.array([self.idf[tag] for tag in tags])\n",
    "        cs = self._similarities(tags)\n",
    "        # Broadcast one weight per tag across that tag's row of similarities.\n",
    "        cs_ind = np.argsort(-np.sum(np.multiply(cs, idf_vec[:, np.newaxis]), axis=0))\n",
    "        return self.rm.server_names[cs_ind]\n",
    "\n",
    "\n",
    "scores_idf = ReccIDF(rm).score(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(scores_idf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(scores_svd50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach every recommender's ranks to the evaluation rows and save them.\n",
    "tdf = test_data.with_columns(\n",
    "    rand=pl.Series(score_rand),\n",
    "    svd=pl.Series(scores_svd50),\n",
    "    alt=pl.Series(scores_alt),\n",
    "    idf=pl.Series(scores_idf),\n",
    ")\n",
    "tdf.write_ipc(\"../data/scratch/svd50_eval.feather\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Moved Accounts\n",
    "\n",
    "Can we predict which server a moved account moved to?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "moved_accounts = pl.read_ipc(\"../data/scratch/individual_moved_accounts.feather\")\n",
    "# One row per (moved_server, moved_acct) with the distinct known tags from the posts\n",
    "# that join on the account's `account`/`server` columns.\n",
    "maccount_tag_posts = (\n",
    "    moved_accounts.join(\n",
    "        pl.scan_ipc(\"../data/scratch/all_tag_posts.feather\")\n",
    "        .rename({\"acct\": \"account\", \"host\": \"server\"})\n",
    "        .collect(),\n",
    "        on=[\"account\", \"server\"],\n",
    "        how=\"inner\",\n",
    "    )\n",
    "    .explode(\"tags\")\n",
    "    .filter(pl.col(\"tags\").is_in(tags))\n",
    "    .filter(pl.col(\"moved_server\").is_in(hosts))\n",
    "    .unique([\"moved_server\", \"moved_acct\", \"tags\"])\n",
    "    .select([\"moved_server\", \"moved_acct\", \"tags\"])\n",
    "    .group_by([\"moved_server\", \"moved_acct\"])\n",
    "    .agg([pl.col(\"tags\")])\n",
    "    .with_columns(pl.col(\"tags\").list.len().alias(\"n_tags\"))\n",
    "    .filter(pl.col(\"n_tags\") >= 2)\n",
    "    .filter(pl.col(\"n_tags\") <= 1000)\n",
    "    .sort(\"n_tags\", descending=True)\n",
    "    # `score` reads the target server from a column named `host`.\n",
    "    .rename({\"moved_server\": \"host\"})\n",
    ")\n",
    "maccount_tag_posts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_rand_moved = ReccRandom(rm).score(maccount_tag_posts)\n",
    "scores_svd50_moved = ReccSVD50(rm).score(maccount_tag_posts)\n",
    "scores_alt_moved = ReccAlternate(rm).score(maccount_tag_posts)\n",
    "scores_idf_moved = ReccIDF(rm).score(maccount_tag_posts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(score_rand_moved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(scores_svd50_moved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.median(scores_svd50_moved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NumPy has no `value_counts`; use the polars Series method (most frequent ranks first).\n",
    "pl.Series(\"rank\", scores_svd50_moved).value_counts(sort=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.unique(scores_svd50_moved, return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(scores_svd50_moved)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of moved accounts whose true server is ranked in the top 10.\n",
    "np.sum(np.array(scores_svd50_moved) < 10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "renv-python-3.9",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.22"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}