
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Evaluation of the Recommender System"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"from pathlib import Path\n",
"import polars as pl\n",
"sys.path.append('recc/')\n",
"from recc import build_suggestion\n",
"import numpy as np\n",
"from sklearn.metrics.pairwise import cosine_similarity"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rm = build_suggestion.ReccModel(\"../data/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"rm.tfidf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tags = rm.tfidf.select(pl.col(\"tags\")).unique()\n",
"hosts = rm.tfidf.select(pl.col(\"host\")).unique()"
]
},
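{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, the number of distinct tags and hosts in the TF-IDF table; these are the sets used to filter the test data below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Number of distinct tags and hosts available to the recommender.\n",
"len(tags), len(hosts)"
]
},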
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_data = pl.scan_ipc(\"../data/scratch/all_tag_posts.feather\").filter(\n",
" pl.col(\"created_at\") >= pl.date(2023, 8, 1)\n",
").filter(\n",
" pl.col(\"created_at\") <= pl.date(2023, 8, 14)\n",
").explode(\"tags\").filter(\n",
" pl.col(\"tags\").is_in(tags)\n",
").filter(\n",
" pl.col(\"host\").is_in(hosts)\n",
").unique(\n",
" [\"host\", \"acct\", \"tags\"]\n",
").select(\n",
" [\"host\", \"acct\", \"tags\"]\n",
").group_by(\n",
" [\"host\", \"acct\"]\n",
").agg([\n",
" pl.col(\"tags\")#.list()\n",
"]).with_columns(\n",
" pl.col(\"tags\").list.len().alias(\"n_tags\")\n",
").collect().filter(pl.col(\"n_tags\") >= 2).filter(\n",
" pl.col(\"n_tags\") <= 25\n",
").sort(\"n_tags\", descending=True)\n",
"test_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ReccSVD50:\n",
" def __init__(self, rm):\n",
" self.rm = rm\n",
" self.u, self.s, self.v = rm.svd(k=50, norm_axis=1)\n",
" self.pos_m = self.v.T @ np.diag(self.s)\n",
"\n",
" def recommend(self, tags: list[str]):\n",
" tag_indices = list(map(lambda x: self.rm.tag_names.tolist().index(x), tags))\n",
" cs = cosine_similarity(self.pos_m[tag_indices], self.u)\n",
" cs_ind = np.argsort(-np.sum(cs, axis=0))\n",
" return self.rm.server_names[cs_ind]\n",
" \n",
" def score(self, test_data):\n",
" ranks = []\n",
" for row in test_data.iter_rows(named=True):\n",
" tags = row[\"tags\"]\n",
" recc = self.recommend(tags)\n",
" rank = np.where(recc == row[\"host\"])[0][0]\n",
" ranks.append(rank)\n",
" return ranks\n",
" \n",
"class ReccRandom(ReccSVD50):\n",
" def __init__(self, rm):\n",
" self.rm = rm\n",
"\n",
" def recommend(self, tags: list[str]):\n",
" return np.random.permutation(self.rm.server_names)\n",
" \n",
"class ReccAlternate(ReccSVD50):\n",
" def recommend(self, tags: list[str]):\n",
" tag_indices = list(map(lambda x: self.rm.tag_names.tolist().index(x), tags))\n",
" cs = cosine_similarity(self.pos_m[tag_indices], self.u)\n",
" cs_ind = np.argsort(-np.sum(cs, axis=0))\n",
" orders = [cs_ind]\n",
" for i in range(len(tags)):\n",
" orders.append(np.argsort(-cs[i, :]))\n",
" #print(np.argsort(-np.sum(cs[i, :], axis=1)))\n",
" output = []\n",
" for i in range(len(cs_ind)):\n",
" for j in range(len(orders)):\n",
" if orders[j][i] not in output:\n",
" output.append(orders[j][i])\n",
" return self.rm.server_names[output]\n",
" \n",
"score_rand = ReccRandom(rm).score(test_data)\n",
"scores_svd50 = ReccSVD50(rm).score(test_data)\n",
"scores_alt = ReccAlternate(rm).score(test_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ReccIDF(ReccSVD50):\n",
" def __init__(self, rm):\n",
" super().__init__(rm)\n",
" self.idf = {}\n",
" for row in rm.tfidf.select([\"tags\", \"idf\"]).unique().iter_rows(named=True):\n",
" self.idf[row[\"tags\"]] = row[\"idf\"]\n",
"\n",
" def recommend(self, tags: list[str]):\n",
" idf_vec = np.array([self.idf[tag] for tag in tags])\n",
" tag_indices = list(map(lambda x: self.rm.tag_names.tolist().index(x), tags))\n",
" cs = cosine_similarity(self.pos_m[tag_indices], self.u)\n",
" cs_ind = np.argsort(-np.sum(np.multiply(cs, idf_vec[:, np.newaxis]), axis=0))\n",
" return self.rm.server_names[cs_ind]\n",
"\n",
"scores_idf = ReccIDF(rm).score(test_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.mean(scores_idf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.mean(scores_svd50)"
]
},
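{
"cell_type": "markdown",
"metadata": {},
"source": [
"A side-by-side of the four recommenders on the test set, as a minimal sketch using the rank lists computed above. Since the ranks from `score()` are 0-indexed positions, the mean reciprocal rank uses `1 / (rank + 1)`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mean rank and mean reciprocal rank (MRR) for each recommender on the test set.\n",
"pl.DataFrame({\n",
"    \"model\": [\"random\", \"svd50\", \"alternate\", \"idf\"],\n",
"    \"mean_rank\": [\n",
"        np.mean(score_rand),\n",
"        np.mean(scores_svd50),\n",
"        np.mean(scores_alt),\n",
"        np.mean(scores_idf),\n",
"    ],\n",
"    \"mrr\": [\n",
"        np.mean(1 / (np.array(score_rand) + 1)),\n",
"        np.mean(1 / (np.array(scores_svd50) + 1)),\n",
"        np.mean(1 / (np.array(scores_alt) + 1)),\n",
"        np.mean(1 / (np.array(scores_idf) + 1)),\n",
"    ],\n",
"})"
]
},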
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tdf = test_data.with_columns(\n",
" rand = pl.Series(score_rand),\n",
" svd = pl.Series(scores_svd50),\n",
" alt = pl.Series(scores_alt),\n",
" idf = pl.Series(scores_idf)\n",
")\n",
"tdf.write_ipc(\"../data/scratch/svd50_eval.feather\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Moved Accounts\n",
"\n",
"Can we predict moved accounts?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"moved_accounts = pl.read_ipc(\"../data/scratch/individual_moved_accounts.feather\")\n",
"maccount_tag_posts = moved_accounts.join(\n",
" pl.scan_ipc(\"../data/scratch/all_tag_posts.feather\").rename({\n",
" \"acct\": \"account\",\n",
" \"host\": \"server\"\n",
" }).collect(),\n",
" on=[\"account\", \"server\"],\n",
" how='inner'\n",
").explode(\"tags\").filter(\n",
" pl.col(\"tags\").is_in(tags)\n",
").filter(\n",
" pl.col(\"moved_server\").is_in(hosts)\n",
").unique(\n",
" [\"moved_server\", \"moved_acct\", \"tags\"]\n",
").select(\n",
" [\"moved_server\", \"moved_acct\", \"tags\"]\n",
").group_by(\n",
" [\"moved_server\", \"moved_acct\"]\n",
").agg([\n",
" pl.col(\"tags\")#.list()\n",
"]).with_columns(\n",
" pl.col(\"tags\").list.len().alias(\"n_tags\")\n",
").filter(pl.col(\"n_tags\") >= 2).filter(\n",
" pl.col(\"n_tags\") <= 1000\n",
").sort(\"n_tags\", descending=True).rename({\n",
" \"moved_server\": \"host\"\n",
"})\n",
"maccount_tag_posts"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"score_rand_moved = ReccRandom(rm).score(maccount_tag_posts)\n",
"scores_svd50_moved = ReccSVD50(rm).score(maccount_tag_posts)\n",
"scores_alt_moved = ReccAlternate(rm).score(maccount_tag_posts)\n",
"scores_idf_moved = ReccIDF(rm).score(maccount_tag_posts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.mean(score_rand_moved)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.mean(scores_svd50_moved)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.median(scores_svd50_moved)"
]
},
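{
"cell_type": "markdown",
"metadata": {},
"source": [
"The alternate and IDF-weighted ranks for moved accounts are computed above but not inspected; here is a quick summary over the lists already in memory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mean and median rank for the remaining moved-account recommenders.\n",
"{\n",
"    \"alt\": (np.mean(scores_alt_moved), np.median(scores_alt_moved)),\n",
"    \"idf\": (np.mean(scores_idf_moved), np.median(scores_idf_moved)),\n",
"}"
]
},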
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.value_counts(scores_svd50_moved)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.unique(scores_svd50_moved, return_counts=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(scores_svd50_moved)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.sum(np.array(scores_svd50_moved) < 10)"
]
}
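,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same top-10 check expressed as a fraction of the moved accounts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Share of moved accounts whose destination server ranks in the first 10 suggestions.\n",
"np.mean(np.array(scores_svd50_moved) < 10)"
]
}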
],
"metadata": {
"kernelspec": {
"display_name": "renv-python-3.9",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.22"
}
},
"nbformat": 4,
"nbformat_minor": 2
}