# Generates a table of pairwise server similarities based on shared tag usage

import polars as pl
import scipy.sparse.linalg
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from FediRecommender import TagData, built_tfidf_matrix

# Unique set of hosts appearing in the tag post data (fraction=1.0 samples every host)
server_samples = set(
    pl.scan_ipc("data/scratch/all_tag_posts.feather")
    .select("host").unique().collect()
    .sample(fraction=1.0)["host"].to_list()
)

# Create data object
td = TagData(servers=server_samples, n_tags=256, min_server_accounts=2, data_dir='data')

# Normalize data using BM
tfidf = td.bm(n_server_accounts=0, n_servers=2, n_accounts=10)#.filter(pl.col("accounts") / pl.col("D") > 0.0001)
baseline_host_to_index = td.host_to_index

# Build the sparse tag matrix; after the transpose each row corresponds to a host
full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index).T
# Scale each column by its L2 norm before computing cosine similarity
m = (full_mat / scipy.sparse.linalg.norm(full_mat, ord=2, axis=0))
# Pairwise cosine similarity between hosts
baseline_similarity = cosine_similarity(m)

# Collect the upper triangle of the similarity matrix as (Source, Target, Similarity) rows
l = []
hosts = list(td.host_to_index.keys())
for i in range(np.shape(baseline_similarity)[0] - 1):
    l.append(
        pl.DataFrame({
            "Source": hosts[i],
            "Target": hosts[i+1:],
            "Similarity": baseline_similarity[i][i+1:]
        })
    )

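# Note: an equivalent vectorized construction (shown only as a sketch, not part
# of the original script) could build the same pairs with numpy indexing:
#   iu, ju = np.triu_indices(baseline_similarity.shape[0], k=1)
#   pairs = pl.DataFrame({
#       "Source": np.array(hosts)[iu],
#       "Target": np.array(hosts)[ju],
#       "Similarity": baseline_similarity[iu, ju],
#   })
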
# Keep only pairs with positive similarity
similarity_df = pl.concat(l).filter(pl.col("Similarity") > 0.0)
similarity_df.write_ipc("data/scratch/server_similarity.feather")
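
# Example usage (a minimal sketch, not part of the original pipeline; assumes a
# recent polars API): look up the k most similar servers to a given host from
# the table written above. Since only the upper triangle was stored, both the
# Source and Target columns are checked.
def top_k_similar(host, k=10):
    sim = pl.read_ipc("data/scratch/server_similarity.feather")
    return (
        sim.filter((pl.col("Source") == host) | (pl.col("Target") == host))
        .with_columns(
            pl.when(pl.col("Source") == host)
            .then(pl.col("Target"))
            .otherwise(pl.col("Source"))
            .alias("Neighbor")
        )
        .sort("Similarity", descending=True)
        .head(k)
        .select(["Neighbor", "Similarity"])
    )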