# Imports; TagData and built_tfidf_matrix are assumed to come from this
# project's own tag-analysis module.
import numpy as np
import polars as pl
import scipy.sparse.linalg
from sklearn.metrics.pairwise import cosine_similarity

# Servers listed on joinmastodon.org (snapshot from 2023-08-25)
jm = pl.read_json("data/joinmastodon-2023-08-25.json")
jm_servers = set(jm["domain"].unique().to_list())

# Tag weights restricted to the joinmastodon servers
jm_td = TagData(jm_servers, 256, min_server_accounts=2)
jm_tfidf = jm_td.bm(n_server_accounts=0, n_servers=2, n_accounts=10)  # .filter(pl.col("accounts") / pl.col("D") > 0.0001)

# Sparse tag × server matrix (rows are tags, columns are servers)
mat = built_tfidf_matrix(jm_tfidf, jm_td.tag_to_index, jm_td.host_to_index)
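
# Sanity check (sketch): the code below treats mat as tags × servers, so its
# shape should match the two index maps built by TagData.
print(mat.shape, len(jm_td.tag_to_index), len(jm_td.host_to_index))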

# L2-normalise each tag column of the transposed (server × tag) matrix, then
# take cosine similarity between server rows
m = (mat.T / scipy.sparse.linalg.norm(mat.T, ord=2, axis=0))
server_similarity = cosine_similarity(m)

# Export server similarity

# Tag-tag cosine similarity over the tag × server matrix
tag_sm = cosine_similarity(mat.tocsr())

# Drop tags whose similarity row is all zero, then restrict the matrix to the rest
tag_index_included = (np.sum(tag_sm, axis=0) > 0)
included_tag_strings = np.array(list(jm_td.tag_to_index.keys()))[tag_index_included]
tag_sm_matrix = tag_sm[np.ix_(tag_index_included, tag_index_included)]
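
# Sketch: how many tags survive the all-zero filter
print(int(tag_index_included.sum()), "of", len(jm_td.tag_to_index), "tags kept")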

# Cluster tags with affinity propagation on the precomputed similarity matrix
from sklearn.cluster import AffinityPropagation

ap = AffinityPropagation(affinity="precomputed", random_state=0).fit(tag_sm_matrix)
clusters = pl.DataFrame({"tag": included_tag_strings, "cluster": ap.labels_})
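
# Sketch: inspect cluster sizes; column names match the frame built above
cluster_sizes = clusters.group_by("cluster").agg(pl.col("tag").count().alias("n_tags")).sort("n_tags", descending=True)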

# Select a random representative tag from each cluster
clusters.group_by("cluster").agg([pl.col("tag").shuffle().first().alias("tag")]).sort("cluster")["tag"].to_list()

# Servers most similar to a hand-picked set of example topics: build a one-hot
# vector over the included tags and compare it against each server's tag profile
example_topics = ["tech", "linux", "hacking", "gamedev"]
example_indices = [s in example_topics for s in included_tag_strings]
similar_servers = cosine_similarity(np.array(example_indices).reshape(1, -1), mat[np.ix_(tag_index_included)].T)
np.array(list(jm_td.host_to_index.keys()))[np.argsort(-similar_servers[0])][0:10]
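
# Sketch: the same topic → server lookup wrapped as a reusable helper. It reuses
# mat, tag_index_included, included_tag_strings, and jm_td from above; the
# function name and top_n parameter are illustrative, not part of the pipeline.
def servers_for_topics(topics, top_n=10):
    indicator = np.array([s in topics for s in included_tag_strings]).reshape(1, -1)
    sims = cosine_similarity(indicator, mat[np.ix_(tag_index_included)].T)
    hosts = np.array(list(jm_td.host_to_index.keys()))
    return hosts[np.argsort(-sims[0])][0:top_n]

# servers_for_topics(["tech", "linux", "hacking", "gamedev"])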

# Earlier look-ups, kept for reference:
# np.array(list(jm_td.host_to_index.keys()))[np.argsort(-server_similarity[jm_td.host_to_index["historians.social"]])][0:10]
# np.array(list(jm_td.host_to_index.keys()))[np.where(np.sum(mat, axis=0) < 0.01)[1]]

# Baseline: repeat the pipeline over every server present in the tag-post data
# (sample(fraction=1.0) keeps all hosts, in shuffled order)
server_samples = set(pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction=1.0)["host"].to_list())

td = TagData(server_samples, 256, min_server_accounts=2)
tfidf = td.bm(n_server_accounts=0, n_servers=2, n_accounts=10)  # .filter(pl.col("accounts") / pl.col("D") > 0.0001)
baseline_host_to_index = td.host_to_index

# Transposed so that rows are servers and columns are tags
full_mat = built_tfidf_matrix(tfidf, td.tag_to_index, td.host_to_index).T

# m = (full_mat.T / scipy.sparse.linalg.norm(full_mat.T, ord=2, axis=0)).T

# L2-normalise each tag column, then cosine similarity between server rows
m = (full_mat / scipy.sparse.linalg.norm(full_mat, ord=2, axis=0))
baseline_similarity = cosine_similarity(m)
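
# Sketch: the similarity matrix should be square, one row/column per server in td
assert baseline_similarity.shape == (len(td.host_to_index), len(td.host_to_index))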

# Long-format table of pairwise server similarities (upper triangle only)
hosts = list(td.host_to_index.keys())
l = []
for i in range(np.shape(baseline_similarity)[0] - 1):
    l.append(
        pl.DataFrame({
            "Source": hosts[i],
            "Target": hosts[i + 1:],
            "Similarity": baseline_similarity[i][i + 1:]
        })
    )

similarity_df = pl.concat(l).filter(pl.col("Similarity") > 0.0)
similarity_df.write_ipc("data/scratch/server_similarity.feather")
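
# Sketch: the exported table can be read back eagerly with read_ipc
# (scan_ipc, used above, is the lazy counterpart); the variable name is illustrative
similarity_df_check = pl.read_ipc("data/scratch/server_similarity.feather")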

# Spot-check: servers most similar to hci.social, and its highest-weighted tags
server = "hci.social"
similarity_df.filter((pl.col("Source") == server) | (pl.col("Target") == server)).sort("Similarity", descending=True)[0:10]
tfidf.filter(pl.col("host") == server)[0:10]

# Rows for a single tag, e.g. the conference tag aoir2023
tfidf.filter(pl.col("tags") == "aoir2023")

# Tag-tag similarity over the full data: normalise each server column of the
# (tag × server) matrix, then cosine similarity between tag rows
m = (full_mat.T / scipy.sparse.linalg.norm(full_mat.T, ord=2, axis=0))
tag_similarity = cosine_similarity(m)

# Long-format table of pairwise tag similarities (upper triangle only)
tags = list(td.tag_to_index.keys())
l = []
for i in range(np.shape(tag_similarity)[0] - 1):
    l.append(
        pl.DataFrame({
            "Source": tags[i],
            "Target": tags[i + 1:],
            "Similarity": tag_similarity[i][i + 1:]
        })
    )

tag_similarity_df = pl.concat(l).filter(pl.col("Similarity") > 0.0)
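
# Sketch: example lookup of the tags most similar to one tag, mirroring the
# hci.social query above; "linux" is just an illustrative tag
tag = "linux"
tag_similarity_df.filter((pl.col("Source") == tag) | (pl.col("Target") == tag)).sort("Similarity", descending=True)[0:10]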