if __name__ == '__main__':
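    # Assumed to be imported/defined earlier in this file: polars as pl,
    # numpy as np, scipy.sparse.linalg, TagData, and built_tfidf_matrix.
    # cosine_similarity and AffinityPropagation are used below, so import
    # them here if they are not already imported at module level.
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.cluster import AffinityPropagation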
    # First pass: restrict to servers listed in the joinmastodon.org directory.
    jm = pl.read_json("data/joinmastodon-2023-08-25.json")
    jm_servers = set(jm["domain"].unique().to_list())
    jm_td = TagData(jm_servers, 256, min_server_accounts=2)
    jm_tfidf = jm_td.bm(n_server_accounts=0, n_servers=2, n_accounts=10)

    # Weight matrix with one row per tag and one column per server; transpose
    # to servers x tags and L2-normalise each tag column (the small constant
    # avoids division by zero for empty columns).
    mat = built_tfidf_matrix(jm_tfidf, jm_td.tag_to_index, jm_td.host_to_index)
    m = (mat.T / (scipy.sparse.linalg.norm(mat.T, ord=2, axis=0) + 0.0001))
    server_similarlity = cosine_similarity(m.tocsr())

    # Number of servers that use each tag; the >= 0 threshold keeps every tag
    # in this pass (later passes raise it).
    tag_use_counts = np.sum(mat > 0, axis=1).T
    has_info = (tag_use_counts >= 0).tolist()[0]

    tag_names = np.array(list(jm_td.tag_to_index.keys()))[has_info]
    m_selected = m.tocsr()[:, has_info]
    tag_sm = cosine_similarity(m_selected.T)

    # Cluster tags by affinity propagation over the precomputed tag-tag
    # cosine similarity matrix.
    ap = AffinityPropagation(affinity="precomputed", random_state=0).fit(tag_sm)
    clusters = pl.DataFrame({
        "tag": tag_names,
        "cluster": ap.labels_,
        "servers": tag_use_counts[[has_info]].tolist()[0],
    })
    clusters.sort("servers", descending=True).unique("cluster").filter(pl.col("servers") >= 10)
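    # Next, project servers into a low-dimensional latent space, presumably
    # for downstream plotting: TruncatedSVD followed by a Normalizer is the
    # usual scikit-learn recipe for LSA-style embeddings, and the first three
    # components are written out as x/y/z coordinates.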
    from sklearn.decomposition import TruncatedSVD
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    lsa = make_pipeline(TruncatedSVD(n_components=10), Normalizer(copy=False))

    X_lsa = lsa.fit_transform(m_selected)

    # server_names follows the row order of m (one row per host), assumed to
    # be the key order of host_to_index, matching the convention used for
    # tag_names above.
    server_names = list(jm_td.host_to_index.keys())
    pl.DataFrame({
        "server": server_names,
        #"x": svd.components_[0],
        #"y": svd.components_[1],
        #"z": svd.components_[2]
        "x": X_lsa[:, 0],
        "y": X_lsa[:, 1],
        "z": X_lsa[:, 2],
    }).write_ipc("data/scratch/server_svd.feather")
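    # Optional quick look at the written file, e.g.:
    #   pl.read_ipc("data/scratch/server_svd.feather").head()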
    # Apply SVD to find the tags that provide the most information.
    X_lsa = lsa.fit_transform(m_selected.T)
    pl.DataFrame({
        "tag": tag_names,
        "x": X_lsa[:, 0],
        "y": X_lsa[:, 1],
        "z": X_lsa[:, 2],
        # variance of each tag's coordinates across the 10 latent components
        "variance": np.var(X_lsa, axis=1),
        "count": tag_use_counts.tolist()[0]
    }).write_ipc("data/scratch/tag_svd.feather")
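    # Optional quick look: tags with the highest variance load most unevenly
    # on the latent components, e.g.
    #   pl.read_ipc("data/scratch/tag_svd.feather").sort("variance", descending=True).head(20)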
    # import AgglomerativeClustering
    from sklearn.cluster import AgglomerativeClustering

    # Hierarchical clustering directly on the dense tag vectors.
    ac = AgglomerativeClustering(n_clusters=None, distance_threshold=0.7, metric="l2", linkage="average").fit(m.tocsr()[:, has_info].T.toarray())

    # Hierarchical clustering on the precomputed matrix. Note that with
    # metric="precomputed" scikit-learn expects distances, while tag_sm holds
    # cosine similarities (1 - tag_sm would be the usual conversion).
    ac = AgglomerativeClustering(n_clusters=None, distance_threshold=0.01, metric="precomputed", linkage="average").fit(tag_sm)
    clusters = pl.DataFrame({
        "tag": tag_names,
        "cluster": ac.labels_,
        "servers": tag_use_counts[[has_info]].tolist()[0],
    })
    clusters.sort("servers", descending=True)[0:10]  # .unique("cluster").filter(pl.col("servers") >= 10).sort("cluster")

    clusters.sort("servers", descending=True).unique("cluster").filter(pl.col("servers") >= 10).sort("servers")
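    # The block below takes the full SVD of the (dense) server x tag matrix,
    # m_selected = U @ diag(S) @ VT, where each row of VT holds the tag
    # loadings for one component. Keeping only the top singular values gives
    # the best low-rank approximation in the Frobenius-norm sense.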
    # Apply SVD to find the tags that provide the most information.
    has_info = (tag_use_counts >= 10).tolist()[0]
    tag_names = np.array(list(jm_td.tag_to_index.keys()))[has_info]
    m_selected = m.tocsr()[:, has_info]
    U, S, VT = np.linalg.svd(m_selected.toarray(), full_matrices=False)

    # Tags ordered by the magnitude of their summed loadings (axis=0 sums over
    # components for each tag).
    tag_names[np.argsort(-np.abs(np.sum(VT, axis=0)))]

    # Frobenius norm of the selected matrix.
    np.linalg.norm(m_selected.toarray())

    # Tags ordered by the variance of their loadings across components.
    tag_names[np.argsort(-np.abs(np.var(VT, axis=0)))]
    # Example tags (unused below), presumably kept for spot checks.
    mytags = ["eurovision2023", "lgbtq", "disney", "marvel"]

    # A_low_rank is the rank-5 approximation U_sub @ S_sub @ VT_sub of the
    # server x tag matrix.
    rank = 5
    U_sub = U[:, :rank]
    VT_sub = VT[:rank, :]
    S_sub = np.diag(S[:rank])
    A_low_rank = np.dot(np.dot(U_sub, S_sub), VT_sub)
    Vk = VT[:rank, :]

    # Frobenius norm of the reconstruction error of the rank-5 approximation.
    np.linalg.norm(m_selected.toarray() - A_low_rank)

    # Tags with the largest total loading magnitude across the top components.
    tag_names[np.argsort(np.abs(Vk).sum(axis=0))[::-1]][0:25]

    # Tags with the largest loading on the first component.
    tag_names[np.argsort(Vk[0])[::-1]][0:5]
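    # Optional cross-check: for a truncated SVD the Frobenius reconstruction
    # error equals the root sum of squares of the discarded singular values,
    # i.e. np.sqrt(np.sum(S[rank:] ** 2)) should match the norm above.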
    # Older exploratory code below, kept commented out inside a triple-quoted string.
    """
l = []
for i in range(np.shape(tag_sm)[0] - 1):
    l.append(
        pl.DataFrame({
            "Source": list(tag_names)[i],
            "Target": list(tag_names)[i+1:],
            "Similarity": tag_sm[i][i+1:]
        })
    )

similarity_df = pl.concat(l).filter(pl.col("Similarity") > 0.0)


l = []
for i in range(np.shape(server_similarlity)[0] - 1):
    #s_index = min(i, np.shape(baseline_similarlity)[0] - 1)
    l.append(
        pl.DataFrame({
            "Source": list(jm_td.host_to_index.keys())[i],
            "Target": list(jm_td.host_to_index.keys())[i+1:],
            "Similarity": server_similarlity[i][i+1:]
        })
    )
similarity_df = pl.concat(l).filter(pl.col("Similarity") > 0.0)

jm = pl.read_json("data/joinmastodon-2023-08-25.json")
server_samples = set(pl.scan_ipc("data/scratch/all_tag_posts.feather").select("host").unique().collect().sample(fraction = 1.0)["host"].to_list())
jm_servers = set(jm["domain"].unique().to_list())
jm_td = TagData(server_samples, 256, min_server_accounts=2)
jm_tfidf = jm_td.bm(n_server_accounts=0, n_servers=2, n_accounts=10)
mat = built_tfidf_matrix(jm_tfidf, jm_td.tag_to_index, jm_td.host_to_index)
m = (mat.T / (scipy.sparse.linalg.norm(mat.T, ord=2, axis=0) + 0.0001))
server_similarlity = cosine_similarity(m.tocsr())
#has_info = np.array((np.sum(mat, axis=1).T > 0).tolist()[0])
tag_use_counts = np.sum(mat > 0, axis=1).T
has_info = (tag_use_counts >= 3).tolist()[0]

tag_names = np.array(list(jm_td.tag_to_index.keys()))[has_info]
m_selected = m.tocsr()[:, has_info]

tag_sm = cosine_similarity(m_selected.T)

ap = AffinityPropagation(affinity="precomputed", random_state=0).fit(tag_sm)
clusters = pl.DataFrame({"tag": tag_names, "cluster": ap.labels_, "servers": tag_use_counts[[has_info]].tolist()[0]})

tag_index_included = (np.sum(tag_sm, axis=0) > 0)
included_tag_strings = np.array(list(jm_td.tag_to_index.keys()))[tag_index_included]
tag_sm_matrix = tag_sm[np.ix_(tag_index_included, tag_index_included)]
# import Affinity Prop
from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation(affinity="precomputed", random_state=0).fit(tag_sm_matrix)
clusters = pl.DataFrame({"tag": included_tag_strings, "cluster": ap.labels_})
# select a random element from each cluster
clusters.group_by("cluster").agg([pl.col("tag").shuffle().first().alias("tag")]).sort("cluster")["tag"].to_list()

clusters.group_by("cluster").agg([pl.col("tag").len().alias("count")]).sort("count", descending=True)

clusters.filter(pl.col("servers") >= 10)
"""