import pandas as pd
import plotnine as pn
from pathlib import Path
from clustering.fit_tsne import fit_tsne
from visualization.tsne_vis import build_visualization
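# selection_data.csv appears to summarize the hdbscan grid search: one row per run,
# with columns like n_isolates, n_clusters, silhouette_score, cluster_selection_method,
# min_cluster_size, outpath, and name (inferred from how they are used below)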
df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
# plot silhouette_score as a function of isolates
df = df.sort_values("silhouette_score")
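# n_isolates looks like it was written out as the printed repr of a one-row frame
# (e.g. "   n_isolates\n0    123"), so split on "\n0" to recover the integer count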
df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
p.save("isolates_x_score.png")
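# also plot n_clusters against n_isolates, colored by silhouette_score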
p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
p.save("clusters_x_isolates.png")
# the best result for hdbscan seems like this one: it has a decent number of clusters
# I think I prefer the 'eom' cluster selection method because larger clusters are less likely to suffer from omitted variables
# df is sorted ascending by silhouette_score, so take the highest-scoring candidate
best_eom = df[(df.n_isolates < 5000) & (df.silhouette_score > 0.4) & (df.cluster_selection_method == 'eom') & (df.min_cluster_size == 2)].iloc[-1]
# same filters, but for the 'leaf' cluster selection method
best_leaf = df[(df.n_isolates < 5000) & (df.silhouette_score > 0.4) & (df.cluster_selection_method == 'leaf') & (df.min_cluster_size == 2)].iloc[-1]

tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
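# only fit the t-SNE projection if it isn't cached yet; 850 presumably refers to the number of LSI dimensions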
if not tsne_data.exists():
    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
             tsne_data)
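# build the t-SNE visualizations for the best 'eom' and 'leaf' clusterings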
build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
                    "./authors-tf_lsi850_best_eom.html")

build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
                    "./authors-tf_lsi850_best_leaf.html")