from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
from itertools import product, starmap, chain
import pandas as pd
from sklearn.metrics import silhouette_score, silhouette_samples
from pathlib import Path
from multiprocessing import Pool, cpu_count
import fire
from pyarrow.feather import write_feather

def test_select_hdbscan_clustering():
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                           "test_hdbscan_author30k",
    #                           min_cluster_sizes=[2],
    #                           min_samples=[1,2],
    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                           cluster_selection_methods=['eom','leaf'],
    #                           lsi_dimensions='all')
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_hdbscan"
    min_cluster_sizes = [2, 3, 4]
    min_samples = [1, 2, 3]
    cluster_selection_epsilons = [0, 0.1, 0.3, 0.5]
    cluster_selection_methods = ['eom']
    lsi_dimensions = 'all'
    gs = hdbscan_lsi_grid_sweep(inpath, lsi_dimensions, outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
    gs.run(20)
    gs.save("test_hdbscan/lsi_sweep.csv")
    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom')
    # job1.run()
    # print(job1.get_info())

# df = pd.read_csv("test_hdbscan/selection_data.csv")
# test_select_hdbscan_clustering()
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores, on='subreddit')
# fire.Fire(select_hdbscan_clustering)

class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 min_cluster_sizes,
                 min_samples,
                 cluster_selection_epsilons,
                 cluster_selection_methods):

        super().__init__(hdbscan_lsi_job,
                         _hdbscan_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         min_cluster_sizes,
                         min_samples,
                         cluster_selection_epsilons,
                         cluster_selection_methods)

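# hdbscan_lsi_grid_sweep above is the outer sweep: lsi_grid_sweep enumerates
# the LSI dimensions under inpath and runs one _hdbscan_lsi_grid_sweep
# (defined below) per dimension. A usage sketch with illustrative arguments
# (compare test_select_hdbscan_clustering above):
#
#   gs = hdbscan_lsi_grid_sweep(inpath, 'all', outpath, [2], [1], [0], ['eom'])
#   gs.run(20)
#   gs.save("out/sweep.csv")
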
class hdbscan_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):

        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)

    def namer(self,
              min_cluster_size,
              min_samples,
              cluster_selection_epsilon,
              cluster_selection_method):
        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"

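# namer encodes each parameter combination in the output file name,
# e.g. namer(2, 1, 0.1, 'eom') -> "mcs-2_ms-1_cse-0.1_csm-eom".
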
class _hdbscan_lsi_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 *args,
                 **kwargs):

        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)

    def namer(self, *args, **kwargs):
        s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s

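# _hdbscan_lsi_grid_sweep is the inner, single-dimension sweep. Its namer
# delegates to hdbscan_grid_sweep.namer, skipping the first positional
# argument (the lsi_dim prepended in __init__) and appending an
# "_lsi-<dim>" suffix.
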
@dataclass
class hdbscan_clustering_result(clustering_result):
    min_cluster_size:int
    min_samples:int
    cluster_selection_epsilon:float
    cluster_selection_method:str

@dataclass
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
    pass

class hdbscan_job(clustering_job):
    def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
        super().__init__(infile,
                         outpath,
                         name,
                         call=hdbscan_job._hdbscan_clustering,
                         min_cluster_size=min_cluster_size,
                         min_samples=min_samples,
                         cluster_selection_epsilon=cluster_selection_epsilon,
                         cluster_selection_method=cluster_selection_method)

        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.cluster_selection_method = cluster_selection_method
        # self.mat = 1 - self.mat

    # only ever called via the class attribute, never on an instance
    @staticmethod
    def _hdbscan_clustering(mat, *args, **kwargs):
        print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
        print(mat)
        clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                    core_dist_n_jobs=cpu_count(),
                                    *args,
                                    **kwargs)

        clustering = clusterer.fit(mat.astype('double'))

        return clustering

    def get_info(self):
        result = super().get_info()
        self.result = hdbscan_clustering_result(**result.__dict__,
                                                min_cluster_size=self.min_cluster_size,
                                                min_samples=self.min_samples,
                                                cluster_selection_epsilon=self.cluster_selection_epsilon,
                                                cluster_selection_method=self.cluster_selection_method)
        return self.result

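# A minimal single-job sketch (paths are illustrative; compare the commented
# job1 example in test_select_hdbscan_clustering above):
#
#   job = hdbscan_job("similarities.feather", "test_hdbscan", "test",
#                     min_cluster_size=2, min_samples=1,
#                     cluster_selection_epsilon=0, cluster_selection_method='eom')
#   job.run()
#   print(job.get_info())
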
class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(infile,
                         outpath,
                         name,
                         *args,
                         **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        partial_result = super().get_info()
        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
                                                    lsi_dimensions=self.lsi_dims)
        return self.result

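# hdbscan_lsi_job records the LSI dimensionality alongside the HDBSCAN
# parameters; set_lsi_dims is resolved from lsi_mixin via the MRO.
#
# The commented-out block below is the older, pre-grid_sweep implementation,
# kept for reference.
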
# def select_hdbscan_clustering(inpath,
#                               outpath,
#                               outfile=None,
#                               min_cluster_sizes=[2],
#                               min_samples=[1],
#                               cluster_selection_epsilons=[0],
#                               cluster_selection_methods=['eom'],
#                               lsi_dimensions='all'
#                               ):

#     inpath = Path(inpath)
#     outpath = Path(outpath)
#     outpath.mkdir(exist_ok=True, parents=True)

#     if lsi_dimensions is None:
#         lsi_paths = [inpath]
#     elif lsi_dimensions == 'all':
#         lsi_paths = list(inpath.glob("*"))
#     else:
#         lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]

#     if lsi_dimensions is not None:
#         lsi_nums = [p.stem for p in lsi_paths]
#     else:
#         lsi_nums = [None]

#     grid = list(product(lsi_nums,
#                         min_cluster_sizes,
#                         min_samples,
#                         cluster_selection_epsilons,
#                         cluster_selection_methods))

#     # fix the output file names
#     names = list(map(lambda t: '_'.join(map(str, t)), grid))

#     grid = [(inpath/(str(t[0])+'.feather'), outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]

#     with Pool(int(cpu_count()/4)) as pool:
#         mods = starmap(hdbscan_clustering, grid)

#     res = pd.DataFrame(mods)
#     if outfile is None:
#         outfile = outpath / "selection_data.csv"

#     res.to_csv(outfile)

# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
#     subreddits, mat = read_similarity_mat(similarities)
#     mat = sim_to_dist(mat)
#     clustering = _hdbscan_clustering(mat,
#                                      min_cluster_size=min_cluster_size,
#                                      min_samples=min_samples,
#                                      cluster_selection_epsilon=cluster_selection_epsilon,
#                                      cluster_selection_method=cluster_selection_method,
#                                      metric='precomputed',
#                                      core_dist_n_jobs=cpu_count()
#                                      )

#     cluster_data = process_clustering_result(clustering, subreddits)
#     isolates = clustering.labels_ == -1
#     scoremat = mat[~isolates][:, ~isolates]
#     score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
#     cluster_data.to_feather(output)
#     silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
#     silhouette_samp = pd.DataFrame({'subreddit': subreddits, 'score': silhouette_samp})
#     silsampout = output.parent / ("silhouette_samples" + output.name)
#     silhouette_samp.to_feather(silsampout)

#     result = hdbscan_clustering_result(outpath=output,
#                                        silhouette_samples=silsampout,
#                                        silhouette_score=score,
#                                        name=name,
#                                        min_cluster_size=min_cluster_size,
#                                        min_samples=min_samples,
#                                        cluster_selection_epsilon=cluster_selection_epsilon,
#                                        cluster_selection_method=cluster_selection_method,
#                                        lsi_dimensions=lsi_dim,
#                                        n_isolates=isolates.sum(),
#                                        n_clusters=len(set(clustering.labels_))
#                                        )

#     return result

# # for all runs we should try cluster_selection_epsilon = None
# # for terms we should try cluster_selection_epsilon around 0.56-0.66
# # for authors we should try cluster_selection_epsilon around 0.98-0.99
# def _hdbscan_clustering(mat, *args, **kwargs):
#     print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
#     print(mat)
#     clusterer = hdbscan.HDBSCAN(*args, **kwargs)
#     clustering = clusterer.fit(mat.astype('double'))
#     return clustering

def KNN_distances_plot(mat, outname, k=2):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:, -1]
    df = pd.DataFrame({'dist': d2})
    df = df.sort_values("dist", ascending=False)
    df['idx'] = np.arange(0, d2.shape[0]) + 1
    p = pn.qplot(x='idx', y='dist', data=df, geom='line') + pn.scales.scale_y_continuous(minor_breaks=np.arange(0, 50)/50,
                                                                                         breaks=np.arange(0, 10)/10)
    p.save(outname, width=16, height=10)

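# The sorted k-NN distance curve is a common heuristic for eyeballing a
# cluster_selection_epsilon (look for the "elbow"); make_KNN_plots below
# generates these curves for the term and author similarity matrices.
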
def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)

    KNN_distances_plot(mat, k=2, outname='terms_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors-tf_knn_dist2.png')

if __name__ == "__main__":
    fire.Fire({'grid_sweep': hdbscan_grid_sweep,
               'grid_sweep_lsi': hdbscan_lsi_grid_sweep,
               'cluster': hdbscan_job,
               'cluster_lsi': hdbscan_lsi_job})

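# fire exposes each entry above as a subcommand whose flags follow the
# corresponding constructor signature, e.g. (script name and paths illustrative):
#
#   python hdbscan_clustering.py cluster --infile=similarities.feather --outpath=out --name=test
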
# test_select_hdbscan_clustering()
# fire.Fire(select_hdbscan_clustering)