# cdsc_reddit/clustering/hdbscan_clustering.py
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
from itertools import product, starmap, chain
import pandas as pd
from sklearn.metrics import silhouette_score, silhouette_samples
from pathlib import Path
from multiprocessing import Pool, cpu_count
import fire
from pyarrow.feather import write_feather
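
# This module wraps HDBSCAN clustering of subreddit similarity matrices in the
# generic clustering_job / grid_sweep machinery from clustering_base, with *_lsi
# variants that additionally sweep over LSI dimensionalities. The expected
# workflow (see the test below) is: build a sweep object, run() it (the 20 in
# the test presumably being the number of worker processes), then save() the
# table of per-run results.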
def test_select_hdbscan_clustering():
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                           "test_hdbscan_author30k",
    #                           min_cluster_sizes=[2],
    #                           min_samples=[1,2],
    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                           cluster_selection_methods=['eom','leaf'],
    #                           lsi_dimensions='all')
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_hdbscan"
    min_cluster_sizes = [2,3,4]
    min_samples = [1,2,3]
    cluster_selection_epsilons = [0,0.1,0.3,0.5]
    cluster_selection_methods = ['eom']
    lsi_dimensions = 'all'
    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
    gs.run(20)
    gs.save("test_hdbscan/lsi_sweep.csv")
    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom')
    # job1.run()
    # print(job1.get_info())

# df = pd.read_csv("test_hdbscan/selection_data.csv")
# test_select_hdbscan_clustering()
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores,on='subreddit')
# fire.Fire(select_hdbscan_clustering)
class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 min_cluster_sizes,
                 min_samples,
                 cluster_selection_epsilons,
                 cluster_selection_methods
                 ):

        super().__init__(hdbscan_lsi_job,
                         _hdbscan_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         min_cluster_sizes,
                         min_samples,
                         cluster_selection_epsilons,
                         cluster_selection_methods)
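
# The parent lsi_grid_sweep (defined in clustering_base) is expected to dispatch
# one _hdbscan_lsi_grid_sweep per LSI dimensionality found under inpath (or per
# entry of lsi_dims), and each of those then enumerates the HDBSCAN parameter
# grid below.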
class hdbscan_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):

        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)

    def namer(self,
              min_cluster_size,
              min_samples,
              cluster_selection_epsilon,
              cluster_selection_method):
        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
class _hdbscan_lsi_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 *args,
                 **kwargs):
        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)

    def namer(self, *args, **kwargs):
        s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s
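
# namer here receives the LSI dimensionality as its first positional argument
# (self.lsi_dim is prepended to every job's parameters above), so it drops
# args[0], reuses hdbscan_grid_sweep.namer for the HDBSCAN parameters, and
# appends an "_lsi-<dim>" suffix to the run name.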
@dataclass
class hdbscan_clustering_result(clustering_result):
    min_cluster_size:int
    min_samples:int
    cluster_selection_epsilon:float
    cluster_selection_method:str

@dataclass
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
    pass
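
# These dataclasses are the per-run result rows: hdbscan_job.get_info() fills in
# hdbscan_clustering_result, the LSI variant adds lsi_dimensions, and the grid
# sweep gathers one row per job into the table written by gs.save(...) above.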
class hdbscan_job(clustering_job):
    def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
        super().__init__(infile,
                         outpath,
                         name,
                         call=hdbscan_job._hdbscan_clustering,
                         min_cluster_size=min_cluster_size,
                         min_samples=min_samples,
                         cluster_selection_epsilon=cluster_selection_epsilon,
                         cluster_selection_method=cluster_selection_method
                         )

        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.cluster_selection_method = cluster_selection_method
        # self.mat = 1 - self.mat

    def _hdbscan_clustering(mat, *args, **kwargs):
        print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
        print(mat)
        clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                    core_dist_n_jobs=cpu_count(),
                                    *args,
                                    **kwargs,
                                    )

        clustering = clusterer.fit(mat.astype('double'))

        return(clustering)

    def get_info(self):
        result = super().get_info()
        self.result = hdbscan_clustering_result(**result.__dict__,
                                                min_cluster_size=self.min_cluster_size,
                                                min_samples=self.min_samples,
                                                cluster_selection_epsilon=self.cluster_selection_epsilon,
                                                cluster_selection_method=self.cluster_selection_method)
        return self.result
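
# A minimal single-job sketch (path and name hypothetical), mirroring the
# commented-out job1 example in the test above. Note that _hdbscan_clustering is
# passed as a plain function via call=..., not used as a bound method.
#
#   job = hdbscan_job("similarities/comment_authors_10k.feather",
#                     "output/hdbscan_authors_10k",
#                     name="mcs-2_ms-1_cse-0_csm-eom",
#                     min_cluster_size=2, min_samples=1,
#                     cluster_selection_epsilon=0, cluster_selection_method='eom')
#   job.run()
#   info = job.get_info()   # an hdbscan_clustering_result row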
class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(
            infile,
            outpath,
            name,
            *args,
            **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        partial_result = super().get_info()
        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
                                                    lsi_dimensions=self.lsi_dims)
        return self.result
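
# hdbscan_lsi_job only adds bookkeeping for the LSI dimensionality: set_lsi_dims
# comes from lsi_mixin, and get_info() extends the parent's result row into an
# hdbscan_clustering_result_lsi with the lsi_dimensions field filled in.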
# def select_hdbscan_clustering(inpath,
# outpath,
# outfile=None,
# min_cluster_sizes=[2],
# min_samples=[1],
# cluster_selection_epsilons=[0],
# cluster_selection_methods=['eom'],
# lsi_dimensions='all'
# ):
# inpath = Path(inpath)
# outpath = Path(outpath)
# outpath.mkdir(exist_ok=True, parents=True)
# if lsi_dimensions is None:
# lsi_paths = [inpath]
# elif lsi_dimensions == 'all':
# lsi_paths = list(inpath.glob("*"))
# else:
# lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
# if lsi_dimensions is not None:
# lsi_nums = [p.stem for p in lsi_paths]
# else:
# lsi_nums = [None]
# grid = list(product(lsi_nums,
# min_cluster_sizes,
# min_samples,
# cluster_selection_epsilons,
# cluster_selection_methods))
# # fix the output file names
# names = list(map(lambda t:'_'.join(map(str,t)),grid))
# grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
# with Pool(int(cpu_count()/4)) as pool:
# mods = starmap(hdbscan_clustering, grid)
# res = pd.DataFrame(mods)
# if outfile is None:
# outfile = outpath / "selection_data.csv"
# res.to_csv(outfile)
# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
# subreddits, mat = read_similarity_mat(similarities)
# mat = sim_to_dist(mat)
# clustering = _hdbscan_clustering(mat,
# min_cluster_size=min_cluster_size,
# min_samples=min_samples,
# cluster_selection_epsilon=cluster_selection_epsilon,
# cluster_selection_method=cluster_selection_method,
# metric='precomputed',
# core_dist_n_jobs=cpu_count()
# )
# cluster_data = process_clustering_result(clustering, subreddits)
# isolates = clustering.labels_ == -1
# scoremat = mat[~isolates][:,~isolates]
# score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
# cluster_data.to_feather(output)
# silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
# silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
# silsampout = output.parent / ("silhouette_samples" + output.name)
# silhouette_samp.to_feather(silsampout)
# result = hdbscan_clustering_result(outpath=output,
# silhouette_samples=silsampout,
# silhouette_score=score,
# name=name,
# min_cluster_size=min_cluster_size,
# min_samples=min_samples,
# cluster_selection_epsilon=cluster_selection_epsilon,
# cluster_selection_method=cluster_selection_method,
# lsi_dimensions=lsi_dim,
# n_isolates=isolates.sum(),
# n_clusters=len(set(clustering.labels_))
# )
# return(result)
# # for all runs we should try cluster_selection_epsilon = None
# # for terms we should try cluster_selection_epsilon around 0.56-0.66
# # for authors we should try cluster_selection_epsilon around 0.98-0.99
# def _hdbscan_clustering(mat, *args, **kwargs):
# print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
# print(mat)
# clusterer = hdbscan.HDBSCAN(*args,
# **kwargs,
# )
# clustering = clusterer.fit(mat.astype('double'))
# return(clustering)
def KNN_distances_plot(mat,outname,k=2):
    nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:,-1]
    df = pd.DataFrame({'dist':d2})
    df = df.sort_values("dist",ascending=False)
    df['idx'] = np.arange(0,d2.shape[0]) + 1
    p = pn.qplot(x='idx',y='dist',data=df,geom='line') + pn.scales.scale_y_continuous(minor_breaks = np.arange(0,50)/50,
                                                                                      breaks = np.arange(0,10)/10)
    p.save(outname,width=16,height=10)
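
# The sorted k-nearest-neighbor distance curve is the standard visual aid for
# choosing a distance threshold: look for the "elbow" where distances jump.
# The cluster_selection_epsilon hints in the commented-out code above (roughly
# 0.56-0.66 for terms, 0.98-0.99 for authors) are the kind of values one would
# read off such a plot.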
def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat,k=2,outname='terms_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat,k=2,outname='authors_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
if __name__ == "__main__":
    fire.Fire({'grid_sweep':hdbscan_grid_sweep,
               'grid_sweep_lsi':hdbscan_lsi_grid_sweep,
               'cluster':hdbscan_job,
               'cluster_lsi':hdbscan_lsi_job})

# test_select_hdbscan_clustering()
# fire.Fire(select_hdbscan_clustering)
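
# One plausible CLI invocation via python-fire (paths hypothetical; fire parses
# the flags, constructs the selected sweep object, then chains the trailing
# "run"/"save" calls onto it -- exact argument parsing is fire's):
#
#   python3 hdbscan_clustering.py grid_sweep_lsi \
#       --inpath=/path/to/subreddit_comment_authors-tf_10k_LSI \
#       --lsi_dims=all \
#       --outpath=/path/to/output/hdbscan_authors_lsi \
#       --min_cluster_sizes='[2]' --min_samples='[1]' \
#       --cluster_selection_epsilons='[0,0.1]' \
#       --cluster_selection_methods='["eom"]' \
#       run 20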