2021-05-10 20:46:49 +00:00
|
|
|
from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result
|
|
|
|
from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin
|
|
|
|
from grid_sweep import grid_sweep
|
|
|
|
import fire
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
|
|
@dataclass
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
    """HDBSCAN clustering result combined with the fields of `lsi_result_mixin`.

    Declares no fields of its own; everything comes from the two base
    classes. Instances are built in `hdbscan_lsi_job.get_info` with an
    `lsi_dimensions` keyword in addition to the base result's attributes.
    """
|
|
|
|
|
|
|
|
class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
    """An `hdbscan_job` that additionally carries an LSI dimensionality.

    The LSI-specific behaviour comes from `lsi_mixin`; this class only wires
    the extra constructor argument through and extends the result record.
    """

    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        """Set up the underlying job, then register `lsi_dims` on it.

        Arguments:
        infile, outpath, name: forwarded unchanged to the parent constructor.
        lsi_dims: handed to `set_lsi_dims` once the parent is initialised.
        *args, **kwargs: any remaining job parameters, forwarded unchanged.
        """
        super().__init__(infile, outpath, name, *args, **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        """Return the parent's result re-wrapped with `lsi_dimensions` added.

        The new record is also stored on `self.result`.
        """
        base_info = super().get_info()
        # Copy every attribute of the parent's result into the LSI-aware
        # result type and attach this job's LSI dimensionality.
        # `self.lsi_dims` is presumably assigned by `set_lsi_dims` -- confirm
        # against lsi_mixin.
        lsi_info = hdbscan_clustering_result_lsi(
            **base_info.__dict__,
            lsi_dimensions=self.lsi_dims,
        )
        self.result = lsi_info
        return self.result
|
|
|
|
|
|
|
|
class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
    """Grid sweep over HDBSCAN parameters for one or more LSI dimensionalities.

    All of the work is delegated to `lsi_grid_sweep`; this class only tells
    it which job type (`hdbscan_lsi_job`) and which per-dimension sweep type
    (`_hdbscan_lsi_grid_sweep`) to use.
    """

    def __init__(
        self,
        inpath,
        lsi_dims,
        outpath,
        min_cluster_sizes,
        min_samples,
        cluster_selection_epsilons,
        cluster_selection_methods,
    ):
        """Forward every argument, in order, to `lsi_grid_sweep.__init__`.

        The four trailing arguments are the HDBSCAN parameter lists to sweep.
        """
        hdbscan_param_lists = (
            min_cluster_sizes,
            min_samples,
            cluster_selection_epsilons,
            cluster_selection_methods,
        )
        super().__init__(
            hdbscan_lsi_job,
            _hdbscan_lsi_grid_sweep,
            inpath,
            lsi_dims,
            outpath,
            *hdbscan_param_lists,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _hdbscan_lsi_grid_sweep(grid_sweep):
    """Grid sweep over HDBSCAN parameters for a single LSI dimensionality.

    Passed by `hdbscan_lsi_grid_sweep` to `lsi_grid_sweep` as the
    per-dimension sweep type.
    """

    def __init__(self, inpath, outpath, lsi_dim, *args, **kwargs):
        """Build a sweep of `hdbscan_lsi_job`s fixed at `lsi_dim`.

        Arguments:
        inpath, outpath: forwarded unchanged to `grid_sweep.__init__`.
        lsi_dim: the LSI dimensionality shared by every job in this sweep.
        *args, **kwargs: remaining parameters, forwarded unchanged.
        """
        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
        # The LSI dimension is passed as a one-element list ahead of the
        # remaining positional arguments, so it occupies the first slot
        # after the namer.
        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)

    def namer(self, *args, **kwargs):
        """Return the HDBSCAN sweep name with an `_lsi-<dim>` suffix.

        The first positional argument is dropped before delegating to
        `hdbscan_grid_sweep.namer` -- presumably it is the LSI dimension that
        `__init__` placed first; confirm against grid_sweep.
        """
        base_name = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
        return base_name + f"_lsi-{self.lsi_dim}"
|
|
|
|
|
2021-08-12 05:48:33 +00:00
|
|
|
def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=[1], lsi_dimensions='all'):
    """Run hdbscan clustering once or more with different parameters.

    Usage:
    hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv of "eom"|"leaf"> --lsi_dimensions=<"all" or csv>

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit clusterings.
    min_cluster_sizes: one or more integers indicating the minimum cluster size
    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.

    Each of the four hdbscan parameters may be given as a single value or as
    a sequence; a single value is treated as a one-element list.
    """
    # NOTE(review): the default cluster_selection_methods=[1] does not match
    # the documented "eom"/"leaf" values -- confirm the intended default.
    # The list defaults are kept for interface compatibility; they are never
    # mutated here.

    def _as_list(value):
        # A lone command-line value arrives as a bare scalar rather than a
        # sequence, and a bare string must not be split into characters.
        if isinstance(value, str):
            return [value]
        try:
            return list(value)
        except TypeError:
            return [value]

    obj = hdbscan_lsi_grid_sweep(
        inpath,
        lsi_dimensions,
        outpath,
        [int(v) for v in _as_list(min_cluster_sizes)],
        [int(v) for v in _as_list(min_samples)],
        [float(v) for v in _as_list(cluster_selection_epsilons)],
        _as_list(cluster_selection_methods),
    )

    # The meaning of 10 is defined by the sweep's run(); presumably the
    # number of parallel workers -- confirm against grid_sweep.
    obj.run(10)
    obj.save(savefile)
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line entry point: expose the sweep runner's parameters as flags.
if __name__ == "__main__":
    fire.Fire(run_hdbscan_lsi_grid_sweep)
|