add 2 more umap parameters
This commit is contained in:
parent
5a40465a62
commit
c190791364
@ -1,10 +1,10 @@
|
|||||||
#srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
|
#srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
|
||||||
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
|
srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40
|
||||||
similarity_data=/gscratch/comdata/output/reddit_similarity
|
similarity_data=/gscratch/comdata/output/reddit_similarity
|
||||||
clustering_data=/gscratch/comdata/output/reddit_clustering
|
clustering_data=/gscratch/comdata/output/reddit_clustering
|
||||||
kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
|
kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
|
||||||
|
|
||||||
umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1]
|
umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10]
|
||||||
|
|
||||||
hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
|
hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
|
||||||
affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
|
affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
|
||||||
|
@ -29,7 +29,7 @@ class lsi_grid_sweep(grid_sweep):
|
|||||||
self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
|
self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
|
||||||
|
|
||||||
class twoway_lsi_grid_sweep(twoway_grid_sweep):
|
class twoway_lsi_grid_sweep(twoway_grid_sweep):
|
||||||
def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2, save_step1):
|
def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2):
|
||||||
self.jobtype = jobtype
|
self.jobtype = jobtype
|
||||||
self.subsweep = subsweep
|
self.subsweep = subsweep
|
||||||
inpath = Path(inpath)
|
inpath = Path(inpath)
|
||||||
@ -40,5 +40,5 @@ class twoway_lsi_grid_sweep(twoway_grid_sweep):
|
|||||||
|
|
||||||
lsi_nums = [int(p.stem) for p in lsi_paths]
|
lsi_nums = [int(p.stem) for p in lsi_paths]
|
||||||
self.hasrun = False
|
self.hasrun = False
|
||||||
self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2, save_step1) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
|
self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
|
||||||
self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
|
self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
|
||||||
|
@ -63,25 +63,28 @@ class umap_hdbscan_grid_sweep(twoway_grid_sweep):
|
|||||||
min_samples,
|
min_samples,
|
||||||
cluster_selection_epsilon,
|
cluster_selection_epsilon,
|
||||||
cluster_selection_method,
|
cluster_selection_method,
|
||||||
|
n_components,
|
||||||
n_neighbors,
|
n_neighbors,
|
||||||
learning_rate,
|
learning_rate,
|
||||||
min_dist,
|
min_dist,
|
||||||
local_connectivity
|
local_connectivity,
|
||||||
|
densmap
|
||||||
):
|
):
|
||||||
return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}"
|
return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nc-{n_components}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}_dm-{densmap}"
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class umap_hdbscan_clustering_result(hdbscan_clustering_result):
|
class umap_hdbscan_clustering_result(hdbscan_clustering_result):
|
||||||
|
n_components:int
|
||||||
n_neighbors:int
|
n_neighbors:int
|
||||||
learning_rate:float
|
learning_rate:float
|
||||||
min_dist:float
|
min_dist:float
|
||||||
local_connectivity:int
|
local_connectivity:int
|
||||||
|
densmap:bool
|
||||||
|
|
||||||
class umap_hdbscan_job(twoway_clustering_job):
|
class umap_hdbscan_job(twoway_clustering_job):
|
||||||
def __init__(self, infile, outpath, name,
|
def __init__(self, infile, outpath, name,
|
||||||
umap_args = {"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1},
|
umap_args = {"n_components":2,"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1,'densmap':False},
|
||||||
hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'},
|
hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'},
|
||||||
save_step1 = False,
|
|
||||||
*args,
|
*args,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
super().__init__(infile,
|
super().__init__(infile,
|
||||||
@ -91,15 +94,16 @@ class umap_hdbscan_job(twoway_clustering_job):
|
|||||||
call2=umap_hdbscan_job._hdbscan_clustering,
|
call2=umap_hdbscan_job._hdbscan_clustering,
|
||||||
args1=umap_args,
|
args1=umap_args,
|
||||||
args2=hdbscan_args,
|
args2=hdbscan_args,
|
||||||
save_step1=save_step1,
|
|
||||||
*args,
|
*args,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.n_components = umap_args['n_components']
|
||||||
self.n_neighbors = umap_args['n_neighbors']
|
self.n_neighbors = umap_args['n_neighbors']
|
||||||
self.learning_rate = umap_args['learning_rate']
|
self.learning_rate = umap_args['learning_rate']
|
||||||
self.min_dist = umap_args['min_dist']
|
self.min_dist = umap_args['min_dist']
|
||||||
self.local_connectivity = umap_args['local_connectivity']
|
self.local_connectivity = umap_args['local_connectivity']
|
||||||
|
self.densmap = umap_args['densmap']
|
||||||
self.min_cluster_size = hdbscan_args['min_cluster_size']
|
self.min_cluster_size = hdbscan_args['min_cluster_size']
|
||||||
self.min_samples = hdbscan_args['min_samples']
|
self.min_samples = hdbscan_args['min_samples']
|
||||||
self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon']
|
self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon']
|
||||||
@ -139,14 +143,17 @@ class umap_hdbscan_job(twoway_clustering_job):
|
|||||||
min_samples=self.min_samples,
|
min_samples=self.min_samples,
|
||||||
cluster_selection_epsilon=self.cluster_selection_epsilon,
|
cluster_selection_epsilon=self.cluster_selection_epsilon,
|
||||||
cluster_selection_method=self.cluster_selection_method,
|
cluster_selection_method=self.cluster_selection_method,
|
||||||
|
n_components = self.n_components,
|
||||||
n_neighbors = self.n_neighbors,
|
n_neighbors = self.n_neighbors,
|
||||||
learning_rate = self.learning_rate,
|
learning_rate = self.learning_rate,
|
||||||
min_dist = self.min_dist,
|
min_dist = self.min_dist,
|
||||||
local_connectivity=self.local_connectivity
|
local_connectivity=self.local_connectivity,
|
||||||
|
densmap=self.densmap
|
||||||
)
|
)
|
||||||
return self.result
|
return self.result
|
||||||
|
|
||||||
def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1],
|
def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1],
|
||||||
|
densmap=[False],
|
||||||
min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
|
min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
|
||||||
"""Run umap + hdbscan clustering once or more with different parameters.
|
"""Run umap + hdbscan clustering once or more with different parameters.
|
||||||
|
|
||||||
@ -171,6 +178,8 @@ def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], l
|
|||||||
'learning_rate':list(map(float,learning_rate)),
|
'learning_rate':list(map(float,learning_rate)),
|
||||||
'min_dist':list(map(float,min_dist)),
|
'min_dist':list(map(float,min_dist)),
|
||||||
'local_connectivity':list(map(int,local_connectivity)),
|
'local_connectivity':list(map(int,local_connectivity)),
|
||||||
|
'n_components':list(map(int, n_components)),
|
||||||
|
'densmap':list(map(bool,densmap))
|
||||||
}
|
}
|
||||||
|
|
||||||
hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
|
hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
|
||||||
|
@ -9,14 +9,13 @@ class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_res
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin):
|
class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin):
|
||||||
def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims, save_step1=False):
|
def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims):
|
||||||
super().__init__(
|
super().__init__(
|
||||||
infile,
|
infile,
|
||||||
outpath,
|
outpath,
|
||||||
name,
|
name,
|
||||||
umap_args,
|
umap_args,
|
||||||
hdbscan_args,
|
hdbscan_args
|
||||||
save_step1
|
|
||||||
)
|
)
|
||||||
super().set_lsi_dims(lsi_dims)
|
super().set_lsi_dims(lsi_dims)
|
||||||
|
|
||||||
@ -32,8 +31,7 @@ class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep):
|
|||||||
lsi_dims,
|
lsi_dims,
|
||||||
outpath,
|
outpath,
|
||||||
umap_args,
|
umap_args,
|
||||||
hdbscan_args,
|
hdbscan_args
|
||||||
save_step1
|
|
||||||
):
|
):
|
||||||
|
|
||||||
super().__init__(umap_hdbscan_lsi_job,
|
super().__init__(umap_hdbscan_lsi_job,
|
||||||
@ -42,8 +40,7 @@ class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep):
|
|||||||
lsi_dims,
|
lsi_dims,
|
||||||
outpath,
|
outpath,
|
||||||
umap_args,
|
umap_args,
|
||||||
hdbscan_args,
|
hdbscan_args
|
||||||
save_step1
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -55,11 +52,11 @@ class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep):
|
|||||||
lsi_dim,
|
lsi_dim,
|
||||||
umap_args,
|
umap_args,
|
||||||
hdbscan_args,
|
hdbscan_args,
|
||||||
save_step1):
|
):
|
||||||
|
|
||||||
self.lsi_dim = lsi_dim
|
self.lsi_dim = lsi_dim
|
||||||
self.jobtype = umap_hdbscan_lsi_job
|
self.jobtype = umap_hdbscan_lsi_job
|
||||||
super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, save_step1, lsi_dim)
|
super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, lsi_dim)
|
||||||
|
|
||||||
|
|
||||||
def namer(self, *args, **kwargs):
|
def namer(self, *args, **kwargs):
|
||||||
@ -67,8 +64,9 @@ class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep):
|
|||||||
s += f"_lsi-{self.lsi_dim}"
|
s += f"_lsi-{self.lsi_dim}"
|
||||||
return s
|
return s
|
||||||
|
|
||||||
def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1],
|
def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1],
|
||||||
min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all', save_step1 = False):
|
densmap=[False],
|
||||||
|
min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all'):
|
||||||
"""Run hdbscan clustering once or more with different parameters.
|
"""Run hdbscan clustering once or more with different parameters.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
@ -90,6 +88,8 @@ def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15
|
|||||||
'learning_rate':list(map(float,learning_rate)),
|
'learning_rate':list(map(float,learning_rate)),
|
||||||
'min_dist':list(map(float,min_dist)),
|
'min_dist':list(map(float,min_dist)),
|
||||||
'local_connectivity':list(map(int,local_connectivity)),
|
'local_connectivity':list(map(int,local_connectivity)),
|
||||||
|
'n_components':list(map(int, n_components)),
|
||||||
|
'densmap':list(map(bool,densmap))
|
||||||
}
|
}
|
||||||
|
|
||||||
hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
|
hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
|
||||||
@ -101,8 +101,7 @@ def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15
|
|||||||
lsi_dimensions,
|
lsi_dimensions,
|
||||||
outpath,
|
outpath,
|
||||||
umap_args,
|
umap_args,
|
||||||
hdbscan_args,
|
hdbscan_args
|
||||||
save_step1
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user