Add two more UMAP parameters (n_components and densmap)
This commit is contained in:
		
							parent
							
								
									5a40465a62
								
							
						
					
					
						commit
						c190791364
					
				| @ -1,10 +1,10 @@ | ||||
| #srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
 | ||||
| srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh | ||||
| srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40 | ||||
| similarity_data=/gscratch/comdata/output/reddit_similarity | ||||
| clustering_data=/gscratch/comdata/output/reddit_clustering | ||||
| kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000] | ||||
| 
 | ||||
| umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] | ||||
| umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10] | ||||
| 
 | ||||
| hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] | ||||
| affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15] | ||||
|  | ||||
| @ -29,7 +29,7 @@ class lsi_grid_sweep(grid_sweep): | ||||
|         self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) | ||||
| 
 | ||||
| class twoway_lsi_grid_sweep(twoway_grid_sweep): | ||||
|     def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2, save_step1): | ||||
|     def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2): | ||||
|         self.jobtype = jobtype | ||||
|         self.subsweep = subsweep | ||||
|         inpath = Path(inpath) | ||||
| @ -40,5 +40,5 @@ class twoway_lsi_grid_sweep(twoway_grid_sweep): | ||||
| 
 | ||||
|         lsi_nums = [int(p.stem) for p in lsi_paths] | ||||
|         self.hasrun = False | ||||
|         self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2, save_step1) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] | ||||
|         self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] | ||||
|         self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) | ||||
|  | ||||
| @ -63,25 +63,28 @@ class umap_hdbscan_grid_sweep(twoway_grid_sweep): | ||||
|               min_samples, | ||||
|               cluster_selection_epsilon, | ||||
|               cluster_selection_method, | ||||
|               n_components, | ||||
|               n_neighbors, | ||||
|               learning_rate, | ||||
|               min_dist, | ||||
|               local_connectivity | ||||
|               local_connectivity, | ||||
|               densmap | ||||
|               ): | ||||
|         return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}" | ||||
|         return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nc-{n_components}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}_dm-{densmap}" | ||||
| 
 | ||||
| @dataclass | ||||
| class umap_hdbscan_clustering_result(hdbscan_clustering_result): | ||||
|     n_components:int | ||||
|     n_neighbors:int | ||||
|     learning_rate:float | ||||
|     min_dist:float | ||||
|     local_connectivity:int | ||||
|     densmap:bool | ||||
| 
 | ||||
| class umap_hdbscan_job(twoway_clustering_job): | ||||
|     def __init__(self, infile, outpath, name, | ||||
|                  umap_args = {"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1}, | ||||
|                  umap_args = {"n_components":2,"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1,'densmap':False}, | ||||
|                  hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'}, | ||||
|                  save_step1 = False, | ||||
|                  *args, | ||||
|                  **kwargs): | ||||
|         super().__init__(infile, | ||||
| @ -91,15 +94,16 @@ class umap_hdbscan_job(twoway_clustering_job): | ||||
|                          call2=umap_hdbscan_job._hdbscan_clustering, | ||||
|                          args1=umap_args, | ||||
|                          args2=hdbscan_args, | ||||
|                          save_step1=save_step1, | ||||
|                          *args, | ||||
|                          **kwargs | ||||
|                          ) | ||||
| 
 | ||||
|         self.n_components = umap_args['n_components'] | ||||
|         self.n_neighbors = umap_args['n_neighbors'] | ||||
|         self.learning_rate = umap_args['learning_rate'] | ||||
|         self.min_dist = umap_args['min_dist'] | ||||
|         self.local_connectivity = umap_args['local_connectivity'] | ||||
|         self.densmap = umap_args['densmap'] | ||||
|         self.min_cluster_size = hdbscan_args['min_cluster_size'] | ||||
|         self.min_samples = hdbscan_args['min_samples'] | ||||
|         self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon'] | ||||
| @ -139,14 +143,17 @@ class umap_hdbscan_job(twoway_clustering_job): | ||||
|                                                      min_samples=self.min_samples, | ||||
|                                                      cluster_selection_epsilon=self.cluster_selection_epsilon, | ||||
|                                                      cluster_selection_method=self.cluster_selection_method, | ||||
|                                                      n_components = self.n_components, | ||||
|                                                      n_neighbors = self.n_neighbors, | ||||
|                                                      learning_rate = self.learning_rate, | ||||
|                                                      min_dist = self.min_dist, | ||||
|                                                      local_connectivity=self.local_connectivity | ||||
|                                                      local_connectivity=self.local_connectivity, | ||||
|                                                      densmap=self.densmap | ||||
|                                                      ) | ||||
|         return self.result | ||||
| 
 | ||||
| def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1], | ||||
| def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1], | ||||
|                                 densmap=[False], | ||||
|                                 min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']): | ||||
|     """Run umap + hdbscan clustering once or more with different parameters. | ||||
|      | ||||
| @ -171,6 +178,8 @@ def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], l | ||||
|                  'learning_rate':list(map(float,learning_rate)), | ||||
|                  'min_dist':list(map(float,min_dist)), | ||||
|                  'local_connectivity':list(map(int,local_connectivity)), | ||||
|                  'n_components':list(map(int, n_components)), | ||||
|                  'densmap':list(map(bool,densmap)) | ||||
|                  } | ||||
| 
 | ||||
|     hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), | ||||
|  | ||||
| @ -9,14 +9,13 @@ class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_res | ||||
|     pass  | ||||
| 
 | ||||
| class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin): | ||||
|     def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims, save_step1=False): | ||||
|     def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims): | ||||
|         super().__init__( | ||||
|             infile, | ||||
|             outpath, | ||||
|             name, | ||||
|             umap_args, | ||||
|             hdbscan_args, | ||||
|             save_step1 | ||||
|             hdbscan_args | ||||
|         ) | ||||
|         super().set_lsi_dims(lsi_dims) | ||||
| 
 | ||||
| @ -32,8 +31,7 @@ class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep): | ||||
|                  lsi_dims, | ||||
|                  outpath, | ||||
|                  umap_args, | ||||
|                  hdbscan_args, | ||||
|                  save_step1 | ||||
|                  hdbscan_args | ||||
|                  ): | ||||
| 
 | ||||
|         super().__init__(umap_hdbscan_lsi_job, | ||||
| @ -42,8 +40,7 @@ class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep): | ||||
|                          lsi_dims, | ||||
|                          outpath, | ||||
|                          umap_args, | ||||
|                          hdbscan_args, | ||||
|                          save_step1 | ||||
|                          hdbscan_args | ||||
|                          ) | ||||
|          | ||||
| 
 | ||||
| @ -55,11 +52,11 @@ class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep): | ||||
|                  lsi_dim, | ||||
|                  umap_args, | ||||
|                  hdbscan_args, | ||||
|                  save_step1): | ||||
|                  ): | ||||
| 
 | ||||
|         self.lsi_dim = lsi_dim | ||||
|         self.jobtype = umap_hdbscan_lsi_job | ||||
|         super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, save_step1, lsi_dim) | ||||
|         super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, lsi_dim) | ||||
| 
 | ||||
| 
 | ||||
|     def namer(self, *args, **kwargs): | ||||
| @ -67,8 +64,9 @@ class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep): | ||||
|         s += f"_lsi-{self.lsi_dim}" | ||||
|         return s | ||||
| 
 | ||||
| def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1], | ||||
|                                     min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all', save_step1 = False): | ||||
| def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1],  | ||||
|                                 densmap=[False], | ||||
|                                     min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all'): | ||||
|     """Run hdbscan clustering once or more with different parameters. | ||||
|      | ||||
|     Usage: | ||||
| @ -90,6 +88,8 @@ def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15 | ||||
|                  'learning_rate':list(map(float,learning_rate)), | ||||
|                  'min_dist':list(map(float,min_dist)), | ||||
|                  'local_connectivity':list(map(int,local_connectivity)), | ||||
|                  'n_components':list(map(int, n_components)), | ||||
|                  'densmap':list(map(bool,densmap)) | ||||
|                  } | ||||
| 
 | ||||
|     hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), | ||||
| @ -101,8 +101,7 @@ def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15 | ||||
|                                       lsi_dimensions, | ||||
|                                       outpath, | ||||
|                                       umap_args, | ||||
|                                       hdbscan_args, | ||||
|                                       save_step1 | ||||
|                                       hdbscan_args | ||||
|                                       ) | ||||
|                                   | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user