bug fix in affinity clustering
This commit is contained in:
		
							parent
							
								
									4cb7eeec80
								
							
						
					
					
						commit
						582cf263ea
					
				| @ -4,7 +4,7 @@ similarity_data=/gscratch/comdata/output/reddit_similarity | ||||
| clustering_data=/gscratch/comdata/output/reddit_clustering | ||||
| kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]" | ||||
| hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf" | ||||
| affinity_selection_grid="--dampings=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[30]" | ||||
| affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]" | ||||
| 
 | ||||
| authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather | ||||
| authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI | ||||
|  | ||||
| @ -81,7 +81,7 @@ class affinity_grid_sweep(grid_sweep): | ||||
| 
 | ||||
|         return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}" | ||||
| 
 | ||||
| def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5]): | ||||
| def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5],n_cores=10): | ||||
|     """Run affinity clustering once or more with different parameters. | ||||
|      | ||||
|     Usage: | ||||
| @ -102,7 +102,7 @@ def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters | ||||
|                          map(int,max_iters), | ||||
|                          map(int,convergence_iters), | ||||
|                          map(float,preference_quantiles)) | ||||
|     obj.run(1) | ||||
|     obj.run(n_cores) | ||||
|     obj.save(savefile) | ||||
|      | ||||
| def test_select_affinity_clustering(): | ||||
|  | ||||
| @ -58,7 +58,7 @@ class _affinity_lsi_grid_sweep(grid_sweep): | ||||
|                          inpath, | ||||
|                          outpath, | ||||
|                          self.namer, | ||||
|                          self.lsi_dim, | ||||
|                          [self.lsi_dim], | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
| 
 | ||||
| @ -67,7 +67,7 @@ class _affinity_lsi_grid_sweep(grid_sweep): | ||||
|         s += f"_lsi-{self.lsi_dim}" | ||||
|         return s | ||||
|                           | ||||
| def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all'): | ||||
| def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all',n_cores=30): | ||||
|     """Run affinity clustering once or more with different parameters. | ||||
|      | ||||
|     Usage: | ||||
| @ -92,7 +92,7 @@ def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_i | ||||
|                             map(int,convergence_iters), | ||||
|                             map(float,preference_quantiles)) | ||||
| 
 | ||||
|     obj.run(1) | ||||
|     obj.run(n_cores) | ||||
|     obj.save(savefile) | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|  | ||||
| @ -3,6 +3,7 @@ import numpy as np | ||||
| import pandas as pd | ||||
| from dataclasses import dataclass | ||||
| from sklearn.metrics import silhouette_score, silhouette_samples | ||||
| from collections import Counter | ||||
| 
 | ||||
| # this is meant to be an interface, not created directly | ||||
| class clustering_job: | ||||
| @ -38,9 +39,11 @@ class clustering_job: | ||||
|         return self.result | ||||
| 
 | ||||
|     def silhouette(self): | ||||
|         isolates = self.clustering.labels_ == -1 | ||||
|         counts = Counter(self.clustering.labels_) | ||||
|         singletons = [key for key, value in counts.items() if value == 1] | ||||
|         isolates = (self.clustering.labels_ == -1) | (np.isin(self.clustering.labels_,np.array(singletons))) | ||||
|         scoremat = self.mat[~isolates][:,~isolates] | ||||
|         if scoremat.shape[0] > 0: | ||||
|         if self.n_clusters > 1: | ||||
|             score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed') | ||||
|             silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed') | ||||
|             silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp}) | ||||
| @ -80,8 +83,9 @@ class clustering_job: | ||||
| 
 | ||||
|         print(f"{n_isolates1} clusters have 1 member") | ||||
| 
 | ||||
|         n_isolates2 = (cluster_sizes.loc[cluster_sizes.cluster==-1,['subreddit']]) | ||||
| 
 | ||||
|         n_isolates2 = cluster_sizes.loc[cluster_sizes.cluster==-1,:]['subreddit'].to_list() | ||||
|         if len(n_isolates2) > 0: | ||||
|             n_isloates2 = n_isolates2[0] | ||||
|         print(f"{n_isolates2} subreddits are in cluster -1",flush=True) | ||||
| 
 | ||||
|         if n_isolates1 == 0: | ||||
|  | ||||
| @ -17,7 +17,7 @@ def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=1000 | ||||
|     df = pd.read_feather(similarities) | ||||
| 
 | ||||
|     n = df.shape[0] | ||||
|     mat = np.array(df.drop('subreddit',1),dtype=np.float64) | ||||
|     mat = np.array(df.drop('_subreddit',1),dtype=np.float64) | ||||
|     mat[range(n),range(n)] = 1 | ||||
|     mat[mat > 1] = 1 | ||||
|     dist = 2*np.arccos(mat)/np.pi | ||||
| @ -26,7 +26,7 @@ def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=1000 | ||||
| 
 | ||||
|     tsne_fit_whole = tsne_fit_model.fit_transform(dist) | ||||
| 
 | ||||
|     plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':df.subreddit}) | ||||
|     plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], '_subreddit':df['_subreddit']}) | ||||
| 
 | ||||
|     plot_data.to_feather(output) | ||||
| 
 | ||||
|  | ||||
| @ -20,9 +20,9 @@ class lsi_grid_sweep(grid_sweep): | ||||
|         if lsi_dimensions == 'all': | ||||
|             lsi_paths = list(inpath.glob("*")) | ||||
|         else: | ||||
|             lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] | ||||
|             lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions] | ||||
| 
 | ||||
|         lsi_nums = [p.stem for p in lsi_paths] | ||||
|         lsi_nums = [int(p.stem) for p in lsi_paths] | ||||
|         self.hasrun = False | ||||
|         self.subgrids = [self.subsweep(lsi_path, outpath,  lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] | ||||
|         self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user