Refactor to make a decent API.
parent f05cb962e0
commit 4cb7eeec80
| @@ -2,41 +2,160 @@ | ||||
| srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh | ||||
| similarity_data=/gscratch/comdata/output/reddit_similarity | ||||
| clustering_data=/gscratch/comdata/output/reddit_clustering | ||||
| kmeans_selection_grid="--max_iter=3000 --n_init=[10] --n_clusters=[100,500,1000,1500,2000,2500,3000,2350,3500,3570,4000]" | ||||
| #selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"
 | ||||
| all:$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv | ||||
| # $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
 | ||||
| # $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
 | ||||
| kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]" | ||||
| hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf" | ||||
| affinity_selection_grid="--dampings=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[30]" | ||||
| 
 | ||||
| $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py | ||||
| 	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/kmeans $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(kmeans_selection_grid) | ||||
| authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather | ||||
| authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI | ||||
| authors_10k_output=$(clustering_data)/subreddit_comment_authors_10k | ||||
| authors_10k_output_lsi=$(clustering_data)/subreddit_comment_authors_10k_LSI | ||||
| 
 | ||||
| $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py | ||||
| 	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/kmeans  $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(kmeans_selection_grid) | ||||
| authors_tf_10k_input=$(similarity_data)/subreddit_comment_authors-tf_10k.feather | ||||
| authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI | ||||
| authors_tf_10k_output=$(clustering_data)/subreddit_comment_authors-tf_10k | ||||
| authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI | ||||
| 
 | ||||
| $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather | ||||
| 	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans  $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(kmeans_selection_grid) | ||||
| terms_10k_input=$(similarity_data)/subreddit_comment_terms_10k.feather | ||||
| terms_10k_input_lsi=$(similarity_data)/subreddit_comment_terms_10k_LSI | ||||
| terms_10k_output=$(clustering_data)/subreddit_comment_terms_10k | ||||
| terms_10k_output_lsi=$(clustering_data)/subreddit_comment_terms_10k_LSI | ||||
| 
 | ||||
| all:terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi | ||||
| 
 | ||||
| terms_10k:${terms_10k_output}/kmeans/selection_data.csv ${terms_10k_output}/affinity/selection_data.csv ${terms_10k_output}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| authors_10k:${authors_10k_output}/kmeans/selection_data.csv ${authors_10k_output}/hdbscan/selection_data.csv ${authors_10k_output}/affinity/selection_data.csv | ||||
| 
 | ||||
| authors_tf_10k:${authors_tf_10k_output}/kmeans/selection_data.csv ${authors_tf_10k_output}/hdbscan/selection_data.csv ${authors_tf_10k_output}/affinity/selection_data.csv | ||||
| 
 | ||||
| terms_10k_lsi:${terms_10k_output_lsi}/kmeans/selection_data.csv ${terms_10k_output_lsi}/affinity/selection_data.csv ${terms_10k_output_lsi}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| authors_10k_lsi:${authors_10k_output_lsi}/kmeans/selection_data.csv ${authors_10k_output_lsi}/hdbscan/selection_data.csv ${authors_10k_output_lsi}/affinity/selection_data.csv | ||||
| 
 | ||||
| authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv | ||||
| 
 | ||||
| ${authors_10k_output}/kmeans/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py kmeans_clustering.py | ||||
| 	$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/kmeans --savefile=${authors_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)  | ||||
| 
 | ||||
| ${terms_10k_output}/kmeans/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py kmeans_clustering.py | ||||
| 	$(srun_singularity) python3 kmeans_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/kmeans  --savefile=${terms_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)  | ||||
| 
 | ||||
| ${authors_tf_10k_output}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py kmeans_clustering.py | ||||
| 	$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/kmeans --savefile=${authors_tf_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)  | ||||
| 
 | ||||
| ${authors_10k_output}/affinity/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py affinity_clustering.py | ||||
| 	$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/affinity --savefile=${authors_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)  | ||||
| 
 | ||||
| ${terms_10k_output}/affinity/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py affinity_clustering.py | ||||
| 	$(srun_singularity) python3 affinity_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/affinity  --savefile=${terms_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)  | ||||
| 
 | ||||
| ${authors_tf_10k_output}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py affinity_clustering.py | ||||
| 	$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/affinity --savefile=${authors_tf_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)  | ||||
| 
 | ||||
| ${authors_10k_output}/hdbscan/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py hdbscan_clustering.py | ||||
| 	$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/hdbscan --savefile=${authors_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)  | ||||
| 
 | ||||
| ${terms_10k_output}/hdbscan/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py hdbscan_clustering.py | ||||
| 	$(srun_singularity) python3 hdbscan_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/hdbscan  --savefile=${terms_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)  | ||||
| 
 | ||||
| ${authors_tf_10k_output}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py hdbscan_clustering.py | ||||
| 	$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/hdbscan --savefile=${authors_tf_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)  | ||||
| 
 | ||||
| 
 | ||||
| affinity_selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]" | ||||
| $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py | ||||
| 	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/affinity $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 | ||||
| ## LSI Models
 | ||||
| ${authors_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py kmeans_clustering.py | ||||
| 	$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/kmeans --savefile=${authors_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) | ||||
| 
 | ||||
| $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py | ||||
| 	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/affinity  $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20  | ||||
| ${terms_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py kmeans_clustering.py | ||||
| 	$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/kmeans  --savefile=${terms_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) | ||||
| 
 | ||||
| $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather | ||||
| 	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/affinity  $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 | ||||
| ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py | ||||
| 	$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) | ||||
| 
 | ||||
| clean: | ||||
| 	rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv | ||||
| 	rm -f $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv | ||||
| 	rm -f $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv | ||||
| 	rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv | ||||
| 	rm -f $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv | ||||
| 	rm -f $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv | ||||
| ${authors_10k_output_lsi}/affinity/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py affinity_clustering.py | ||||
| 	$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/affinity --savefile=${authors_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) | ||||
| 
 | ||||
| PHONY: clean | ||||
| ${terms_10k_output_lsi}/affinity/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py affinity_clustering.py | ||||
| 	$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/affinity  --savefile=${terms_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) | ||||
| 
 | ||||
| ${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py | ||||
| 	$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) | ||||
| 
 | ||||
| ${authors_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py hdbscan_clustering.py | ||||
| 	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/hdbscan --savefile=${authors_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) | ||||
| 
 | ||||
| ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py hdbscan_clustering.py | ||||
| 	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/hdbscan  --savefile=${terms_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) | ||||
| 
 | ||||
| ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py | ||||
| 	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| clean_affinity: | ||||
| 	rm -f ${authors_10k_output}/affinity/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output}/affinity/selection_data.csv | ||||
| 	rm -f ${terms_10k_output}/affinity/selection_data.csv | ||||
| 
 | ||||
| clean_kmeans: | ||||
| 	rm -f ${authors_10k_output}/kmeans/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv | ||||
| 	rm -f ${terms_10k_output}/kmeans/selection_data.csv | ||||
| 
 | ||||
| clean_hdbscan: | ||||
| 	rm -f ${authors_10k_output}/hdbscan/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv | ||||
| 	rm -f ${terms_10k_output}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| clean_authors: | ||||
| 	rm -f ${authors_10k_output}/affinity/selection_data.csv | ||||
| 	rm -f ${authors_10k_output}/kmeans/selection_data.csv | ||||
| 	rm -f ${authors_10k_output}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| clean_authors_tf: | ||||
| 	rm -f ${authors_tf_10k_output}/affinity/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| clean_terms: | ||||
| 	rm -f ${terms_10k_output}/affinity/selection_data.csv | ||||
| 	rm -f ${terms_10k_output}/kmeans/selection_data.csv | ||||
| 	rm -f ${terms_10k_output}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| clean_lsi_affinity: | ||||
| 	rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv | ||||
| 	rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv | ||||
| 
 | ||||
| clean_lsi_kmeans: | ||||
| 	rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv | ||||
| 	rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv | ||||
| 
 | ||||
| clean_lsi_hdbscan: | ||||
| 	rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv | ||||
| 	rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| clean_lsi_authors: | ||||
| 	rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv | ||||
| 	rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv | ||||
| 	rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| clean_lsi_authors_tf: | ||||
| 	rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv | ||||
| 	rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| clean_lsi_terms: | ||||
| 	rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv | ||||
| 	rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv | ||||
| 	rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv | ||||
| 
 | ||||
| clean: clean_affinity clean_kmeans clean_hdbscan | ||||
| 
 | ||||
| .PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k | ||||
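# A hedged sketch (not part of the Makefile) of consuming a selection_data.csv
# produced by the targets above; it assumes the CSV columns mirror the
# clustering_result fields (name, silhouette_score, n_clusters, n_isolates, ...)
# and the path shown is a placeholder.
# import pandas as pd
# selection = pd.read_csv("<outpath>/kmeans/selection_data.csv", index_col=0)
# best = selection.sort_values("silhouette_score", ascending=False).iloc[0]
# print(best["name"], best["n_clusters"], best["silhouette_score"])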
| 
 | ||||
| # $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
 | ||||
| # 	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
 | ||||
|  | ||||
| @@ -1,16 +1,12 @@ | ||||
| from sklearn.metrics import silhouette_score | ||||
| from sklearn.cluster import AffinityPropagation | ||||
| from functools import partial | ||||
| from dataclasses import dataclass | ||||
| from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat | ||||
| from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep | ||||
| from multiprocessing  import Pool, cpu_count, Array, Process | ||||
| from clustering_base import clustering_result, clustering_job | ||||
| from grid_sweep import grid_sweep | ||||
| from pathlib import Path | ||||
| from itertools import product, starmap | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| import fire | ||||
| import sys | ||||
| import numpy as np | ||||
| 
 | ||||
| # silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.  | ||||
| @dataclass | ||||
| @@ -21,10 +17,6 @@ class affinity_clustering_result(clustering_result): | ||||
|     preference:float | ||||
|     max_iter:int | ||||
| 
 | ||||
| @dataclass | ||||
| class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin): | ||||
|     pass | ||||
| 
 | ||||
| class affinity_job(clustering_job): | ||||
|     def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True): | ||||
|         super().__init__(infile, | ||||
| @@ -67,21 +59,6 @@ class affinity_job(clustering_job): | ||||
| 
 | ||||
|         return self.result | ||||
| 
 | ||||
| class affinity_lsi_job(affinity_job, lsi_mixin): | ||||
|     def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): | ||||
|         super().__init__(infile, | ||||
|                          outpath, | ||||
|                          name, | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
|         super().set_lsi_dims(lsi_dims) | ||||
| 
 | ||||
|     def get_info(self): | ||||
|         result = super().get_info() | ||||
|         self.result = affinity_clustering_result_lsi(**result.__dict__, | ||||
|                                                      lsi_dimensions=self.lsi_dims) | ||||
|         return self.result | ||||
| 
 | ||||
| class affinity_grid_sweep(grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
| @@ -104,49 +81,29 @@ class affinity_grid_sweep(grid_sweep): | ||||
| 
 | ||||
|         return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}" | ||||
| 
 | ||||
| class _affinity_lsi_grid_sweep(grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  outpath, | ||||
|                  lsi_dim, | ||||
|                  *args, | ||||
|                  **kwargs): | ||||
|         self.lsi_dim = lsi_dim | ||||
|         self.jobtype = affinity_lsi_job | ||||
|         super().__init__(self.jobtype, | ||||
|                          inpath, | ||||
| def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5]): | ||||
|     """Run affinity clustering once or more with different parameters. | ||||
|      | ||||
|     Usage: | ||||
|     affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> | ||||
| 
 | ||||
|     Keyword arguments: | ||||
|     savefile: path to save the metadata and diagnostics. | ||||
|     inpath: path to feather data containing a labeled matrix of subreddit similarities. | ||||
|     outpath: path to output fit affinity clusterings. | ||||
|     dampings: one or more numbers in [0.5, 1); the damping parameter in affinity propagation clustering. | ||||
|     preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter. | ||||
|     convergence_iters: one or more integers; the number of iterations without improvement before stopping. | ||||
|     max_iters: one or more integers; the maximum number of iterations. | ||||
|     """ | ||||
|     obj = affinity_grid_sweep(inpath, | ||||
|                          outpath, | ||||
|                          self.namer, | ||||
|                          self.lsi_dim, | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
| 
 | ||||
|     def namer(self, *args, **kwargs): | ||||
|         s = affinity_grid_sweep.namer(self, *args[1:], **kwargs) | ||||
|         s += f"_lsi-{self.lsi_dim}" | ||||
|         return s | ||||
| 
 | ||||
| class affinity_lsi_grid_sweep(lsi_grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  lsi_dims, | ||||
|                  outpath, | ||||
|                  dampings=[0.9], | ||||
|                  max_iters=[10000], | ||||
|                  convergence_iters=[30], | ||||
|                  preference_quantiles=[0.5]): | ||||
| 
 | ||||
|         super().__init__(affinity_lsi_job, | ||||
|                          _affinity_lsi_grid_sweep, | ||||
|                          inpath, | ||||
|                          lsi_dims, | ||||
|                          outpath, | ||||
|                          dampings, | ||||
|                          max_iters, | ||||
|                          convergence_iters, | ||||
|                          preference_quantiles) | ||||
|      | ||||
|                           | ||||
|                          map(float,dampings), | ||||
|                          map(int,max_iters), | ||||
|                          map(int,convergence_iters), | ||||
|                          map(float,preference_quantiles)) | ||||
|     obj.run(1) | ||||
|     obj.save(savefile) | ||||
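# A hedged usage sketch of the new function-based API; the paths and parameter
# values below are placeholders, not ones taken from this repository.
# from affinity_clustering import run_affinity_grid_sweep
# run_affinity_grid_sweep(savefile="<out>/selection_data.csv",
#                         inpath="<similarities>.feather",
#                         outpath="<out>",
#                         dampings=[0.5, 0.85],
#                         preference_quantiles=[0.3, 0.5])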
|      | ||||
| def test_select_affinity_clustering(): | ||||
|     # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", | ||||
| @@ -169,7 +126,4 @@ def test_select_affinity_clustering(): | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     fire.Fire{'grid_sweep':affinity_grid_sweep, | ||||
|               'grid_sweep_lsi':affinity_lsi_grid_sweep | ||||
|               'cluster':affinity_job, | ||||
|               'cluster_lsi':affinity_lsi_job} | ||||
|     fire.Fire(run_affinity_grid_sweep) | ||||
|  | ||||
clustering/affinity_clustering_lsi.py (new file, 99 lines)
| @@ -0,0 +1,99 @@ | ||||
| import fire | ||||
| from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep | ||||
| from grid_sweep import grid_sweep | ||||
| from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin | ||||
| from dataclasses import dataclass | ||||
| 
 | ||||
| @dataclass | ||||
| class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| class affinity_lsi_job(affinity_job, lsi_mixin): | ||||
|     def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): | ||||
|         super().__init__(infile, | ||||
|                          outpath, | ||||
|                          name, | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
|         super().set_lsi_dims(lsi_dims) | ||||
| 
 | ||||
|     def get_info(self): | ||||
|         result = super().get_info() | ||||
|         self.result = affinity_clustering_result_lsi(**result.__dict__, | ||||
|                                                      lsi_dimensions=self.lsi_dims) | ||||
|         return self.result | ||||
|      | ||||
| class affinity_lsi_grid_sweep(lsi_grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  lsi_dims, | ||||
|                  outpath, | ||||
|                  dampings=[0.9], | ||||
|                  max_iters=[10000], | ||||
|                  convergence_iters=[30], | ||||
|                  preference_quantiles=[0.5]): | ||||
| 
 | ||||
|         super().__init__(affinity_lsi_job, | ||||
|                          _affinity_lsi_grid_sweep, | ||||
|                          inpath, | ||||
|                          lsi_dims, | ||||
|                          outpath, | ||||
|                          dampings, | ||||
|                          max_iters, | ||||
|                          convergence_iters, | ||||
|                          preference_quantiles) | ||||
|      | ||||
| 
 | ||||
| class _affinity_lsi_grid_sweep(grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  outpath, | ||||
|                  lsi_dim, | ||||
|                  *args, | ||||
|                  **kwargs): | ||||
|         self.lsi_dim = lsi_dim | ||||
|         self.jobtype = affinity_lsi_job | ||||
|         super().__init__(self.jobtype, | ||||
|                          inpath, | ||||
|                          outpath, | ||||
|                          self.namer, | ||||
|                          self.lsi_dim, | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
| 
 | ||||
|     def namer(self, *args, **kwargs): | ||||
|         s = affinity_grid_sweep.namer(self, *args[1:], **kwargs) | ||||
|         s += f"_lsi-{self.lsi_dim}" | ||||
|         return s | ||||
|                           | ||||
| def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all'): | ||||
|     """Run affinity clustering once or more with different parameters. | ||||
|      | ||||
|     Usage: | ||||
|     affinity_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> --lsi_dimensions=<"all"|csv> | ||||
| 
 | ||||
|     Keyword arguments: | ||||
|     savefile: path to save the metadata and diagnostics. | ||||
|     inpath: path to a folder containing feather files with LSI similarity labeled matrices of subreddit similarities. | ||||
|     outpath: path to output fit affinity clusterings. | ||||
|     dampings: one or more numbers in [0.5, 1); the damping parameter in affinity propagation clustering. | ||||
|     preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter. | ||||
|     convergence_iters: one or more integers; the number of iterations without improvement before stopping. | ||||
|     max_iters: one or more integers; the maximum number of iterations. | ||||
|     lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. | ||||
|     """ | ||||
|      | ||||
|     obj = affinity_lsi_grid_sweep(inpath, | ||||
|                             lsi_dimensions, | ||||
|                             outpath, | ||||
|                             map(float,dampings), | ||||
|                             map(int,max_iters), | ||||
|                             map(int,convergence_iters), | ||||
|                             map(float,preference_quantiles)) | ||||
| 
 | ||||
|     obj.run(1) | ||||
|     obj.save(savefile) | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     fire.Fire(run_affinity_lsi_grid_sweep) | ||||
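# Input-layout note, an assumption based on the lsi_grid_sweep logic that moved to
# lsi_base: INPATH is a directory holding one feather file per LSI dimensionality,
# and --lsi_dimensions selects files by stem.
# from pathlib import Path
# inpath = Path("<lsi_similarities_dir>")            # hypothetical directory
# dims_all = [p.stem for p in inpath.glob("*")]      # what lsi_dimensions='all' sweeps
# dims_sel = [inpath / (d + ".feather") for d in ["100", "300"]]  # explicit dimensions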
| @@ -3,59 +3,6 @@ import numpy as np | ||||
| import pandas as pd | ||||
| from dataclasses import dataclass | ||||
| from sklearn.metrics import silhouette_score, silhouette_samples | ||||
| from itertools import product, chain | ||||
| from multiprocessing import Pool, cpu_count | ||||
| 
 | ||||
| def sim_to_dist(mat): | ||||
|     dist = 1-mat | ||||
|     dist[dist < 0] = 0 | ||||
|     np.fill_diagonal(dist,0) | ||||
|     return dist | ||||
| 
 | ||||
| class grid_sweep: | ||||
|     def __init__(self, jobtype, inpath, outpath, namer, *args): | ||||
|         self.jobtype = jobtype | ||||
|         self.namer = namer | ||||
|         grid = list(product(*args)) | ||||
|         inpath = Path(inpath) | ||||
|         outpath = Path(outpath) | ||||
|         self.hasrun = False | ||||
|         self.grid = [(inpath,outpath,namer(*g)) + g for g in grid] | ||||
|         self.jobs = [jobtype(*g) for g in self.grid] | ||||
| 
 | ||||
|     def run(self, cores=20): | ||||
|         if cores is not None and cores > 1: | ||||
|             with Pool(cores) as pool: | ||||
|                 infos = pool.map(self.jobtype.get_info, self.jobs) | ||||
|         else: | ||||
|             infos = map(self.jobtype.get_info, self.jobs) | ||||
| 
 | ||||
|         self.infos = pd.DataFrame(infos) | ||||
|         self.hasrun = True | ||||
| 
 | ||||
|     def save(self, outcsv): | ||||
|         if not self.hasrun: | ||||
|             self.run() | ||||
|         outcsv = Path(outcsv) | ||||
|         outcsv.parent.mkdir(parents=True, exist_ok=True) | ||||
|         self.infos.to_csv(outcsv) | ||||
| 
 | ||||
| 
 | ||||
| class lsi_grid_sweep(grid_sweep): | ||||
|     def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs): | ||||
|         self.jobtype = jobtype | ||||
|         self.subsweep = subsweep | ||||
|         inpath = Path(inpath) | ||||
|         if lsi_dimensions == 'all': | ||||
|             lsi_paths = list(inpath.glob("*")) | ||||
|         else: | ||||
|             lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] | ||||
| 
 | ||||
|         lsi_nums = [p.stem for p in lsi_paths] | ||||
|         self.hasrun = False | ||||
|         self.subgrids = [self.subsweep(lsi_path, outpath,  lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] | ||||
|         self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) | ||||
| 
 | ||||
| 
 | ||||
| # this is meant to be an interface, not created directly | ||||
| class clustering_job: | ||||
| @@ -86,19 +33,24 @@ class clustering_job: | ||||
|                                         name=self.name, | ||||
|                                         n_clusters=self.n_clusters, | ||||
|                                         n_isolates=self.n_isolates, | ||||
|                                         silhouette_samples = str(self.silsampout.resolve()) | ||||
|                                         silhouette_samples = self.silsampout | ||||
|                                         ) | ||||
|         return self.result | ||||
| 
 | ||||
|     def silhouette(self): | ||||
|         isolates = self.clustering.labels_ == -1 | ||||
|         scoremat = self.mat[~isolates][:,~isolates] | ||||
|         score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed') | ||||
|         silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed') | ||||
|         silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp}) | ||||
|         self.outpath.mkdir(parents=True, exist_ok=True) | ||||
|         self.silsampout = self.outpath / ("silhouette_samples-" + self.name +  ".feather") | ||||
|         silhouette_samp.to_feather(self.silsampout) | ||||
|         if scoremat.shape[0] > 0: | ||||
|             score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed') | ||||
|             silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed') | ||||
|             silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp}) | ||||
|             self.outpath.mkdir(parents=True, exist_ok=True) | ||||
|             silsampout = self.outpath / ("silhouette_samples-" + self.name +  ".feather") | ||||
|             self.silsampout = silsampout.resolve() | ||||
|             silhouette_samp.to_feather(self.silsampout) | ||||
|         else: | ||||
|             score = None | ||||
|             self.silsampout = None | ||||
|         return score | ||||
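# Hedged sketch of inspecting the per-subreddit silhouette diagnostics written
# above; it assumes the feather keeps the 'subreddit' and 'score' columns and that
# the filename follows the "silhouette_samples-<name>.feather" pattern used here.
# import pandas as pd
# samples = pd.read_feather("<outpath>/silhouette_samples-<name>.feather")
# worst_fit = samples.sort_values("score").head(10)   # subreddits the clustering fits worst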
| 
 | ||||
|     def read_distance_mat(self, similarities, use_threads=True): | ||||
| @@ -139,11 +91,6 @@ class clustering_job: | ||||
| 
 | ||||
|         return cluster_data | ||||
| 
 | ||||
| 
 | ||||
| class lsi_mixin(): | ||||
|     def set_lsi_dims(self, lsi_dims): | ||||
|         self.lsi_dims = lsi_dims | ||||
| 
 | ||||
| @dataclass | ||||
| class clustering_result: | ||||
|     outpath:Path | ||||
| @@ -152,7 +99,3 @@ class clustering_result: | ||||
|     n_clusters:int | ||||
|     n_isolates:int | ||||
|     silhouette_samples:str | ||||
| 
 | ||||
| @dataclass | ||||
| class lsi_result_mixin: | ||||
|     lsi_dimensions:int | ||||
|  | ||||
clustering/grid_sweep.py (new file, 32 lines)
| @@ -0,0 +1,32 @@ | ||||
| from pathlib import Path | ||||
| from multiprocessing import Pool, cpu_count | ||||
| from itertools import product, chain | ||||
| import pandas as pd | ||||
| 
 | ||||
| class grid_sweep: | ||||
|     def __init__(self, jobtype, inpath, outpath, namer, *args): | ||||
|         self.jobtype = jobtype | ||||
|         self.namer = namer | ||||
|         grid = list(product(*args)) | ||||
|         inpath = Path(inpath) | ||||
|         outpath = Path(outpath) | ||||
|         self.hasrun = False | ||||
|         self.grid = [(inpath,outpath,namer(*g)) + g for g in grid] | ||||
|         self.jobs = [jobtype(*g) for g in self.grid] | ||||
| 
 | ||||
|     def run(self, cores=20): | ||||
|         if cores is not None and cores > 1: | ||||
|             with Pool(cores) as pool: | ||||
|                 infos = pool.map(self.jobtype.get_info, self.jobs) | ||||
|         else: | ||||
|             infos = map(self.jobtype.get_info, self.jobs) | ||||
| 
 | ||||
|         self.infos = pd.DataFrame(infos) | ||||
|         self.hasrun = True | ||||
| 
 | ||||
|     def save(self, outcsv): | ||||
|         if not self.hasrun: | ||||
|             self.run() | ||||
|         outcsv = Path(outcsv) | ||||
|         outcsv.parent.mkdir(parents=True, exist_ok=True) | ||||
|         self.infos.to_csv(outcsv) | ||||
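# A hedged sketch of how a concrete sweep plugs into grid_sweep; 'my_job' and
# 'namer' are hypothetical, but the constructor arguments match the class above.
# def namer(alpha): return f"alpha-{alpha}"
# sweep = grid_sweep(my_job,                 # a clustering_job subclass taking (infile, outpath, name, alpha)
#                    "<in.feather>", "<outdir>", namer,
#                    [0.1, 0.5])             # trailing iterables form the parameter grid (itertools.product)
# sweep.run(cores=4)                         # one job per grid point, run in a Pool
# sweep.save("<outdir>/selection_data.csv")  # DataFrame built from each job's get_info()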
| @@ -1,5 +1,5 @@ | ||||
| from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat | ||||
| from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep | ||||
| from clustering_base import clustering_result, clustering_job | ||||
| from grid_sweep import grid_sweep | ||||
| from dataclasses import dataclass | ||||
| import hdbscan | ||||
| from sklearn.neighbors import NearestNeighbors | ||||
| @@ -7,11 +7,8 @@ import plotnine as pn | ||||
| import numpy as np | ||||
| from itertools import product, starmap, chain | ||||
| import pandas as pd | ||||
| from sklearn.metrics import silhouette_score, silhouette_samples | ||||
| from pathlib import Path | ||||
| from multiprocessing import Pool, cpu_count | ||||
| from multiprocessing import cpu_count | ||||
| import fire | ||||
| from pyarrow.feather import write_feather | ||||
| 
 | ||||
| def test_select_hdbscan_clustering(): | ||||
|     # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", | ||||
| @@ -40,28 +37,6 @@ def test_select_hdbscan_clustering(): | ||||
|     # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") | ||||
|     # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") | ||||
|     # c = check_clusters.merge(silscores,on='subreddit')#    fire.Fire(select_hdbscan_clustering) | ||||
| 
 | ||||
| class hdbscan_lsi_grid_sweep(lsi_grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  lsi_dims, | ||||
|                  outpath, | ||||
|                  min_cluster_sizes, | ||||
|                  min_samples, | ||||
|                  cluster_selection_epsilons, | ||||
|                  cluster_selection_methods | ||||
|                  ): | ||||
| 
 | ||||
|         super().__init__(hdbscan_lsi_job, | ||||
|                          _hdbscan_lsi_grid_sweep, | ||||
|                          inpath, | ||||
|                          lsi_dims, | ||||
|                          outpath, | ||||
|                          min_cluster_sizes, | ||||
|                          min_samples, | ||||
|                          cluster_selection_epsilons, | ||||
|                          cluster_selection_methods) | ||||
|          | ||||
| class hdbscan_grid_sweep(grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
| @@ -78,25 +53,6 @@ class hdbscan_grid_sweep(grid_sweep): | ||||
|               cluster_selection_method): | ||||
|         return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}" | ||||
| 
 | ||||
| 
 | ||||
| class _hdbscan_lsi_grid_sweep(grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  outpath, | ||||
|                  lsi_dim, | ||||
|                  *args, | ||||
|                  **kwargs): | ||||
| 
 | ||||
|         self.lsi_dim = lsi_dim | ||||
|         self.jobtype = hdbscan_lsi_job | ||||
|         super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) | ||||
| 
 | ||||
| 
 | ||||
|     def namer(self, *args, **kwargs): | ||||
|         s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs) | ||||
|         s += f"_lsi-{self.lsi_dim}" | ||||
|         return s | ||||
| 
 | ||||
| @dataclass | ||||
| class hdbscan_clustering_result(clustering_result): | ||||
|     min_cluster_size:int | ||||
| @@ -104,10 +60,6 @@ class hdbscan_clustering_result(clustering_result): | ||||
|     cluster_selection_epsilon:float | ||||
|     cluster_selection_method:str | ||||
| 
 | ||||
| @dataclass | ||||
| class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin): | ||||
|     pass  | ||||
| 
 | ||||
| class hdbscan_job(clustering_job): | ||||
|     def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): | ||||
|         super().__init__(infile, | ||||
| @@ -148,121 +100,29 @@ class hdbscan_job(clustering_job): | ||||
|                                                 cluster_selection_method=self.cluster_selection_method) | ||||
|         return self.result | ||||
| 
 | ||||
| class hdbscan_lsi_job(hdbscan_job, lsi_mixin): | ||||
|     def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): | ||||
|         super().__init__( | ||||
|                          infile, | ||||
|                          outpath, | ||||
|                          name, | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
|         super().set_lsi_dims(lsi_dims) | ||||
| def run_hdbscan_grid_sweep(savefile, inpath, outpath,  min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']): | ||||
|     """Run hdbscan clustering once or more with different parameters. | ||||
|      | ||||
|     def get_info(self): | ||||
|         partial_result = super().get_info() | ||||
|         self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__, | ||||
|                                                     lsi_dimensions=self.lsi_dims) | ||||
|         return self.result | ||||
|     Usage: | ||||
|     hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf"> | ||||
| 
 | ||||
| # def select_hdbscan_clustering(inpath, | ||||
| #                               outpath, | ||||
| #                               outfile=None, | ||||
| #                               min_cluster_sizes=[2], | ||||
| #                               min_samples=[1], | ||||
| #                               cluster_selection_epsilons=[0], | ||||
| #                               cluster_selection_methods=['eom'], | ||||
| #                               lsi_dimensions='all' | ||||
| #                               ): | ||||
| 
 | ||||
| #     inpath = Path(inpath) | ||||
| #     outpath = Path(outpath) | ||||
| #     outpath.mkdir(exist_ok=True, parents=True) | ||||
|      | ||||
| #     if lsi_dimensions is None: | ||||
| #         lsi_paths = [inpath] | ||||
| #     elif lsi_dimensions == 'all': | ||||
| #         lsi_paths = list(inpath.glob("*")) | ||||
| 
 | ||||
| #     else: | ||||
| #         lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] | ||||
| 
 | ||||
| #     if lsi_dimensions is not None: | ||||
| #         lsi_nums = [p.stem for p in lsi_paths] | ||||
| #     else: | ||||
| #         lsi_nums = [None] | ||||
| #     grid = list(product(lsi_nums, | ||||
| #                         min_cluster_sizes, | ||||
| #                         min_samples, | ||||
| #                         cluster_selection_epsilons, | ||||
| #                         cluster_selection_methods)) | ||||
| 
 | ||||
| #     # fix the output file names | ||||
| #     names = list(map(lambda t:'_'.join(map(str,t)),grid)) | ||||
| 
 | ||||
| #     grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)] | ||||
|          | ||||
| #     with Pool(int(cpu_count()/4)) as pool: | ||||
| #         mods = starmap(hdbscan_clustering, grid) | ||||
| 
 | ||||
| #     res = pd.DataFrame(mods) | ||||
| #     if outfile is None: | ||||
| #         outfile = outpath / "selection_data.csv" | ||||
| 
 | ||||
| #     res.to_csv(outfile) | ||||
| 
 | ||||
| # def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): | ||||
| #     subreddits, mat = read_similarity_mat(similarities) | ||||
| #     mat = sim_to_dist(mat) | ||||
| #     clustering = _hdbscan_clustering(mat, | ||||
| #                                      min_cluster_size=min_cluster_size, | ||||
| #                                      min_samples=min_samples, | ||||
| #                                      cluster_selection_epsilon=cluster_selection_epsilon, | ||||
| #                                      cluster_selection_method=cluster_selection_method, | ||||
| #                                      metric='precomputed', | ||||
| #                                      core_dist_n_jobs=cpu_count() | ||||
| #                                      ) | ||||
| 
 | ||||
| #     cluster_data = process_clustering_result(clustering, subreddits) | ||||
| #     isolates = clustering.labels_ == -1 | ||||
| #     scoremat = mat[~isolates][:,~isolates] | ||||
| #     score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed') | ||||
| #     cluster_data.to_feather(output) | ||||
| #     silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed') | ||||
| #     silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp}) | ||||
| #     silsampout = output.parent / ("silhouette_samples" + output.name) | ||||
| #     silhouette_samp.to_feather(silsampout) | ||||
| 
 | ||||
| #     result = hdbscan_clustering_result(outpath=output, | ||||
| #                                        silhouette_samples=silsampout, | ||||
| #                                        silhouette_score=score, | ||||
| #                                        name=name, | ||||
| #                                        min_cluster_size=min_cluster_size, | ||||
| #                                        min_samples=min_samples, | ||||
| #                                        cluster_selection_epsilon=cluster_selection_epsilon, | ||||
| #                                        cluster_selection_method=cluster_selection_method, | ||||
| #                                        lsi_dimensions=lsi_dim, | ||||
| #                                        n_isolates=isolates.sum(), | ||||
| #                                        n_clusters=len(set(clustering.labels_)) | ||||
| #                                    ) | ||||
| 
 | ||||
| 
 | ||||
|                                         | ||||
| #     return(result) | ||||
| 
 | ||||
| # # for all runs we should try cluster_selection_epsilon = None | ||||
| # # for terms we should try cluster_selection_epsilon around 0.56-0.66 | ||||
| # # for authors we should try cluster_selection_epsilon around 0.98-0.99 | ||||
| # def _hdbscan_clustering(mat, *args, **kwargs): | ||||
| #     print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}") | ||||
| 
 | ||||
| #     print(mat) | ||||
| #     clusterer = hdbscan.HDBSCAN(*args, | ||||
| #                                 **kwargs, | ||||
| #                                 ) | ||||
|      | ||||
| #     clustering = clusterer.fit(mat.astype('double')) | ||||
|      | ||||
| #     return(clustering) | ||||
|     Keyword arguments: | ||||
|     savefile: path to save the metadata and diagnostics. | ||||
|     inpath: path to feather data containing a labeled matrix of subreddit similarities. | ||||
|     outpath: path to output fit hdbscan clusterings. | ||||
|     min_cluster_sizes: one or more integers indicating the minimum cluster size | ||||
|     min_samples: one or more integers indicating the minimum number of samples used in the algorithm | ||||
|     cluster_selection_epsilons: one or more similarity thresholds for the transition from dbscan to hdbscan | ||||
|     cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters. | ||||
|     """     | ||||
|     obj = hdbscan_grid_sweep(inpath, | ||||
|                              outpath, | ||||
|                              map(int,min_cluster_sizes), | ||||
|                              map(int,min_samples), | ||||
|                              map(float,cluster_selection_epsilons), | ||||
|                              cluster_selection_methods) | ||||
|     obj.run() | ||||
|     obj.save(savefile) | ||||
| 
 | ||||
| def KNN_distances_plot(mat,outname,k=2): | ||||
|     nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) | ||||
| @ -293,10 +153,7 @@ def make_KNN_plots(): | ||||
|     KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     fire.Fire{'grid_sweep':hdbscan_grid_sweep, | ||||
|               'grid_sweep_lsi':hdbscan_lsi_grid_sweep | ||||
|               'cluster':hdbscan_job, | ||||
|               'cluster_lsi':hdbscan_lsi_job} | ||||
|     fire.Fire(run_hdbscan_grid_sweep) | ||||
|      | ||||
| #    test_select_hdbscan_clustering() | ||||
|     #fire.Fire(select_hdbscan_clustering)   | ||||
|  | ||||
clustering/hdbscan_clustering_lsi.py (new file, 101 lines)
| @@ -0,0 +1,101 @@ | ||||
| from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result | ||||
| from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin | ||||
| from grid_sweep import grid_sweep | ||||
| import fire | ||||
| from dataclasses import dataclass | ||||
| 
 | ||||
| @dataclass | ||||
| class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin): | ||||
|     pass  | ||||
| 
 | ||||
| class hdbscan_lsi_job(hdbscan_job, lsi_mixin): | ||||
|     def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): | ||||
|         super().__init__( | ||||
|                          infile, | ||||
|                          outpath, | ||||
|                          name, | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
|         super().set_lsi_dims(lsi_dims) | ||||
| 
 | ||||
|     def get_info(self): | ||||
|         partial_result = super().get_info() | ||||
|         self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__, | ||||
|                                                     lsi_dimensions=self.lsi_dims) | ||||
|         return self.result | ||||
| 
 | ||||
| class hdbscan_lsi_grid_sweep(lsi_grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  lsi_dims, | ||||
|                  outpath, | ||||
|                  min_cluster_sizes, | ||||
|                  min_samples, | ||||
|                  cluster_selection_epsilons, | ||||
|                  cluster_selection_methods | ||||
|                  ): | ||||
| 
 | ||||
|         super().__init__(hdbscan_lsi_job, | ||||
|                          _hdbscan_lsi_grid_sweep, | ||||
|                          inpath, | ||||
|                          lsi_dims, | ||||
|                          outpath, | ||||
|                          min_cluster_sizes, | ||||
|                          min_samples, | ||||
|                          cluster_selection_epsilons, | ||||
|                          cluster_selection_methods) | ||||
|          | ||||
| 
 | ||||
| 
 | ||||
| class _hdbscan_lsi_grid_sweep(grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  outpath, | ||||
|                  lsi_dim, | ||||
|                  *args, | ||||
|                  **kwargs): | ||||
|         print(args) | ||||
|         print(kwargs) | ||||
| 
 | ||||
|         self.lsi_dim = lsi_dim | ||||
|         self.jobtype = hdbscan_lsi_job | ||||
|         super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) | ||||
| 
 | ||||
| 
 | ||||
|     def namer(self, *args, **kwargs): | ||||
|         s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs) | ||||
|         s += f"_lsi-{self.lsi_dim}" | ||||
|         return s | ||||
| 
 | ||||
| def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath,  min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'],lsi_dimensions='all'): | ||||
|     """Run hdbscan clustering once or more with different parameters. | ||||
|      | ||||
|     Usage: | ||||
|     hdbscan_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf"> --lsi_dimensions=<"all"|csv> | ||||
| 
 | ||||
|     Keyword arguments: | ||||
|     savefile: path to save the metadata and diagnostics. | ||||
|     inpath: path to a folder containing feather files with LSI similarity labeled matrices of subreddit similarities. | ||||
|     outpath: path to output fit clusterings. | ||||
|     min_cluster_sizes: one or more integers indicating the minimum cluster size | ||||
|     min_samples: one or more integers indicating the minimum number of samples used in the algorithm | ||||
|     cluster_selection_epsilons: one or more similarity thresholds for the transition from dbscan to hdbscan | ||||
|     cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters. | ||||
|     lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. | ||||
|     """     | ||||
| 
 | ||||
|     obj = hdbscan_lsi_grid_sweep(inpath, | ||||
|                                  lsi_dimensions, | ||||
|                                  outpath, | ||||
|                                  map(int,min_cluster_sizes), | ||||
|                                  map(int,min_samples), | ||||
|                                  map(float,cluster_selection_epsilons), | ||||
|                                  cluster_selection_methods | ||||
|                                  ) | ||||
| 
 | ||||
|     obj.run(10) | ||||
|     obj.save(savefile) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     fire.Fire(run_hdbscan_lsi_grid_sweep) | ||||
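# Hedged sketch: the Makefile's hdbscan_selection_grid corresponds roughly to this
# programmatic call (the paths are placeholders, not ones from this repository).
# run_hdbscan_lsi_grid_sweep(savefile="<out>/selection_data.csv",
#                            inpath="<lsi_similarities_dir>", outpath="<out>",
#                            min_cluster_sizes=[2,3,4,5], min_samples=[2,3,4,5],
#                            cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2],
#                            cluster_selection_methods=['eom','leaf'],
#                            lsi_dimensions='all')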
| @@ -1,11 +1,9 @@ | ||||
| from sklearn.cluster import KMeans | ||||
| import fire | ||||
| from pathlib import Path | ||||
| from multiprocessing import cpu_count | ||||
| from dataclasses import dataclass | ||||
| from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat | ||||
| from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep | ||||
| 
 | ||||
| from clustering_base import clustering_result, clustering_job | ||||
| from grid_sweep import grid_sweep | ||||
| 
 | ||||
| @dataclass | ||||
| class kmeans_clustering_result(clustering_result): | ||||
| @@ -13,10 +11,6 @@ class kmeans_clustering_result(clustering_result): | ||||
|     n_init:int | ||||
|     max_iter:int | ||||
| 
 | ||||
| @dataclass | ||||
| class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin): | ||||
|     pass | ||||
| 
 | ||||
| class kmeans_job(clustering_job): | ||||
|     def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True): | ||||
|         super().__init__(infile, | ||||
| @@ -45,28 +39,13 @@ class kmeans_job(clustering_job): | ||||
|     def get_info(self): | ||||
|         result = super().get_info() | ||||
|         self.result = kmeans_clustering_result(**result.__dict__, | ||||
|                                                n_init=n_init, | ||||
|                                                max_iter=max_iter) | ||||
|         return self.result | ||||
| 
 | ||||
| 
 | ||||
| class kmeans_lsi_job(kmeans_job, lsi_mixin): | ||||
|     def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): | ||||
|         super().__init__(infile, | ||||
|                          outpath, | ||||
|                          name, | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
|         super().set_lsi_dims(lsi_dims) | ||||
| 
 | ||||
|     def get_info(self): | ||||
|         result = super().get_info() | ||||
|         self.result = kmeans_clustering_result_lsi(**result.__dict__, | ||||
|                                                    lsi_dimensions=self.lsi_dims) | ||||
|                                                n_init=self.n_init, | ||||
|                                                max_iter=self.max_iter) | ||||
|         return self.result | ||||
| 
 | ||||
| 
 | ||||
| class kmeans_grid_sweep(grid_sweep): | ||||
|         | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  outpath, | ||||
| @ -80,49 +59,7 @@ class kmeans_grid_sweep(grid_sweep): | ||||
|              max_iter): | ||||
|         return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}" | ||||
| 
 | ||||
| class _kmeans_lsi_grid_sweep(grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  outpath, | ||||
|                  lsi_dim, | ||||
|                  *args, | ||||
|                  **kwargs): | ||||
|         self.lsi_dim = lsi_dim | ||||
|         self.jobtype = kmeans_lsi_job | ||||
|         super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) | ||||
| 
 | ||||
|     def namer(self, *args, **kwargs): | ||||
|         s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs) | ||||
|         s += f"_lsi-{self.lsi_dim}" | ||||
|         return s | ||||
| 
 | ||||
| class kmeans_lsi_grid_sweep(lsi_grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  lsi_dims, | ||||
|                  outpath, | ||||
|                  n_clusters, | ||||
|                  n_inits, | ||||
|                  max_iters | ||||
|                  ): | ||||
| 
 | ||||
|         super().__init__(kmeans_lsi_job, | ||||
|                          _kmeans_lsi_grid_sweep, | ||||
|                          inpath, | ||||
|                          lsi_dims, | ||||
|                          outpath, | ||||
|                          n_clusters, | ||||
|                          n_inits, | ||||
|                          max_iters) | ||||
| 
 | ||||
| def test_select_kmeans_clustering(): | ||||
|     # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", | ||||
|     #                           "test_hdbscan_author30k", | ||||
|     #                           min_cluster_sizes=[2], | ||||
|     #                           min_samples=[1,2], | ||||
|     #                           cluster_selection_epsilons=[0,0.05,0.1,0.15], | ||||
|     #                           cluster_selection_methods=['eom','leaf'], | ||||
|     #                           lsi_dimensions='all') | ||||
|     inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/" | ||||
|     outpath = "test_kmeans"; | ||||
|     n_clusters=[200,300,400]; | ||||
| @ -139,10 +76,30 @@ def test_select_kmeans_clustering(): | ||||
|     gs.run(20) | ||||
|     gs.save("test_hdbscan/lsi_sweep.csv") | ||||
| 
 | ||||
| def run_kmeans_grid_sweep(savefile, inpath, outpath,  n_clusters=[500], n_inits=[1], max_iters=[3000]): | ||||
|     """Run kmeans clustering once or more with different parameters. | ||||
|      | ||||
|     Usage: | ||||
|     kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv> | ||||
| 
 | ||||
|     Keyword arguments: | ||||
|     savefile: path to save the metadata and diagnostics  | ||||
|     inpath: path to feather data containing a labeled matrix of subreddit similarities. | ||||
|     outpath: path to output fit kmeans clusterings. | ||||
|     n_clusters: one or more numbers of kmeans clusters to select. | ||||
|     n_inits: one or more numbers of different initializations to use for each clustering. | ||||
|     max_iters: one or more values for the maximum number of iterations. | ||||
|     """     | ||||
| 
 | ||||
|     obj = kmeans_grid_sweep(inpath, | ||||
|                             outpath, | ||||
|                             map(int,n_clusters), | ||||
|                             map(int,n_inits), | ||||
|                             map(int,max_iters)) | ||||
| 
 | ||||
| 
 | ||||
|     obj.run(1) | ||||
|     obj.save(savefile) | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
| 
 | ||||
|     fire.Fire({'grid_sweep':kmeans_grid_sweep, | ||||
|               'grid_sweep_lsi':kmeans_lsi_grid_sweep, | ||||
|               'cluster':kmeans_job, | ||||
|               'cluster_lsi':kmeans_lsi_job}) | ||||
|     fire.Fire(run_kmeans_grid_sweep) | ||||
|  | ||||
							
								
								
									
clustering/kmeans_clustering_lsi.py (new file, 93 lines)
| @ -0,0 +1,93 @@ | ||||
| import fire | ||||
| from dataclasses import dataclass | ||||
| from kmeans_clustering import kmeans_job, kmeans_clustering_result, kmeans_grid_sweep | ||||
| from lsi_base import lsi_mixin, lsi_result_mixin, lsi_grid_sweep | ||||
| from grid_sweep import grid_sweep | ||||
| 
 | ||||
| @dataclass | ||||
| class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin): | ||||
|     pass | ||||
| 
 | ||||
| class kmeans_lsi_job(kmeans_job, lsi_mixin): | ||||
|     def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): | ||||
|         super().__init__(infile, | ||||
|                          outpath, | ||||
|                          name, | ||||
|                          *args, | ||||
|                          **kwargs) | ||||
|         super().set_lsi_dims(lsi_dims) | ||||
| 
 | ||||
|     def get_info(self): | ||||
|         result = super().get_info() | ||||
|         self.result = kmeans_clustering_result_lsi(**result.__dict__, | ||||
|                                                    lsi_dimensions=self.lsi_dims) | ||||
|         return self.result | ||||
| 
 | ||||
| class _kmeans_lsi_grid_sweep(grid_sweep): | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  outpath, | ||||
|                  lsi_dim, | ||||
|                  *args, | ||||
|                  **kwargs): | ||||
|         print(args) | ||||
|         print(kwargs) | ||||
|         self.lsi_dim = lsi_dim | ||||
|         self.jobtype = kmeans_lsi_job | ||||
|         super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) | ||||
| 
 | ||||
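|     # Reuse kmeans_grid_sweep.namer for the kmeans parameters (skipping the first positional | ||||
|     # argument) and append the LSI dimension, so each fitted clustering gets a distinct filename. | ||||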
|     def namer(self, *args, **kwargs): | ||||
|         s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs) | ||||
|         s += f"_lsi-{self.lsi_dim}" | ||||
|         return s | ||||
| 
 | ||||
| class kmeans_lsi_grid_sweep(lsi_grid_sweep): | ||||
| 
 | ||||
|     def __init__(self, | ||||
|                  inpath, | ||||
|                  lsi_dims, | ||||
|                  outpath, | ||||
|                  n_clusters, | ||||
|                  n_inits, | ||||
|                  max_iters | ||||
|                  ): | ||||
| 
 | ||||
|         super().__init__(kmeans_lsi_job, | ||||
|                          _kmeans_lsi_grid_sweep, | ||||
|                          inpath, | ||||
|                          lsi_dims, | ||||
|                          outpath, | ||||
|                          n_clusters, | ||||
|                          n_inits, | ||||
|                          max_iters) | ||||
| 
 | ||||
| def run_kmeans_lsi_grid_sweep(savefile, inpath, outpath,  n_clusters=[500], n_inits=[1], max_iters=[3000], lsi_dimensions="all"): | ||||
|     """Run kmeans clustering once or more with different parameters. | ||||
|      | ||||
|     Usage: | ||||
|     kmeans_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --lsi_dimensions=<"all"|csv number of LSI dimensions to use> --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv> | ||||
| 
 | ||||
|     Keyword arguments: | ||||
|     savefile: path to save the metadata and diagnostics  | ||||
|     inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities. | ||||
|     outpath: path to output fit kmeans clusterings. | ||||
|     lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. | ||||
|     n_clusters: one or more numbers of kmeans clusters to select. | ||||
|     n_inits: one or more numbers of different initializations to use for each clustering. | ||||
|     max_iters: one or more values for the maximum number of iterations. | ||||
|     """     | ||||
| 
 | ||||
|     obj = kmeans_lsi_grid_sweep(inpath, | ||||
|                                 lsi_dimensions, | ||||
|                                 outpath, | ||||
|                                 list(map(int,n_clusters)), | ||||
|                                 list(map(int,n_inits)), | ||||
|                                 list(map(int,max_iters)) | ||||
|                                 ) | ||||
| 
 | ||||
|     obj.run(1) | ||||
|     obj.save(savefile) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     fire.Fire(run_kmeans_lsi_grid_sweep) | ||||
							
								
								
									
clustering/lsi_base.py (new file, 28 lines)
| @ -0,0 +1,28 @@ | ||||
| from clustering_base import clustering_job, clustering_result | ||||
| from grid_sweep import grid_sweep | ||||
| from dataclasses import dataclass | ||||
| from itertools import chain | ||||
| from pathlib import Path | ||||
| 
 | ||||
| class lsi_mixin(): | ||||
|     def set_lsi_dims(self, lsi_dims): | ||||
|         self.lsi_dims = lsi_dims | ||||
| 
 | ||||
| @dataclass | ||||
| class lsi_result_mixin: | ||||
|     lsi_dimensions:int | ||||
| 
 | ||||
| class lsi_grid_sweep(grid_sweep): | ||||
|     def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs): | ||||
|         self.jobtype = jobtype | ||||
|         self.subsweep = subsweep | ||||
|         inpath = Path(inpath) | ||||
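|         # Each LSI dimensionality is assumed to be stored as its own feather file under inpath; | ||||
|         # "all" sweeps every file found there, otherwise only the named dimensions are used. | ||||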
|         if lsi_dimensions == 'all': | ||||
|             lsi_paths = list(inpath.glob("*")) | ||||
|         else: | ||||
|             lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] | ||||
| 
 | ||||
|         lsi_nums = [p.stem for p in lsi_paths] | ||||
|         self.hasrun = False | ||||
|         self.subgrids = [self.subsweep(lsi_path, outpath,  lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] | ||||
|         self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) | ||||