changes from dirty branch.
This commit is contained in:
@@ -4,7 +4,7 @@ similarity_data=/gscratch/comdata/output/reddit_similarity
|
||||
clustering_data=/gscratch/comdata/output/reddit_clustering
|
||||
kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
|
||||
|
||||
umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10]
|
||||
umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10,15,25]
|
||||
|
||||
hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
|
||||
affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
|
||||
|
||||
@@ -21,9 +21,9 @@ class clustering_job:
|
||||
self.subreddits, self.mat = self.read_distance_mat(self.infile)
|
||||
self.clustering = self.call(self.mat, *self.args, **self.kwargs)
|
||||
self.cluster_data = self.process_clustering(self.clustering, self.subreddits)
|
||||
self.score = self.silhouette()
|
||||
self.outpath.mkdir(parents=True, exist_ok=True)
|
||||
self.cluster_data.to_feather(self.outpath/(self.name + ".feather"))
|
||||
|
||||
self.hasrun = True
|
||||
self.cleanup()
|
||||
|
||||
@@ -62,6 +62,7 @@ class clustering_job:
|
||||
else:
|
||||
score = None
|
||||
self.silsampout = None
|
||||
|
||||
return score
|
||||
|
||||
def read_distance_mat(self, similarities, use_threads=True):
|
||||
@@ -81,9 +82,13 @@ class clustering_job:
|
||||
self.n_clusters = len(set(clusters))
|
||||
|
||||
print(f"found {self.n_clusters} clusters")
|
||||
|
||||
cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_})
|
||||
|
||||
|
||||
self.score = self.silhouette()
|
||||
print(f"silhouette_score:{self.score}")
|
||||
|
||||
|
||||
cluster_sizes = cluster_data.groupby("cluster").count().reset_index()
|
||||
print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members")
|
||||
|
||||
@@ -125,7 +130,7 @@ class twoway_clustering_job(clustering_job):
|
||||
self.after_run()
|
||||
self.cleanup()
|
||||
|
||||
def after_run():
|
||||
def after_run(self):
|
||||
self.score = self.silhouette()
|
||||
self.outpath.mkdir(parents=True, exist_ok=True)
|
||||
print(self.outpath/(self.name+".feather"))
|
||||
|
||||
@@ -110,7 +110,7 @@ class umap_hdbscan_job(twoway_clustering_job):
|
||||
self.cluster_selection_method = hdbscan_args['cluster_selection_method']
|
||||
|
||||
def after_run(self):
|
||||
coords = self.step1.emedding_
|
||||
coords = self.step1.embedding_
|
||||
self.cluster_data['x'] = coords[:,0]
|
||||
self.cluster_data['y'] = coords[:,1]
|
||||
super().after_run()
|
||||
|
||||
Reference in New Issue
Block a user