
Use Latent semantic indexing and hdbscan

Nate E TeBlunthuis
2021-05-02 23:39:55 -07:00
parent 36b24ee933
commit 7df8436067
14 changed files with 835 additions and 373 deletions
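For context: the LSI inputs this commit starts consuming (directories like subreddit_comment_authors-tf_30k_LSI/ holding one <dim>.feather per dimensionality) come from an upstream latent semantic indexing step that is not part of this diff. A minimal sketch of such a step, assuming a subreddit-by-term tf-idf matrix; the function name, paths, and dims are hypothetical, not this repo's pipeline:

# Hedged LSI sketch: truncated SVD embedding, then cosine similarities,
# written per dimensionality like the <dim>.feather files consumed below.
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

def write_lsi_similarities(tfidf_mat, subreddits, outdir, dims=[500, 1000]):
    svd = TruncatedSVD(n_components=max(dims), random_state=1968)
    emb = svd.fit_transform(tfidf_mat)  # one row per subreddit
    for dim in dims:
        sims = cosine_similarity(emb[:, :dim])
        df = pd.DataFrame(sims, columns=list(subreddits))
        df.insert(0, '_subreddit', list(subreddits))  # matches read_similarity_mat's convention
        df.to_feather(f"{outdir}/{dim}.feather")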

View File

@@ -2,20 +2,41 @@
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
similarity_data=/gscratch/comdata/output/reddit_similarity
clustering_data=/gscratch/comdata/output/reddit_clustering
selection_grid="--max_iter=3000 --convergence_iter=15,30,100 --damping=0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.97,0.99 --preference_quantile=0.1,0.3,0.5,0.7,0.9"
kmeans_selection_grid="--max_iter=3000 --n_init=[10] --n_clusters=[100,500,1000,1500,2000,2500,3000,2350,3500,3570,4000]"
#selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"

all:$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv
all:$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv

# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS

$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k $(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(selection_grid) -J 20

$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/kmeans $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(kmeans_selection_grid)

$(clustering_data)/subreddit_comment_terms_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv $(selection_grid) -J 20

$(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/kmeans $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(kmeans_selection_grid)

$(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(selection_grid) -J 20

$(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(kmeans_selection_grid)

affinity_selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"

$(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/affinity $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20

$(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/affinity $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20

$(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/affinity $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20

clean:
	rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv

.PHONY: clean

# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
#	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
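A note on the grid flags: selection.py hands its arguments to python-fire, which literal-evals flag values, so --n_clusters=[100,500] should arrive as a Python list and --damping=0.5,0.6 as a tuple. A toy sketch of that behavior; demo is a hypothetical stand-in for the selection subcommands, not a file in this repo:

# demo.py -- hypothetical; shows how fire parses the Makefile's grid flags.
import fire

def demo(n_clusters=[1000], n_init=[10], max_iter=3000):
    print(type(n_clusters), n_clusters)

if __name__ == "__main__":
    # python3 demo.py --n_clusters=[100,500] --n_init=[10]
    # expected: <class 'list'> [100, 500]
    fire.Fire(demo)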

View File

@@ -3,24 +3,23 @@
import sys
import pandas as pd
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AffinityPropagation, KMeans
import fire
from pathlib import Path
from multiprocessing import cpu_count
from dataclasses import dataclass
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat

def read_similarity_mat(similarities, use_threads=True):
    df = pd.read_feather(similarities, use_threads=use_threads)
    mat = np.array(df.drop('_subreddit', axis=1))
    n = mat.shape[0]
    mat[range(n), range(n)] = 1
    return (df._subreddit, mat)

def affinity_clustering(similarities, *args, **kwargs):
def affinity_clustering(similarities, output, *args, **kwargs):
    subreddits, mat = read_similarity_mat(similarities)
    return _affinity_clustering(mat, subreddits, *args, **kwargs)
    clustering = _affinity_clustering(mat, *args, **kwargs)
    cluster_data = process_clustering_result(clustering, subreddits)
    cluster_data['algorithm'] = 'affinity'
    return(cluster_data)

def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
    '''
    similarities: feather file with a dataframe of similarity scores
    similarities: matrix of similarity scores
    preference_quantile: parameter controlling how many clusters to make. Higher values = more clusters. 0.85 is a good value with 3000 subreddits.
    damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits-by-author data.
    '''
@@ -40,25 +39,32 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000,
                                     verbose=verbose,
                                     random_state=random_state).fit(mat)

    print(f"clustering took {clustering.n_iter_} iterations")
    clusters = clustering.labels_
    print(f"found {len(set(clusters))} clusters")
    cluster_data = pd.DataFrame({'subreddit': subreddits, 'cluster': clustering.labels_})
    cluster_sizes = cluster_data.groupby("cluster").count()
    print(f"the largest cluster has {cluster_sizes.subreddit.max()} members")
    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
    print(f"{(cluster_sizes.subreddit == 1).sum()} clusters have 1 member")
    sys.stdout.flush()
    cluster_data = process_clustering_result(clustering, subreddits)

    output = Path(output)
    output.parent.mkdir(parents=True, exist_ok=True)
    cluster_data.to_feather(output)
    print(f"saved {output}")
    return clustering

def kmeans_clustering(similarities, *args, **kwargs):
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    clustering = _kmeans_clustering(mat, *args, **kwargs)
    cluster_data = process_clustering_result(clustering, subreddits)
    return(cluster_data)

def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
    clustering = KMeans(n_clusters=n_clusters,
                        n_init=n_init,
                        max_iter=max_iter,
                        random_state=random_state,
                        verbose=verbose
                        ).fit(mat)
    return clustering

if __name__ == "__main__":
    fire.Fire(affinity_clustering)
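A note on the kmeans path above: sklearn's KMeans has no metric='precomputed' option, so fitting on sim_to_dist(mat) clusters each subreddit by its vector of distances to every other subreddit. A toy illustration of that behavior:

# Toy check: KMeans on a distance matrix treats each row as a feature vector.
import numpy as np
from sklearn.cluster import KMeans

sim = np.array([[1.0, 0.9, 0.1],
                [0.9, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
dist = 1 - sim
np.fill_diagonal(dist, 0)

labels = KMeans(n_clusters=2, n_init=10, random_state=1968).fit(dist).labels_
print(labels)  # the two mutually-similar rows should share a label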

View File

@@ -0,0 +1,49 @@
from pathlib import Path
import numpy as np
import pandas as pd
from dataclasses import dataclass

def sim_to_dist(mat):
    dist = 1-mat
    dist[dist < 0] = 0
    np.fill_diagonal(dist, 0)
    return dist

def process_clustering_result(clustering, subreddits):
    if hasattr(clustering, 'n_iter_'):
        print(f"clustering took {clustering.n_iter_} iterations")

    clusters = clustering.labels_
    print(f"found {len(set(clusters))} clusters")

    cluster_data = pd.DataFrame({'subreddit': subreddits, 'cluster': clustering.labels_})

    cluster_sizes = cluster_data.groupby("cluster").count().reset_index()
    print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster != -1].subreddit.max()} members")
    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
    print(f"{(cluster_sizes.subreddit == 1).sum()} clusters have 1 member")
    print(f"{cluster_sizes.loc[cluster_sizes.cluster == -1, 'subreddit'].sum()} subreddits are in cluster -1", flush=True)

    return cluster_data

@dataclass
class clustering_result:
    outpath: Path
    max_iter: int
    silhouette_score: float
    alt_silhouette_score: float
    name: str
    n_clusters: int

def read_similarity_mat(similarities, use_threads=True):
    df = pd.read_feather(similarities, use_threads=use_threads)
    mat = np.array(df.drop('_subreddit', axis=1))
    n = mat.shape[0]
    mat[range(n), range(n)] = 1
    return (df._subreddit, mat)
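A small usage sketch for these helpers; the mock object stands in for any fitted clusterer exposing labels_, with -1 marking HDBSCAN-style isolates:

# Minimal sketch exercising sim_to_dist and process_clustering_result.
import numpy as np
import pandas as pd
from types import SimpleNamespace
from clustering_base import sim_to_dist, process_clustering_result

sim = np.array([[1.0, 0.8, 0.0],
                [0.8, 1.0, 0.1],
                [0.0, 0.1, 1.0]])
dist = sim_to_dist(sim)  # 1 - sim, negatives clipped, zero diagonal

mock = SimpleNamespace(labels_=np.array([0, 0, -1]))  # hypothetical fitted clusterer
cluster_data = process_clustering_result(mock, pd.Series(['a', 'b', 'c']))
print(cluster_data)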

View File

@@ -0,0 +1,172 @@
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
from itertools import product, starmap
import pandas as pd
from sklearn.metrics import silhouette_score, silhouette_samples
from pathlib import Path
from multiprocessing import Pool, cpu_count
import fire
from pyarrow.feather import write_feather

def test_select_hdbscan_clustering():
    select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
                              "test_hdbscan_author30k",
                              min_cluster_sizes=[2],
                              min_samples=[1,2],
                              cluster_selection_epsilons=[0,0.05,0.1,0.15],
                              cluster_selection_methods=['eom','leaf'],
                              lsi_dimensions='all')

# scratch defaults for interactive testing
inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI"
outpath = "test_hdbscan"
min_cluster_sizes = [2,3,4]
min_samples = [1,2,3]
cluster_selection_epsilons = [0,0.1,0.3,0.5]
cluster_selection_methods = ['eom']
lsi_dimensions = 'all'

@dataclass
class hdbscan_clustering_result(clustering_result):
    min_cluster_size: int
    min_samples: int
    cluster_selection_epsilon: float
    cluster_selection_method: str
    lsi_dimensions: int
    n_isolates: int
    silhouette_samples: str

def select_hdbscan_clustering(inpath,
                              outpath,
                              outfile=None,
                              min_cluster_sizes=[2],
                              min_samples=[1],
                              cluster_selection_epsilons=[0],
                              cluster_selection_methods=['eom'],
                              lsi_dimensions='all'
                              ):

    inpath = Path(inpath)
    outpath = Path(outpath)
    outpath.mkdir(exist_ok=True, parents=True)

    if lsi_dimensions == 'all':
        lsi_paths = list(inpath.glob("*"))
    else:
        lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]

    lsi_nums = [p.stem for p in lsi_paths]
    grid = list(product(lsi_nums,
                        min_cluster_sizes,
                        min_samples,
                        cluster_selection_epsilons,
                        cluster_selection_methods))

    # fix the output file names
    names = list(map(lambda t: '_'.join(map(str, t)), grid))
    grid = [(inpath/(str(t[0])+'.feather'), outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]

    with Pool(int(cpu_count()/4)) as pool:
        mods = pool.starmap(hdbscan_clustering, grid)

    res = pd.DataFrame(mods)
    if outfile is None:
        outfile = outpath / "selection_data.csv"

    res.to_csv(outfile)

def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    clustering = _hdbscan_clustering(mat,
                                     min_cluster_size=min_cluster_size,
                                     min_samples=min_samples,
                                     cluster_selection_epsilon=cluster_selection_epsilon,
                                     cluster_selection_method=cluster_selection_method,
                                     metric='precomputed',
                                     core_dist_n_jobs=cpu_count()
                                     )

    cluster_data = process_clustering_result(clustering, subreddits)
    isolates = clustering.labels_ == -1
    scoremat = mat[~isolates][:, ~isolates]
    score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
    cluster_data.to_feather(output)

    silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
    silhouette_samp = pd.DataFrame({'subreddit': subreddits, 'score': silhouette_samp})
    silsampout = output.parent / ("silhouette_samples" + output.name)
    silhouette_samp.to_feather(silsampout)

    result = hdbscan_clustering_result(outpath=output,
                                       max_iter=None,
                                       silhouette_samples=silsampout,
                                       silhouette_score=score,
                                       alt_silhouette_score=score,
                                       name=name,
                                       min_cluster_size=min_cluster_size,
                                       min_samples=min_samples,
                                       cluster_selection_epsilon=cluster_selection_epsilon,
                                       cluster_selection_method=cluster_selection_method,
                                       lsi_dimensions=lsi_dim,
                                       n_isolates=isolates.sum(),
                                       n_clusters=len(set(clustering.labels_))
                                       )

    return(result)

# for all runs we should try cluster_selection_epsilon = None
# for terms we should try cluster_selection_epsilon around 0.56-0.66
# for authors we should try cluster_selection_epsilon around 0.98-0.99

def _hdbscan_clustering(mat, *args, **kwargs):
    print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
    print(mat)
    clusterer = hdbscan.HDBSCAN(*args,
                                **kwargs,
                                )
    clustering = clusterer.fit(mat.astype('double'))
    return(clustering)

def KNN_distances_plot(mat, outname, k=2):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:, -1]
    df = pd.DataFrame({'dist': d2})
    df = df.sort_values("dist", ascending=False)
    df['idx'] = np.arange(0, d2.shape[0]) + 1
    p = pn.qplot(x='idx', y='dist', data=df, geom='line') + pn.scales.scale_y_continuous(minor_breaks=np.arange(0, 50)/50,
                                                                                         breaks=np.arange(0, 10)/10)
    p.save(outname, width=16, height=10)

def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='terms_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors-tf_knn_dist2.png')

if __name__ == "__main__":
    df = pd.read_csv("test_hdbscan/selection_data.csv")
    test_select_hdbscan_clustering()
    check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
    silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
    c = check_clusters.merge(silscores, on='subreddit')
#    fire.Fire(select_hdbscan_clustering)
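The core of the file above, in miniature: HDBSCAN over a precomputed distance matrix, then a silhouette score restricted to non-noise points (label != -1). A toy sketch, assuming two well-separated point groups:

# Toy sketch of the hdbscan path: precomputed distances in, labels out,
# silhouette computed only over the non-isolate rows/columns.
import hdbscan
import numpy as np
from sklearn.metrics import silhouette_score

pts = np.vstack([np.random.default_rng(0).normal(0, 0.05, (5, 2)),
                 np.random.default_rng(1).normal(3, 0.05, (5, 2))])
dist = np.sqrt(((pts[:, None, :] - pts[None, :, :]) ** 2).sum(-1))

clustering = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1,
                             metric='precomputed').fit(dist.astype('double'))
isolates = clustering.labels_ == -1
if (~isolates).sum() > 2 and len(set(clustering.labels_[~isolates])) > 1:
    print(silhouette_score(dist[~isolates][:, ~isolates],
                           clustering.labels_[~isolates], metric='precomputed'))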

View File

@@ -1,8 +1,8 @@
from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation
from functools import partial
from clustering import _affinity_clustering, read_similarity_mat
from dataclasses import dataclass
from clustering import _affinity_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result
from multiprocessing import Pool, cpu_count, Array, Process
from pathlib import Path
from itertools import product, starmap
@@ -12,40 +12,69 @@ import fire
import sys

# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.

@dataclass
class clustering_result:
    outpath: Path
class affinity_clustering_result(clustering_result):
    damping: float
    max_iter: int
    convergence_iter: int
    preference_quantile: float
    silhouette_score: float
    alt_silhouette_score: float
    name: str

def sim_to_dist(mat):
    dist = 1-mat
    dist[dist < 0] = 0
    np.fill_diagonal(dist, 0)
    return dist

def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    if name is None:
        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    outpath.parent.mkdir(parents=True, exist_ok=True)
    print(outpath)
    clustering = _affinity_clustering(mat, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
    cluster_data = process_clustering_result(clustering, subreddits)
    mat = sim_to_dist(clustering.affinity_matrix_)

    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        try:
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    res = affinity_clustering_result(outpath=outpath,
                                     damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     preference_quantile=preference_quantile,
                                     silhouette_score=score,
                                     alt_silhouette_score=alt_score,
                                     name=str(name),
                                     n_clusters=len(set(clustering.labels_)))

    return res

def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    if name is None:
        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    outpath.parent.mkdir(parents=True, exist_ok=True)
    print(outpath)
    clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
    mat = sim_to_dist(clustering.affinity_matrix_)
    score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed')
        try:
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    res = clustering_result(outpath=outpath,
                            damping=damping,
@@ -58,6 +87,7 @@ def do_clustering(damping, convergence_iter, preference_quantile, name, mat, sub
    return res

# alt similarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).

def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None):
@@ -86,7 +116,7 @@ def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max
    hyper_grid = product(damping, convergence_iter, preference_quantile)
    hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))

    _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)
    _do_clustering = partial(do_affinity_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)

    # similarities = Array('d', mat)
    # call pool.starmap
@@ -94,6 +124,7 @@ def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max
        clustering_data = pool.starmap(_do_clustering, hyper_grid)

    clustering_data = pd.DataFrame(list(clustering_data))
    clustering_data.to_csv(outinfo)
    return clustering_data
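The grid construction above is worth unpacking: product() enumerates the hyperparameter combinations, enumerate() appends a stringified index as the run name, and starmap unpacks each tuple positionally into the partial-bound worker. A toy version with a hypothetical worker:

# Sketch of the hyperparameter enumeration pattern used above.
from itertools import product, starmap
from functools import partial

def run(damping, convergence_iter, preference_quantile, name, outdir="demo"):  # hypothetical worker
    return f"{outdir}/{name}: damping={damping}, ci={convergence_iter}, pq={preference_quantile}"

hyper_grid = product([0.5, 0.9], [15, 30], [0.5])
hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))
print(list(starmap(partial(run, outdir="affinity"), hyper_grid)))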

View File

@@ -0,0 +1,92 @@
from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation
from functools import partial
from clustering import _kmeans_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result
from dataclasses import dataclass
from multiprocessing import Pool, cpu_count, Array, Process
from pathlib import Path
from itertools import product, starmap
import numpy as np
import pandas as pd
import fire
import sys
@dataclass
class kmeans_clustering_result(clustering_result):
    n_clusters: int
    n_init: int

# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.

def do_clustering(n_clusters, n_init, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    if name is None:
        name = f"nclusters-{n_clusters}_ninit-{n_init}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    print(outpath)
    mat = sim_to_dist(mat)
    clustering = _kmeans_clustering(mat, outpath, n_clusters, n_init, max_iter, random_state, verbose)

    outpath.parent.mkdir(parents=True, exist_ok=True)
    cluster_data = process_clustering_result(clustering, subreddits)
    cluster_data.to_feather(outpath)

    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        try:
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    res = kmeans_clustering_result(outpath=outpath,
                                   max_iter=max_iter,
                                   n_clusters=n_clusters,
                                   n_init=n_init,
                                   silhouette_score=score,
                                   alt_silhouette_score=alt_score,
                                   name=str(name))

    return res

# alt similarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).

def select_kmeans_clustering(similarities, outdir, outinfo, n_clusters=[1000], max_iter=100000, n_init=[10], random_state=1968, verbose=True, alt_similarities=None):

    n_clusters = list(map(int, n_clusters))
    n_init = list(map(int, n_init))

    if type(outdir) is str:
        outdir = Path(outdir)

    outdir.mkdir(parents=True, exist_ok=True)

    subreddits, mat = read_similarity_mat(similarities, use_threads=True)

    if alt_similarities is not None:
        _, alt_mat = read_similarity_mat(alt_similarities, use_threads=True)
    else:
        alt_mat = None

    # get list of tuples: the combinations of hyperparameters
    hyper_grid = product(n_clusters, n_init)
    hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))

    _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)

    # call starmap
    print("running clustering selection")
    clustering_data = starmap(_do_clustering, hyper_grid)
    clustering_data = pd.DataFrame(list(clustering_data))
    clustering_data.to_csv(outinfo)
    return clustering_data

if __name__ == "__main__":
    x = fire.Fire(select_kmeans_clustering)
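Calling the selector programmatically, bypassing fire, might look like this; the paths are placeholders, not real outputs:

# Hypothetical direct invocation of select_kmeans_clustering.
select_kmeans_clustering(
    similarities="subreddit_comment_authors_10k.feather",  # placeholder path
    outdir="kmeans_out",
    outinfo="kmeans_out/selection_data.csv",
    n_clusters=[100, 500],
    n_init=[10],
)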

clustering/selection.py Normal file
View File

@@ -0,0 +1,7 @@
import fire
from select_affinity import select_affinity_clustering
from select_kmeans import select_kmeans_clustering

if __name__ == "__main__":
    fire.Fire({"kmeans": select_kmeans_clustering,
               "affinity": select_affinity_clustering})