update clustering scripts

2021-08-03 14:55:02 -07:00
parent 87ffaa6858
commit cf86c7492c
11 changed files with 73 additions and 27 deletions
--- a/clustering/Makefile
+++ b/clustering/Makefile
@@ -2,9 +2,9 @@
 srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
 similarity_data=/gscratch/comdata/output/reddit_similarity
 clustering_data=/gscratch/comdata/output/reddit_clustering
-kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
-hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
-affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]"
+kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
+hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
+affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]

 authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
 authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
@@ -91,7 +91,11 @@ ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_inpu
 ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
 	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)

+${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+	$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2

+${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+	$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2

 clean_affinity:
 	rm -f ${authors_10k_output}/affinity/selection_data.csv
--- a/clustering/grid_sweep.py
+++ b/clustering/grid_sweep.py
@@ -7,6 +7,7 @@ class grid_sweep:
    def __init__(self, jobtype, inpath, outpath, namer, *args):
        self.jobtype = jobtype
        self.namer = namer
+        print(*args)
        grid = list(product(*args))
        inpath = Path(inpath)
        outpath = Path(outpath)
--- a/clustering/hdbscan_clustering_lsi.py
+++ b/clustering/hdbscan_clustering_lsi.py
@@ -59,7 +59,7 @@ class _hdbscan_lsi_grid_sweep(grid_sweep):

        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)


    def namer(self, *args, **kwargs):
@@ -87,9 +87,9 @@ def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath,  min_cluster_sizes=[2]
    obj = hdbscan_lsi_grid_sweep(inpath,
                                 lsi_dimensions,
                                 outpath,
-                                 map(int,min_cluster_sizes),
-                                 map(int,min_samples),
-                                 map(float,cluster_selection_epsilons),
+                                 list(map(int,min_cluster_sizes)),
+                                 list(map(int,min_samples)),
+                                 list(map(float,cluster_selection_epsilons)),
                                 cluster_selection_methods
                                 )

--- a/clustering/kmeans_clustering_lsi.py
+++ b/clustering/kmeans_clustering_lsi.py
@@ -34,7 +34,7 @@ class _kmeans_lsi_grid_sweep(grid_sweep):
        print(kwargs)
        self.lsi_dim = lsi_dim
        self.jobtype = kmeans_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)

    def namer(self, *args, **kwargs):
        s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
--- a/clustering/pick_best_clustering.py
+++ b/clustering/pick_best_clustering.py
@@ -2,15 +2,15 @@ import fire
 import pandas as pd
 from pathlib import Path
 import shutil
-
-selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/affinity/selection_data.csv"
+selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv"

 outpath = 'test_best.feather'
+min_clusters=50; max_isolates=5000; min_cluster_size=2

 # pick the best clustering according to silhouette score subject to contraints
-def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
+def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
    df = pd.read_csv(selection_data,index_col=0)
-    df = df.sort_values("silhouette_score")
+    df = df.sort_values("silhouette_score",ascending=False)

    # not sure I fixed the bug underlying this fully or not.
    df['n_isolates_str'] = df.n_isolates.str.strip("[]")
@@ -18,11 +18,10 @@ def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
    df.loc[df.n_isolates_0,'n_isolates'] = 0
    df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l))
    
-    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)].iloc[df.shape[1]]
+    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)].iloc[0]  # rows are sorted by silhouette_score descending, so the first qualifying row is the best one

    print(best_cluster.to_dict())
    best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")
-    
    shutil.copy(best_path,output)

 if __name__ == "__main__":
--- a/clustering/selection.py
+++ b/clustering/selection.py
@@ -1,7 +1,47 @@
-import fire
-from select_affinity import select_affinity_clustering
-from select_kmeans import select_kmeans_clustering
+import pandas as pd
+import plotnine as pn
+from pathlib import Path
+from clustering.fit_tsne import fit_tsne
+from visualization.tsne_vis import build_visualization
+
+# Exploratory script: plot the hdbscan grid-sweep results, pick the best 'eom' and
+# 'leaf' clusterings, and pass each of them to build_visualization.
+df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
+
+# sort ascending by silhouette_score, so the best-scoring row of any filtered subset is its last row
+df = df.sort_values("silhouette_score")
+
+# n_isolates is read as a string; NOTE(review): presumably the repr of a pandas Series -- confirm this split still matches the data
+df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
+
+# plot silhouette_score as a function of isolates
+p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
+p.save("isolates_x_score.png")
+
+# plot n_clusters against n_isolates, colored by silhouette_score
+p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
+p.save("clusters_x_isolates.png")
+
+# the best result for each cluster selection method: the highest silhouette score among
+# runs with fewer than 5000 isolates, a score above 0.4 and min_cluster_size == 2.
+# i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from omitted variables
+candidates = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.min_cluster_size==2)]
+# .iloc[-1] is the top score because df is sorted ascending (raises IndexError if nothing qualifies)
+best_eom = candidates[candidates.cluster_selection_method=='eom'].iloc[-1]
+best_leaf = candidates[candidates.cluster_selection_method=='leaf'].iloc[-1]
+
+tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
+
+# only call fit_tsne when the t-SNE file does not exist yet
+if not tsne_data.exists():
+    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
+             tsne_data)
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_eom.outpath)/(str(best_eom['name'])+'.feather'),
+                    "./authors-tf_lsi850_best_eom.html")
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_leaf.outpath)/(str(best_leaf['name'])+'.feather'),
+                    "./authors-tf_lsi850_best_leaf.html")

-if __name__ == "__main__":
-    fire.Fire({"kmeans":select_kmeans_clustering,
-               "affinity":select_affinity_clustering})