
Refactor to make a decent api.

Nate E TeBlunthuis 2021-05-10 13:46:49 -07:00
parent f05cb962e0
commit 4cb7eeec80
10 changed files with 591 additions and 408 deletions

clustering/Makefile

@@ -2,41 +2,160 @@
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
similarity_data=/gscratch/comdata/output/reddit_similarity
clustering_data=/gscratch/comdata/output/reddit_clustering
kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
affinity_selection_grid="--dampings=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[30]"

authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
authors_10k_output=$(clustering_data)/subreddit_comment_authors_10k
authors_10k_output_lsi=$(clustering_data)/subreddit_comment_authors_10k_LSI

authors_tf_10k_input=$(similarity_data)/subreddit_comment_authors-tf_10k.feather
authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI
authors_tf_10k_output=$(clustering_data)/subreddit_comment_authors-tf_10k
authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI
terms_10k_input=$(similarity_data)/subreddit_comment_terms_10k.feather
terms_10k_input_lsi=$(similarity_data)/subreddit_comment_terms_10k_LSI
terms_10k_output=$(clustering_data)/subreddit_comment_terms_10k
terms_10k_output_lsi=$(clustering_data)/subreddit_comment_terms_10k_LSI
all:terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi
terms_10k:${terms_10k_output}/kmeans/selection_data.csv ${terms_10k_output}/affinity/selection_data.csv ${terms_10k_output}/hdbscan/selection_data.csv
authors_10k:${authors_10k_output}/kmeans/selection_data.csv ${authors_10k_output}/hdbscan/selection_data.csv ${authors_10k_output}/affinity/selection_data.csv
authors_tf_10k:${authors_tf_10k_output}/kmeans/selection_data.csv ${authors_tf_10k_output}/hdbscan/selection_data.csv ${authors_tf_10k_output}/affinity/selection_data.csv
terms_10k_lsi:${terms_10k_output_lsi}/kmeans/selection_data.csv ${terms_10k_output_lsi}/affinity/selection_data.csv ${terms_10k_output_lsi}/hdbscan/selection_data.csv
authors_10k_lsi:${authors_10k_output_lsi}/kmeans/selection_data.csv ${authors_10k_output_lsi}/hdbscan/selection_data.csv ${authors_10k_output_lsi}/affinity/selection_data.csv
authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
${authors_10k_output}/kmeans/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/kmeans --savefile=${authors_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
${terms_10k_output}/kmeans/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/kmeans --savefile=${terms_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
${authors_tf_10k_output}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/kmeans --savefile=${authors_tf_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
${authors_10k_output}/affinity/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/affinity --savefile=${authors_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
${terms_10k_output}/affinity/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/affinity --savefile=${terms_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
${authors_tf_10k_output}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/affinity --savefile=${authors_tf_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
${authors_10k_output}/hdbscan/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/hdbscan --savefile=${authors_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
${terms_10k_output}/hdbscan/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/hdbscan --savefile=${terms_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
${authors_tf_10k_output}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/hdbscan --savefile=${authors_tf_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
## LSI Models

${authors_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/kmeans --savefile=${authors_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)

${terms_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/kmeans --savefile=${terms_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)

${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)

${authors_10k_output_lsi}/affinity/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/affinity --savefile=${authors_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)

${terms_10k_output_lsi}/affinity/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/affinity --savefile=${terms_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
${authors_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/hdbscan --savefile=${authors_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/hdbscan --savefile=${terms_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
clean_affinity:
rm -f ${authors_10k_output}/affinity/selection_data.csv
rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
rm -f ${terms_10k_output}/affinity/selection_data.csv
clean_kmeans:
rm -f ${authors_10k_output}/kmeans/selection_data.csv
rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
rm -f ${terms_10k_output}/kmeans/selection_data.csv
clean_hdbscan:
rm -f ${authors_10k_output}/hdbscan/selection_data.csv
rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
rm -f ${terms_10k_output}/hdbscan/selection_data.csv
clean_authors:
rm -f ${authors_10k_output}/affinity/selection_data.csv
rm -f ${authors_10k_output}/kmeans/selection_data.csv
rm -f ${authors_10k_output}/hdbscan/selection_data.csv
clean_authors_tf:
rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
clean_terms:
rm -f ${terms_10k_output}/affinity/selection_data.csv
rm -f ${terms_10k_output}/kmeans/selection_data.csv
rm -f ${terms_10k_output}/hdbscan/selection_data.csv
clean_lsi_affinity:
rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
clean_lsi_kmeans:
rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
clean_lsi_hdbscan:
rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
clean_lsi_authors:
rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
clean_lsi_authors_tf:
rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
clean_lsi_terms:
rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
clean: clean_affinity clean_kmeans clean_hdbscan
PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
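
The selection grids above are just command-line flags for the new per-algorithm scripts. As a rough sketch (not part of the commit), the ${authors_10k_output}/kmeans rule amounts to the following direct call into the new API, with the grid values taken from kmeans_selection_grid:

from kmeans_clustering import run_kmeans_grid_sweep

# equivalent of: make ${authors_10k_output}/kmeans/selection_data.csv (sketch only)
run_kmeans_grid_sweep(
    savefile="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k/kmeans/selection_data.csv",
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather",
    outpath="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k/kmeans",
    n_clusters=[100, 500, 1000, 1250, 1500, 1750, 2000],  # kmeans_selection_grid values
    n_inits=[10],
    max_iters=[3000],
)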

clustering/affinity_clustering.py

@@ -1,16 +1,12 @@
from sklearn.cluster import AffinityPropagation
from dataclasses import dataclass
from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep
from pathlib import Path
from itertools import product, starmap
import fire
import sys
import numpy as np

# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.

@dataclass
@@ -21,10 +17,6 @@ class affinity_clustering_result(clustering_result):
preference:float
max_iter:int

(removed: affinity_clustering_result_lsi; moved to affinity_clustering_lsi.py)

class affinity_job(clustering_job):
def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
super().__init__(infile,
@@ -67,21 +59,6 @@ class affinity_job(clustering_job):
return self.result

(removed: affinity_lsi_job; moved to affinity_clustering_lsi.py)

class affinity_grid_sweep(grid_sweep):
def __init__(self,
inpath,
@@ -104,49 +81,29 @@ class affinity_grid_sweep(grid_sweep):
return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5]):
"""Run affinity clustering once or more with different parameters.

Usage:
affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv>

Keyword arguments:
savefile: path to save the metadata and diagnostics
inpath: path to feather data containing a labeled matrix of subreddit similarities.
outpath: path to output fit affinity clusterings.
dampings: one or more numbers in [0.5, 1). damping parameter in affinity propagation clustering.
preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
convergence_iters: one or more integers: number of iterations without improvement before stopping.
max_iters: one or more numbers of different maximum iterations.
"""
obj = affinity_grid_sweep(inpath,
outpath,
map(float,dampings),
map(int,max_iters),
map(int,convergence_iters),
map(float,preference_quantiles))
obj.run(1)
obj.save(savefile)

(removed: affinity_lsi_grid_sweep and _affinity_lsi_grid_sweep; moved to affinity_clustering_lsi.py)

def test_select_affinity_clustering():
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
@@ -169,7 +126,4 @@ def test_select_affinity_clustering():
if __name__ == "__main__":
fire.Fire(run_affinity_grid_sweep)
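
The grid sweep is the intended entry point, but the job classes can also be driven one at a time. A minimal sketch, assuming (as grid_sweep.run does) that get_info() fits the clustering and returns the result dataclass; the name string and parameter values here are only illustrative:

from affinity_clustering import affinity_job

# a single affinity propagation run (parameter values and the name string are illustrative)
job = affinity_job(infile="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather",
                   outpath="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k/affinity",
                   name="damp-0.85_maxit-3000_convit-30_prefq-0.5",
                   damping=0.85,
                   max_iter=3000,
                   convergence_iter=30,
                   preference_quantile=0.5)
result = job.get_info()  # fits the clustering and returns an affinity_clustering_result
print(result.n_clusters, result.n_isolates, result.silhouette_samples)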

clustering/affinity_clustering_lsi.py Normal file

@@ -0,0 +1,99 @@
import fire
from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep
from grid_sweep import grid_sweep
from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin
from dataclasses import dataclass
@dataclass
class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
pass
class affinity_lsi_job(affinity_job, lsi_mixin):
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
super().__init__(infile,
outpath,
name,
*args,
**kwargs)
super().set_lsi_dims(lsi_dims)
def get_info(self):
result = super().get_info()
self.result = affinity_clustering_result_lsi(**result.__dict__,
lsi_dimensions=self.lsi_dims)
return self.result
class affinity_lsi_grid_sweep(lsi_grid_sweep):
def __init__(self,
inpath,
lsi_dims,
outpath,
dampings=[0.9],
max_iters=[10000],
convergence_iters=[30],
preference_quantiles=[0.5]):
super().__init__(affinity_lsi_job,
_affinity_lsi_grid_sweep,
inpath,
lsi_dims,
outpath,
dampings,
max_iters,
convergence_iters,
preference_quantiles)
class _affinity_lsi_grid_sweep(grid_sweep):
def __init__(self,
inpath,
outpath,
lsi_dim,
*args,
**kwargs):
self.lsi_dim = lsi_dim
self.jobtype = affinity_lsi_job
super().__init__(self.jobtype,
inpath,
outpath,
self.namer,
self.lsi_dim,
*args,
**kwargs)
def namer(self, *args, **kwargs):
s = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
s += f"_lsi-{self.lsi_dim}"
return s
def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all'):
"""Run affinity clustering once or more with different parameters.
Usage:
affinity_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> --lsi_dimensions=<"all"|csv>

Keyword arguments:
savefile: path to save the metadata and diagnostics
inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
outpath: path to output fit affinity clusterings.
dampings: one or more numbers in [0.5, 1). damping parameter in affinity propagation clustering.
preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
convergence_iters: one or more integers: number of iterations without improvement before stopping.
max_iters: one or more numbers of different maximum iterations.
"""
obj = affinity_lsi_grid_sweep(inpath,
lsi_dimensions,
outpath,
map(float,dampings),
map(int,max_iters),
map(int,convergence_iters),
map(float,preference_quantiles))
obj.run(1)
obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_affinity_lsi_grid_sweep)
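
For reference, a hedged sketch of what the Makefile's ${authors_10k_output_lsi}/affinity rule boils down to; the damping and preference values below are illustrative rather than the full affinity_selection_grid:

from affinity_clustering_lsi import run_affinity_lsi_grid_sweep

run_affinity_lsi_grid_sweep(
    savefile="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k_LSI/affinity/selection_data.csv",
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k_LSI",  # folder of <dim>.feather files
    outpath="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k_LSI/affinity",
    dampings=[0.85, 0.95],            # illustrative subset of the Makefile grid
    preference_quantiles=[0.3, 0.5],
    lsi_dimensions="all",
)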

clustering/clustering_base.py

@@ -3,59 +3,6 @@ import numpy as np
import pandas as pd
from dataclasses import dataclass
from sklearn.metrics import silhouette_score, silhouette_samples

(removed in this commit: the sim_to_dist helper below, along with grid_sweep and lsi_grid_sweep, which moved to grid_sweep.py and lsi_base.py)

def sim_to_dist(mat):
dist = 1-mat
dist[dist < 0] = 0
np.fill_diagonal(dist,0)
return dist

# this is meant to be an interface, not created directly
class clustering_job:
@@ -86,19 +33,24 @@ class clustering_job:
name=self.name,
n_clusters=self.n_clusters,
n_isolates=self.n_isolates,
silhouette_samples = self.silsampout
)
return self.result

def silhouette(self):
isolates = self.clustering.labels_ == -1
scoremat = self.mat[~isolates][:,~isolates]
if scoremat.shape[0] > 0:
score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed')
silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed')
silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp})
self.outpath.mkdir(parents=True, exist_ok=True)
silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather")
self.silsampout = silsampout.resolve()
silhouette_samp.to_feather(self.silsampout)
else:
score = None
self.silsampout = None
return score

def read_distance_mat(self, similarities, use_threads=True):
@@ -139,11 +91,6 @@ class clustering_job:
return cluster_data

(removed: lsi_mixin; moved to lsi_base.py)

@dataclass
class clustering_result:
outpath:Path
@@ -152,7 +99,3 @@ class clustering_result:
n_clusters:int
n_isolates:int
silhouette_samples:str

(removed: lsi_result_mixin; moved to lsi_base.py)
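
Each selection_data.csv written by grid_sweep.save() is a flat table with one clustering_result row per parameter combination. A sketch of how it might be consumed, assuming the frame also carries name and silhouette_score columns alongside the fields shown above (silhouette_samples holds the path to the per-subreddit score feather):

import pandas as pd

selection = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k/hdbscan/selection_data.csv")

# pick the parameter combination with the best silhouette score (column names assumed as above)
best = selection.sort_values("silhouette_score", ascending=False).iloc[0]
print(best["name"], best["n_clusters"], best["n_isolates"])

# per-subreddit silhouette scores for that run were written to a feather file
samples = pd.read_feather(best["silhouette_samples"])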

clustering/grid_sweep.py Normal file

@@ -0,0 +1,32 @@
from pathlib import Path
from multiprocessing import Pool, cpu_count
from itertools import product, chain
import pandas as pd
class grid_sweep:
def __init__(self, jobtype, inpath, outpath, namer, *args):
self.jobtype = jobtype
self.namer = namer
grid = list(product(*args))
inpath = Path(inpath)
outpath = Path(outpath)
self.hasrun = False
self.grid = [(inpath,outpath,namer(*g)) + g for g in grid]
self.jobs = [jobtype(*g) for g in self.grid]
def run(self, cores=20):
if cores is not None and cores > 1:
with Pool(cores) as pool:
infos = pool.map(self.jobtype.get_info, self.jobs)
else:
infos = map(self.jobtype.get_info, self.jobs)
self.infos = pd.DataFrame(infos)
self.hasrun = True
def save(self, outcsv):
if not self.hasrun:
self.run()
outcsv = Path(outcsv)
outcsv.parent.mkdir(parents=True, exist_ok=True)
self.infos.to_csv(outcsv)
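
grid_sweep only assumes that a job class can be constructed as jobtype(inpath, outpath, name, *params) and exposes a get_info() method, while the namer turns a parameter tuple into the run name. A self-contained toy sketch (toy_job, toy_result, toy_namer, and the paths are hypothetical, purely for illustration):

from dataclasses import dataclass
from grid_sweep import grid_sweep

@dataclass
class toy_result:          # hypothetical result record; pandas turns its fields into columns
    name: str
    threshold: float

class toy_job:             # hypothetical job: grid_sweep builds one per parameter tuple
    def __init__(self, inpath, outpath, name, threshold):
        self.name = name
        self.threshold = threshold
    def get_info(self):    # called (possibly in a Pool) by grid_sweep.run
        return toy_result(name=self.name, threshold=self.threshold)

def toy_namer(threshold):  # turns a parameter tuple into the run name
    return f"thresh-{threshold}"

sweep = grid_sweep(toy_job, "similarities.feather", "toy_output", toy_namer, [0.1, 0.5, 0.9])
sweep.run(cores=1)
sweep.save("toy_output/selection_data.csv")  # one row per parameter combination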

clustering/hdbscan_clustering.py

@@ -1,5 +1,5 @@
from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
@@ -7,11 +7,8 @@ import plotnine as pn
import numpy as np
from itertools import product, starmap, chain
import pandas as pd
from multiprocessing import cpu_count
import fire

def test_select_hdbscan_clustering():
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
@@ -40,28 +37,6 @@ def test_select_hdbscan_clustering():
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)

(removed: hdbscan_lsi_grid_sweep; moved to hdbscan_clustering_lsi.py)

class hdbscan_grid_sweep(grid_sweep):
def __init__(self,
inpath,
@@ -78,25 +53,6 @@ class hdbscan_grid_sweep(grid_sweep):
cluster_selection_method):
return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"

(removed: _hdbscan_lsi_grid_sweep; moved to hdbscan_clustering_lsi.py)

@dataclass
class hdbscan_clustering_result(clustering_result):
min_cluster_size:int
@@ -104,10 +60,6 @@ class hdbscan_clustering_result(clustering_result):
cluster_selection_epsilon:float
cluster_selection_method:str

(removed: hdbscan_clustering_result_lsi; moved to hdbscan_clustering_lsi.py)

class hdbscan_job(clustering_job):
def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
super().__init__(infile,
@@ -148,121 +100,29 @@ class hdbscan_job(clustering_job):
cluster_selection_method=self.cluster_selection_method)
return self.result
def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
"""Run hdbscan clustering once or more with different parameters.

Usage:
hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">

Keyword arguments:
savefile: path to save the metadata and diagnostics
inpath: path to feather data containing a labeled matrix of subreddit similarities.
outpath: path to output fit hdbscan clusterings.
min_cluster_sizes: one or more integers indicating the minimum cluster size
min_samples: one or more integers indicating the minimum number of samples used in the algorithm
cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
"""
obj = hdbscan_grid_sweep(inpath,
outpath,
map(int,min_cluster_sizes),
map(int,min_samples),
map(float,cluster_selection_epsilons),
cluster_selection_methods)
obj.run()
obj.save(savefile)

(removed: hdbscan_lsi_job, moved to hdbscan_clustering_lsi.py, together with a long commented-out draft of select_hdbscan_clustering / hdbscan_clustering; the only notes worth keeping from that block: try cluster_selection_epsilon = None for all runs, around 0.56-0.66 for terms, and around 0.98-0.99 for authors)
def KNN_distances_plot(mat,outname,k=2):
nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
@@ -293,10 +153,7 @@ def make_KNN_plots():
KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')

if __name__ == "__main__":
fire.Fire(run_hdbscan_grid_sweep)
# test_select_hdbscan_clustering()
#fire.Fire(select_hdbscan_clustering)
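
Following the parameter notes preserved above, a hedged example of sweeping the suggested epsilon range for the authors similarities through the new entry point (values are illustrative, not the Makefile grid):

from hdbscan_clustering import run_hdbscan_grid_sweep

run_hdbscan_grid_sweep(
    savefile="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k/hdbscan/selection_data.csv",
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather",
    outpath="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k/hdbscan",
    min_cluster_sizes=[2],
    min_samples=[1, 2],
    cluster_selection_epsilons=[0, 0.98, 0.99],  # range suggested for authors in the old notes
    cluster_selection_methods=["eom", "leaf"],
)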

clustering/hdbscan_clustering_lsi.py Normal file

@@ -0,0 +1,101 @@
from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result
from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin
from grid_sweep import grid_sweep
import fire
from dataclasses import dataclass
@dataclass
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
pass
class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
super().__init__(
infile,
outpath,
name,
*args,
**kwargs)
super().set_lsi_dims(lsi_dims)
def get_info(self):
partial_result = super().get_info()
self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
lsi_dimensions=self.lsi_dims)
return self.result
class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
def __init__(self,
inpath,
lsi_dims,
outpath,
min_cluster_sizes,
min_samples,
cluster_selection_epsilons,
cluster_selection_methods
):
super().__init__(hdbscan_lsi_job,
_hdbscan_lsi_grid_sweep,
inpath,
lsi_dims,
outpath,
min_cluster_sizes,
min_samples,
cluster_selection_epsilons,
cluster_selection_methods)
class _hdbscan_lsi_grid_sweep(grid_sweep):
def __init__(self,
inpath,
outpath,
lsi_dim,
*args,
**kwargs):
print(args)
print(kwargs)
self.lsi_dim = lsi_dim
self.jobtype = hdbscan_lsi_job
super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
def namer(self, *args, **kwargs):
s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
s += f"_lsi-{self.lsi_dim}"
return s
def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'],lsi_dimensions='all'):
"""Run hdbscan clustering once or more with different parameters.
Usage:
hdbscan_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf"> --lsi_dimensions=<"all"|csv>

Keyword arguments:
savefile: path to save the metadata and diagnostics
inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
outpath: path to output fit clusterings.
min_cluster_sizes: one or more integers indicating the minimum cluster size
min_samples: one or more integers indicating the minimum number of samples used in the algorithm
cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
"""
obj = hdbscan_lsi_grid_sweep(inpath,
lsi_dimensions,
outpath,
map(int,min_cluster_sizes),
map(int,min_samples),
map(float,cluster_selection_epsilons),
cluster_selection_methods
)
obj.run(10)
obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_hdbscan_lsi_grid_sweep)
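
lsi_dimensions selects which <dim>.feather files under INPATH are swept (see lsi_grid_sweep in lsi_base.py): "all" globs the folder, while a list of dimension names restricts the sweep. A sketch with hypothetical dimension values:

from hdbscan_clustering_lsi import run_hdbscan_lsi_grid_sweep

run_hdbscan_lsi_grid_sweep(
    savefile="/gscratch/comdata/output/reddit_clustering/subreddit_comment_terms_10k_LSI/hdbscan/selection_data.csv",
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k_LSI",
    outpath="/gscratch/comdata/output/reddit_clustering/subreddit_comment_terms_10k_LSI/hdbscan",
    min_cluster_sizes=[2, 3],
    cluster_selection_methods=["eom"],
    lsi_dimensions=["100", "300"],  # hypothetical; requires 100.feather and 300.feather under inpath
)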

clustering/kmeans_clustering.py

@@ -1,11 +1,9 @@
from sklearn.cluster import KMeans
import fire
from pathlib import Path
from dataclasses import dataclass
from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep
@dataclass
class kmeans_clustering_result(clustering_result):
@@ -13,10 +11,6 @@ class kmeans_clustering_result(clustering_result):
n_init:int
max_iter:int

(removed: kmeans_clustering_result_lsi; moved to kmeans_clustering_lsi.py)

class kmeans_job(clustering_job):
def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
super().__init__(infile,
@@ -45,28 +39,13 @@ class kmeans_job(clustering_job):
def get_info(self):
result = super().get_info()
self.result = kmeans_clustering_result(**result.__dict__,
n_init=self.n_init,
max_iter=self.max_iter)
return self.result

(removed: kmeans_lsi_job; moved to kmeans_clustering_lsi.py)

class kmeans_grid_sweep(grid_sweep):
def __init__(self,
inpath,
outpath,
@@ -80,49 +59,7 @@ class kmeans_grid_sweep(grid_sweep):
max_iter):
return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
(removed: _kmeans_lsi_grid_sweep and kmeans_lsi_grid_sweep, moved to kmeans_clustering_lsi.py, along with some stale commented-out lines)

def test_select_kmeans_clustering():
inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
outpath = "test_kmeans";
n_clusters=[200,300,400];
@@ -139,10 +76,30 @@ def test_select_kmeans_clustering():
gs.run(20)
gs.save("test_hdbscan/lsi_sweep.csv")
def run_kmeans_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000]):
"""Run kmeans clustering once or more with different parameters.
Usage:
kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>
Keyword arguments:
savefile: path to save the metadata and diagnostics
inpath: path to feather data containing a labeled matrix of subreddit similarities.
outpath: path to output fit kmeans clusterings.
n_clusters: one or more numbers of kmeans clusters to select.
n_inits: one or more numbers of different initializations to use for each clustering.
max_iters: one or more numbers of different maximum iterations.
"""
obj = kmeans_grid_sweep(inpath,
outpath,
map(int,n_clusters),
map(int,n_inits),
map(int,max_iters))
obj.run(1)
obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_kmeans_grid_sweep)

clustering/kmeans_clustering_lsi.py Normal file

@@ -0,0 +1,93 @@
import fire
from dataclasses import dataclass
from kmeans_clustering import kmeans_job, kmeans_clustering_result, kmeans_grid_sweep
from lsi_base import lsi_mixin, lsi_result_mixin, lsi_grid_sweep
from grid_sweep import grid_sweep
@dataclass
class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
pass
class kmeans_lsi_job(kmeans_job, lsi_mixin):
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
super().__init__(infile,
outpath,
name,
*args,
**kwargs)
super().set_lsi_dims(lsi_dims)
def get_info(self):
result = super().get_info()
self.result = kmeans_clustering_result_lsi(**result.__dict__,
lsi_dimensions=self.lsi_dims)
return self.result
class _kmeans_lsi_grid_sweep(grid_sweep):
def __init__(self,
inpath,
outpath,
lsi_dim,
*args,
**kwargs):
print(args)
print(kwargs)
self.lsi_dim = lsi_dim
self.jobtype = kmeans_lsi_job
super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
def namer(self, *args, **kwargs):
s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
s += f"_lsi-{self.lsi_dim}"
return s
class kmeans_lsi_grid_sweep(lsi_grid_sweep):
def __init__(self,
inpath,
lsi_dims,
outpath,
n_clusters,
n_inits,
max_iters
):
super().__init__(kmeans_lsi_job,
_kmeans_lsi_grid_sweep,
inpath,
lsi_dims,
outpath,
n_clusters,
n_inits,
max_iters)
def run_kmeans_lsi_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000], lsi_dimensions="all"):
"""Run kmeans clustering once or more with different parameters.
Usage:
kmeans_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --lsi_dimensions=<"all"|csv number of LSI dimensions to use> --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>

Keyword arguments:
savefile: path to save the metadata and diagnostics
inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
outpath: path to output fit kmeans clusterings.
lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
n_clusters: one or more numbers of kmeans clusters to select.
n_inits: one or more numbers of different initializations to use for each clustering.
max_iters: one or more numbers of different maximum iterations.
"""
obj = kmeans_lsi_grid_sweep(inpath,
lsi_dimensions,
outpath,
list(map(int,n_clusters)),
list(map(int,n_inits)),
list(map(int,max_iters))
)
obj.run(1)
obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_kmeans_lsi_grid_sweep)

clustering/lsi_base.py Normal file

@@ -0,0 +1,28 @@
from clustering_base import clustering_job, clustering_result
from grid_sweep import grid_sweep
from dataclasses import dataclass
from itertools import chain
from pathlib import Path
class lsi_mixin():
def set_lsi_dims(self, lsi_dims):
self.lsi_dims = lsi_dims
@dataclass
class lsi_result_mixin:
lsi_dimensions:int
class lsi_grid_sweep(grid_sweep):
def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs):
self.jobtype = jobtype
self.subsweep = subsweep
inpath = Path(inpath)
if lsi_dimensions == 'all':
lsi_paths = list(inpath.glob("*"))
else:
lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
lsi_nums = [p.stem for p in lsi_paths]
self.hasrun = False
self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
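
Putting the pieces together: an LSI sweep is a grid_sweep whose jobs are chained across one sub-sweep per LSI dimension. A sketch using the kmeans variant, assuming the constructor ordering shown in kmeans_clustering_lsi.py; paths and parameter values are illustrative:

from kmeans_clustering_lsi import kmeans_lsi_grid_sweep

sweep = kmeans_lsi_grid_sweep("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k_LSI",
                              "all",   # one sub-sweep per <dim>.feather found under inpath
                              "/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k_LSI/kmeans",
                              n_clusters=[500, 1000],
                              n_inits=[10],
                              max_iters=[3000])
print(len(sweep.jobs))  # (number of LSI dimensions) x 2 parameter combinations
sweep.run(cores=20)
sweep.save("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors_10k_LSI/kmeans/selection_data.csv")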