Refactor to make a decent api.
parent f05cb962e0
commit 4cb7eeec80
@ -2,41 +2,160 @@
|
|||||||
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
|
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
|
||||||
similarity_data=/gscratch/comdata/output/reddit_similarity
|
similarity_data=/gscratch/comdata/output/reddit_similarity
|
||||||
clustering_data=/gscratch/comdata/output/reddit_clustering
|
clustering_data=/gscratch/comdata/output/reddit_clustering
|
||||||
kmeans_selection_grid="--max_iter=3000 --n_init=[10] --n_clusters=[100,500,1000,1500,2000,2500,3000,2350,3500,3570,4000]"
|
kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
|
||||||
#selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"
|
hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
|
||||||
all:$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv
|
affinity_selection_grid="--dampings=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[30]"
|
||||||
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
|
|
||||||
# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
|
|
||||||
|
|
||||||
$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
|
authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
|
||||||
$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/kmeans $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(kmeans_selection_grid)
|
authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
|
||||||
|
authors_10k_output=$(clustering_data)/subreddit_comment_authors_10k
|
||||||
|
authors_10k_output_lsi=$(clustering_data)/subreddit_comment_authors_10k_LSI
|
||||||
|
|
||||||
$(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
|
authors_tf_10k_input=$(similarity_data)/subreddit_comment_authors-tf_10k.feather
|
||||||
$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/kmeans $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(kmeans_selection_grid)
|
authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI
|
||||||
|
authors_tf_10k_output=$(clustering_data)/subreddit_comment_authors-tf_10k
|
||||||
|
authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI
|
||||||
|
|
||||||
$(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
|
terms_10k_input=$(similarity_data)/subreddit_comment_terms_10k.feather
|
||||||
$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(kmeans_selection_grid)
|
terms_10k_input_lsi=$(similarity_data)/subreddit_comment_terms_10k_LSI
|
||||||
|
terms_10k_output=$(clustering_data)/subreddit_comment_terms_10k
|
||||||
|
terms_10k_output_lsi=$(clustering_data)/subreddit_comment_terms_10k_LSI
|
||||||
|
|
||||||
|
all:terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi
|
||||||
|
|
||||||
|
terms_10k:${terms_10k_output}/kmeans/selection_data.csv ${terms_10k_output}/affinity/selection_data.csv ${terms_10k_output}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
authors_10k:${authors_10k_output}/kmeans/selection_data.csv ${authors_10k_output}/hdbscan/selection_data.csv ${authors_10k_output}/affinity/selection_data.csv
|
||||||
|
|
||||||
|
authors_tf_10k:${authors_tf_10k_output}/kmeans/selection_data.csv ${authors_tf_10k_output}/hdbscan/selection_data.csv ${authors_tf_10k_output}/affinity/selection_data.csv
|
||||||
|
|
||||||
|
terms_10k_lsi:${terms_10k_output_lsi}/kmeans/selection_data.csv ${terms_10k_output_lsi}/affinity/selection_data.csv ${terms_10k_output_lsi}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
authors_10k_lsi:${authors_10k_output_lsi}/kmeans/selection_data.csv ${authors_10k_output_lsi}/hdbscan/selection_data.csv ${authors_10k_output_lsi}/affinity/selection_data.csv
|
||||||
|
|
||||||
|
authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
|
||||||
|
|
||||||
|
${authors_10k_output}/kmeans/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py kmeans_clustering.py
|
||||||
|
$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/kmeans --savefile=${authors_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
|
||||||
|
|
||||||
|
${terms_10k_output}/kmeans/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py kmeans_clustering.py
|
||||||
|
$(srun_singularity) python3 kmeans_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/kmeans --savefile=${terms_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
|
||||||
|
|
||||||
|
${authors_tf_10k_output}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py kmeans_clustering.py
|
||||||
|
$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/kmeans --savefile=${authors_tf_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
|
||||||
|
|
||||||
|
${authors_10k_output}/affinity/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py affinity_clustering.py
|
||||||
|
$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/affinity --savefile=${authors_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
|
||||||
|
|
||||||
|
${terms_10k_output}/affinity/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py affinity_clustering.py
|
||||||
|
$(srun_singularity) python3 affinity_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/affinity --savefile=${terms_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
|
||||||
|
|
||||||
|
${authors_tf_10k_output}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py affinity_clustering.py
|
||||||
|
$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/affinity --savefile=${authors_tf_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
|
||||||
|
|
||||||
|
${authors_10k_output}/hdbscan/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py hdbscan_clustering.py
|
||||||
|
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/hdbscan --savefile=${authors_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
|
||||||
|
|
||||||
|
${terms_10k_output}/hdbscan/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py hdbscan_clustering.py
|
||||||
|
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/hdbscan --savefile=${terms_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
|
||||||
|
|
||||||
|
${authors_tf_10k_output}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py hdbscan_clustering.py
|
||||||
|
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/hdbscan --savefile=${authors_tf_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
|
||||||
|
|
||||||
|
|
||||||
affinity_selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"
|
## LSI Models
|
||||||
$(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
|
${authors_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py kmeans_clustering.py
|
||||||
$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/affinity $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20
|
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/kmeans --savefile=${authors_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
|
||||||
|
|
||||||
$(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
|
${terms_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py kmeans_clustering.py
|
||||||
$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/affinity $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20
|
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/kmeans --savefile=${terms_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
|
||||||
|
|
||||||
$(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
|
${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py
|
||||||
$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/affinity $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20
|
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
|
||||||
|
|
||||||
clean:
|
${authors_10k_output_lsi}/affinity/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py affinity_clustering.py
|
||||||
rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv
|
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/affinity --savefile=${authors_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
|
||||||
rm -f $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv
|
|
||||||
rm -f $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv
|
|
||||||
rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv
|
|
||||||
rm -f $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv
|
|
||||||
rm -f $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv
|
|
||||||
|
|
||||||
PHONY: clean
|
${terms_10k_output_lsi}/affinity/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py affinity_clustering.py
|
||||||
|
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/affinity --savefile=${terms_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
|
||||||
|
|
||||||
|
${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py
|
||||||
|
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
|
||||||
|
|
||||||
|
${authors_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py hdbscan_clustering.py
|
||||||
|
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/hdbscan --savefile=${authors_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
|
||||||
|
|
||||||
|
${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py hdbscan_clustering.py
|
||||||
|
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/hdbscan --savefile=${terms_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
|
||||||
|
|
||||||
|
${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
|
||||||
|
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
clean_affinity:
|
||||||
|
rm -f ${authors_10k_output}/affinity/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output}/affinity/selection_data.csv
|
||||||
|
|
||||||
|
clean_kmeans:
|
||||||
|
rm -f ${authors_10k_output}/kmeans/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output}/kmeans/selection_data.csv
|
||||||
|
|
||||||
|
clean_hdbscan:
|
||||||
|
rm -f ${authors_10k_output}/hdbscan/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
clean_authors:
|
||||||
|
rm -f ${authors_10k_output}/affinity/selection_data.csv
|
||||||
|
rm -f ${authors_10k_output}/kmeans/selection_data.csv
|
||||||
|
rm -f ${authors_10k_output}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
clean_authors_tf:
|
||||||
|
rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
clean_terms:
|
||||||
|
rm -f ${terms_10k_output}/affinity/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output}/kmeans/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
clean_lsi_affinity:
|
||||||
|
rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
|
||||||
|
|
||||||
|
clean_lsi_kmeans:
|
||||||
|
rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
|
||||||
|
|
||||||
|
clean_lsi_hdbscan:
|
||||||
|
rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
clean_lsi_authors:
|
||||||
|
rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
|
||||||
|
rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
|
||||||
|
rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
clean_lsi_authors_tf:
|
||||||
|
rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
|
||||||
|
rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
clean_lsi_terms:
|
||||||
|
rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
|
||||||
|
rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
|
||||||
|
|
||||||
|
clean: clean_affinity clean_kmeans clean_hdbscan
|
||||||
|
|
||||||
|
PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k
|
||||||
|
|
||||||
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
|
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
|
||||||
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
|
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
|
||||||
|
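For orientation (not part of the commit): one of the refactored recipes, expressed as the equivalent direct Python call. The paths and grid values are the ones defined above in the Makefile (terms_10k kmeans target and kmeans_selection_grid); the real recipe runs the same thing through the srun_singularity wrapper and fire's command-line parsing.

from kmeans_clustering import run_kmeans_grid_sweep

run_kmeans_grid_sweep(
    savefile="/gscratch/comdata/output/reddit_clustering/subreddit_comment_terms_10k/kmeans/selection_data.csv",
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather",
    outpath="/gscratch/comdata/output/reddit_clustering/subreddit_comment_terms_10k/kmeans",
    n_clusters=[100, 500, 1000, 1250, 1500, 1750, 2000],  # kmeans_selection_grid
    n_inits=[10],
    max_iters=[3000])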
@ -1,16 +1,12 @@
|
|||||||
from sklearn.metrics import silhouette_score
|
|
||||||
from sklearn.cluster import AffinityPropagation
|
from sklearn.cluster import AffinityPropagation
|
||||||
from functools import partial
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
|
from clustering_base import clustering_result, clustering_job
|
||||||
from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
|
from grid_sweep import grid_sweep
|
||||||
from multiprocessing import Pool, cpu_count, Array, Process
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from itertools import product, starmap
|
from itertools import product, starmap
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import fire
|
import fire
|
||||||
import sys
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
|
# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -21,10 +17,6 @@ class affinity_clustering_result(clustering_result):
|
|||||||
preference:float
|
preference:float
|
||||||
max_iter:int
|
max_iter:int
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class affinity_job(clustering_job):
|
class affinity_job(clustering_job):
|
||||||
def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
|
def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
|
||||||
super().__init__(infile,
|
super().__init__(infile,
|
||||||
@ -67,21 +59,6 @@ class affinity_job(clustering_job):
|
|||||||
|
|
||||||
return self.result
|
return self.result
|
||||||
|
|
||||||
class affinity_lsi_job(affinity_job, lsi_mixin):
|
|
||||||
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
|
|
||||||
super().__init__(infile,
|
|
||||||
outpath,
|
|
||||||
name,
|
|
||||||
*args,
|
|
||||||
**kwargs)
|
|
||||||
super().set_lsi_dims(lsi_dims)
|
|
||||||
|
|
||||||
def get_info(self):
|
|
||||||
result = super().get_info()
|
|
||||||
self.result = affinity_clustering_result_lsi(**result.__dict__,
|
|
||||||
lsi_dimensions=self.lsi_dims)
|
|
||||||
return self.result
|
|
||||||
|
|
||||||
class affinity_grid_sweep(grid_sweep):
|
class affinity_grid_sweep(grid_sweep):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
inpath,
|
inpath,
|
||||||
@ -104,49 +81,29 @@ class affinity_grid_sweep(grid_sweep):
|
|||||||
|
|
||||||
return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
|
return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
|
||||||
|
|
||||||
class _affinity_lsi_grid_sweep(grid_sweep):
|
def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5]):
|
||||||
def __init__(self,
|
"""Run affinity clustering once or more with different parameters.
|
||||||
inpath,
|
|
||||||
outpath,
|
|
||||||
lsi_dim,
|
|
||||||
*args,
|
|
||||||
**kwargs):
|
|
||||||
self.lsi_dim = lsi_dim
|
|
||||||
self.jobtype = affinity_lsi_job
|
|
||||||
super().__init__(self.jobtype,
|
|
||||||
inpath,
|
|
||||||
outpath,
|
|
||||||
self.namer,
|
|
||||||
self.lsi_dim,
|
|
||||||
*args,
|
|
||||||
**kwargs)
|
|
||||||
|
|
||||||
def namer(self, *args, **kwargs):
|
|
||||||
s = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
|
|
||||||
s += f"_lsi-{self.lsi_dim}"
|
|
||||||
return s
|
|
||||||
|
|
||||||
class affinity_lsi_grid_sweep(lsi_grid_sweep):
|
|
||||||
def __init__(self,
|
|
||||||
inpath,
|
|
||||||
lsi_dims,
|
|
||||||
outpath,
|
|
||||||
dampings=[0.9],
|
|
||||||
max_iters=[10000],
|
|
||||||
convergence_iters=[30],
|
|
||||||
preference_quantiles=[0.5]):
|
|
||||||
|
|
||||||
super().__init__(affinity_lsi_job,
|
|
||||||
_affinity_lsi_grid_sweep,
|
|
||||||
inpath,
|
|
||||||
lsi_dims,
|
|
||||||
outpath,
|
|
||||||
dampings,
|
|
||||||
max_iters,
|
|
||||||
convergence_iters,
|
|
||||||
preference_quantiles)
|
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv>
|
||||||
|
|
||||||
|
Keyword arguments:
|
||||||
|
savefile: path to save the metadata and diagnostics
|
||||||
|
inpath: path to feather data containing a labeled matrix of subreddit similarities.
|
||||||
|
outpath: path to output fit affinity clusterings.
|
||||||
|
dampings: one or more numbers in [0.5, 1); the damping parameter in affinity propagation clustering.
|
||||||
|
preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
|
||||||
|
convergence_iters: one or more integers giving the number of iterations without improvement before stopping.
|
||||||
|
max_iters: one or more different maximum iteration counts.
|
||||||
|
"""
|
||||||
|
obj = affinity_grid_sweep(inpath,
|
||||||
|
outpath,
|
||||||
|
map(float,dampings),
|
||||||
|
map(int,max_iters),
|
||||||
|
map(int,convergence_iters),
|
||||||
|
map(float,preference_quantiles))
|
||||||
|
obj.run(1)
|
||||||
|
obj.save(savefile)
|
||||||
|
|
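A minimal sketch (not in the commit) of calling this entry point from Python instead of through fire; the output paths are placeholders and the parameter lists are a subset of the Makefile's affinity_selection_grid.

from affinity_clustering import run_affinity_grid_sweep

run_affinity_grid_sweep(
    savefile="out/affinity/selection_data.csv",   # placeholder
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather",
    outpath="out/affinity",                        # placeholder
    dampings=[0.5, 0.85, 0.99],
    preference_quantiles=[0.1, 0.5, 0.9],
    convergence_iters=[30])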
||||||
def test_select_affinity_clustering():
|
def test_select_affinity_clustering():
|
||||||
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
|
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
|
||||||
@ -169,7 +126,4 @@ def test_select_affinity_clustering():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
fire.Fire{'grid_sweep':affinity_grid_sweep,
|
fire.Fire(run_affinity_grid_sweep)
|
||||||
'grid_sweep_lsi':affinity_lsi_grid_sweep
|
|
||||||
'cluster':affinity_job,
|
|
||||||
'cluster_lsi':affinity_lsi_job}
|
|
||||||
|
clustering/affinity_clustering_lsi.py (new file, 99 lines)
@ -0,0 +1,99 @@
|
|||||||
|
import fire
|
||||||
|
from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep
|
||||||
|
from grid_sweep import grid_sweep
|
||||||
|
from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class affinity_lsi_job(affinity_job, lsi_mixin):
|
||||||
|
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
|
||||||
|
super().__init__(infile,
|
||||||
|
outpath,
|
||||||
|
name,
|
||||||
|
*args,
|
||||||
|
**kwargs)
|
||||||
|
super().set_lsi_dims(lsi_dims)
|
||||||
|
|
||||||
|
def get_info(self):
|
||||||
|
result = super().get_info()
|
||||||
|
self.result = affinity_clustering_result_lsi(**result.__dict__,
|
||||||
|
lsi_dimensions=self.lsi_dims)
|
||||||
|
return self.result
|
||||||
|
|
||||||
|
class affinity_lsi_grid_sweep(lsi_grid_sweep):
|
||||||
|
def __init__(self,
|
||||||
|
inpath,
|
||||||
|
lsi_dims,
|
||||||
|
outpath,
|
||||||
|
dampings=[0.9],
|
||||||
|
max_iters=[10000],
|
||||||
|
convergence_iters=[30],
|
||||||
|
preference_quantiles=[0.5]):
|
||||||
|
|
||||||
|
super().__init__(affinity_lsi_job,
|
||||||
|
_affinity_lsi_grid_sweep,
|
||||||
|
inpath,
|
||||||
|
lsi_dims,
|
||||||
|
outpath,
|
||||||
|
dampings,
|
||||||
|
max_iters,
|
||||||
|
convergence_iters,
|
||||||
|
preference_quantiles)
|
||||||
|
|
||||||
|
|
||||||
|
class _affinity_lsi_grid_sweep(grid_sweep):
|
||||||
|
def __init__(self,
|
||||||
|
inpath,
|
||||||
|
outpath,
|
||||||
|
lsi_dim,
|
||||||
|
*args,
|
||||||
|
**kwargs):
|
||||||
|
self.lsi_dim = lsi_dim
|
||||||
|
self.jobtype = affinity_lsi_job
|
||||||
|
super().__init__(self.jobtype,
|
||||||
|
inpath,
|
||||||
|
outpath,
|
||||||
|
self.namer,
|
||||||
|
self.lsi_dim,
|
||||||
|
*args,
|
||||||
|
**kwargs)
|
||||||
|
|
||||||
|
def namer(self, *args, **kwargs):
|
||||||
|
s = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
|
||||||
|
s += f"_lsi-{self.lsi_dim}"
|
||||||
|
return s
|
||||||
|
|
||||||
|
def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all'):
|
||||||
|
"""Run affinity clustering once or more with different parameters.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
affinity_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
|
||||||
|
|
||||||
|
Keyword arguments:
|
||||||
|
savefile: path to save the metadata and diagnostics
|
||||||
|
inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
|
||||||
|
outpath: path to output fit affinity clusterings.
|
||||||
|
dampings: one or more numbers in [0.5, 1); the damping parameter in affinity propagation clustering.
|
||||||
|
preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
|
||||||
|
convergence_iters: one or more integers giving the number of iterations without improvement before stopping.
|
||||||
|
max_iters: one or more different maximum iteration counts.
|
||||||
|
lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
|
||||||
|
"""
|
||||||
|
|
||||||
|
obj = affinity_lsi_grid_sweep(inpath,
|
||||||
|
lsi_dimensions,
|
||||||
|
outpath,
|
||||||
|
map(float,dampings),
|
||||||
|
map(int,max_iters),
|
||||||
|
map(int,convergence_iters),
|
||||||
|
map(float,preference_quantiles))
|
||||||
|
|
||||||
|
obj.run(1)
|
||||||
|
obj.save(savefile)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
fire.Fire(run_affinity_lsi_grid_sweep)
|
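A hedged usage sketch for the new LSI entry point (output paths are placeholders; the input directory is the LSI similarity folder referenced in the Makefile).

from affinity_clustering_lsi import run_affinity_lsi_grid_sweep

run_affinity_lsi_grid_sweep(
    savefile="out/affinity_lsi/selection_data.csv",   # placeholder
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k_LSI",
    outpath="out/affinity_lsi",                        # placeholder
    dampings=[0.85, 0.99],
    preference_quantiles=[0.5],
    lsi_dimensions='all')   # sweep every <dim>.feather found under inpath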
@ -3,59 +3,6 @@ import numpy as np
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from sklearn.metrics import silhouette_score, silhouette_samples
|
from sklearn.metrics import silhouette_score, silhouette_samples
|
||||||
from itertools import product, chain
|
|
||||||
from multiprocessing import Pool, cpu_count
|
|
||||||
|
|
||||||
def sim_to_dist(mat):
|
|
||||||
dist = 1-mat
|
|
||||||
dist[dist < 0] = 0
|
|
||||||
np.fill_diagonal(dist,0)
|
|
||||||
return dist
|
|
||||||
|
|
||||||
class grid_sweep:
|
|
||||||
def __init__(self, jobtype, inpath, outpath, namer, *args):
|
|
||||||
self.jobtype = jobtype
|
|
||||||
self.namer = namer
|
|
||||||
grid = list(product(*args))
|
|
||||||
inpath = Path(inpath)
|
|
||||||
outpath = Path(outpath)
|
|
||||||
self.hasrun = False
|
|
||||||
self.grid = [(inpath,outpath,namer(*g)) + g for g in grid]
|
|
||||||
self.jobs = [jobtype(*g) for g in self.grid]
|
|
||||||
|
|
||||||
def run(self, cores=20):
|
|
||||||
if cores is not None and cores > 1:
|
|
||||||
with Pool(cores) as pool:
|
|
||||||
infos = pool.map(self.jobtype.get_info, self.jobs)
|
|
||||||
else:
|
|
||||||
infos = map(self.jobtype.get_info, self.jobs)
|
|
||||||
|
|
||||||
self.infos = pd.DataFrame(infos)
|
|
||||||
self.hasrun = True
|
|
||||||
|
|
||||||
def save(self, outcsv):
|
|
||||||
if not self.hasrun:
|
|
||||||
self.run()
|
|
||||||
outcsv = Path(outcsv)
|
|
||||||
outcsv.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
self.infos.to_csv(outcsv)
|
|
||||||
|
|
||||||
|
|
||||||
class lsi_grid_sweep(grid_sweep):
|
|
||||||
def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs):
|
|
||||||
self.jobtype = jobtype
|
|
||||||
self.subsweep = subsweep
|
|
||||||
inpath = Path(inpath)
|
|
||||||
if lsi_dimensions == 'all':
|
|
||||||
lsi_paths = list(inpath.glob("*"))
|
|
||||||
else:
|
|
||||||
lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
|
|
||||||
|
|
||||||
lsi_nums = [p.stem for p in lsi_paths]
|
|
||||||
self.hasrun = False
|
|
||||||
self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
|
|
||||||
self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
|
|
||||||
|
|
||||||
|
|
||||||
# this is meant to be an interface, not created directly
|
# this is meant to be an interface, not created directly
|
||||||
class clustering_job:
|
class clustering_job:
|
||||||
@ -86,19 +33,24 @@ class clustering_job:
|
|||||||
name=self.name,
|
name=self.name,
|
||||||
n_clusters=self.n_clusters,
|
n_clusters=self.n_clusters,
|
||||||
n_isolates=self.n_isolates,
|
n_isolates=self.n_isolates,
|
||||||
silhouette_samples = str(self.silsampout.resolve())
|
silhouette_samples = self.silsampout
|
||||||
)
|
)
|
||||||
return self.result
|
return self.result
|
||||||
|
|
||||||
def silhouette(self):
|
def silhouette(self):
|
||||||
isolates = self.clustering.labels_ == -1
|
isolates = self.clustering.labels_ == -1
|
||||||
scoremat = self.mat[~isolates][:,~isolates]
|
scoremat = self.mat[~isolates][:,~isolates]
|
||||||
score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed')
|
if scoremat.shape[0] > 0:
|
||||||
silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed')
|
score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed')
|
||||||
silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp})
|
silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed')
|
||||||
self.outpath.mkdir(parents=True, exist_ok=True)
|
silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp})
|
||||||
self.silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather")
|
self.outpath.mkdir(parents=True, exist_ok=True)
|
||||||
silhouette_samp.to_feather(self.silsampout)
|
silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather")
|
||||||
|
self.silsampout = silsampout.resolve()
|
||||||
|
silhouette_samp.to_feather(self.silsampout)
|
||||||
|
else:
|
||||||
|
score = None
|
||||||
|
self.silsampout = None
|
||||||
return score
|
return score
|
||||||
|
|
||||||
def read_distance_mat(self, similarities, use_threads=True):
|
def read_distance_mat(self, similarities, use_threads=True):
|
||||||
@ -139,11 +91,6 @@ class clustering_job:
|
|||||||
|
|
||||||
return cluster_data
|
return cluster_data
|
||||||
|
|
||||||
|
|
||||||
class lsi_mixin():
|
|
||||||
def set_lsi_dims(self, lsi_dims):
|
|
||||||
self.lsi_dims = lsi_dims
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class clustering_result:
|
class clustering_result:
|
||||||
outpath:Path
|
outpath:Path
|
||||||
@ -152,7 +99,3 @@ class clustering_result:
|
|||||||
n_clusters:int
|
n_clusters:int
|
||||||
n_isolates:int
|
n_isolates:int
|
||||||
silhouette_samples:str
|
silhouette_samples:str
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class lsi_result_mixin:
|
|
||||||
lsi_dimensions:int
|
|
||||||
|
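To illustrate the pattern that clustering_base now exposes (a hypothetical example, mirroring the *_lsi modules added below): a new algorithm records its diagnostics by extending the clustering_result dataclass with its own hyperparameter fields, and implements its fitting logic in a clustering_job subclass.

from dataclasses import dataclass
from clustering_base import clustering_result

@dataclass
class dbscan_clustering_result(clustering_result):  # hypothetical algorithm
    eps: float            # extra hyperparameters recorded alongside the
    min_samples: int      # common fields (outpath, name, n_clusters, ...)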
clustering/grid_sweep.py (new file, 32 lines)
@ -0,0 +1,32 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from multiprocessing import Pool, cpu_count
|
||||||
|
from itertools import product, chain
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class grid_sweep:
|
||||||
|
def __init__(self, jobtype, inpath, outpath, namer, *args):
|
||||||
|
self.jobtype = jobtype
|
||||||
|
self.namer = namer
|
||||||
|
grid = list(product(*args))
|
||||||
|
inpath = Path(inpath)
|
||||||
|
outpath = Path(outpath)
|
||||||
|
self.hasrun = False
|
||||||
|
self.grid = [(inpath,outpath,namer(*g)) + g for g in grid]
|
||||||
|
self.jobs = [jobtype(*g) for g in self.grid]
|
||||||
|
|
||||||
|
def run(self, cores=20):
|
||||||
|
if cores is not None and cores > 1:
|
||||||
|
with Pool(cores) as pool:
|
||||||
|
infos = pool.map(self.jobtype.get_info, self.jobs)
|
||||||
|
else:
|
||||||
|
infos = map(self.jobtype.get_info, self.jobs)
|
||||||
|
|
||||||
|
self.infos = pd.DataFrame(infos)
|
||||||
|
self.hasrun = True
|
||||||
|
|
||||||
|
def save(self, outcsv):
|
||||||
|
if not self.hasrun:
|
||||||
|
self.run()
|
||||||
|
outcsv = Path(outcsv)
|
||||||
|
outcsv.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.infos.to_csv(outcsv)
|
@ -1,5 +1,5 @@
|
|||||||
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
|
from clustering_base import clustering_result, clustering_job
|
||||||
from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
|
from grid_sweep import grid_sweep
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import hdbscan
|
import hdbscan
|
||||||
from sklearn.neighbors import NearestNeighbors
|
from sklearn.neighbors import NearestNeighbors
|
||||||
@ -7,11 +7,8 @@ import plotnine as pn
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from itertools import product, starmap, chain
|
from itertools import product, starmap, chain
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.metrics import silhouette_score, silhouette_samples
|
from multiprocessing import cpu_count
|
||||||
from pathlib import Path
|
|
||||||
from multiprocessing import Pool, cpu_count
|
|
||||||
import fire
|
import fire
|
||||||
from pyarrow.feather import write_feather
|
|
||||||
|
|
||||||
def test_select_hdbscan_clustering():
|
def test_select_hdbscan_clustering():
|
||||||
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
|
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
|
||||||
@ -40,28 +37,6 @@ def test_select_hdbscan_clustering():
|
|||||||
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
|
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
|
||||||
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
|
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
|
||||||
# c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)
|
# c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)
|
||||||
|
|
||||||
class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
|
|
||||||
def __init__(self,
|
|
||||||
inpath,
|
|
||||||
lsi_dims,
|
|
||||||
outpath,
|
|
||||||
min_cluster_sizes,
|
|
||||||
min_samples,
|
|
||||||
cluster_selection_epsilons,
|
|
||||||
cluster_selection_methods
|
|
||||||
):
|
|
||||||
|
|
||||||
super().__init__(hdbscan_lsi_job,
|
|
||||||
_hdbscan_lsi_grid_sweep,
|
|
||||||
inpath,
|
|
||||||
lsi_dims,
|
|
||||||
outpath,
|
|
||||||
min_cluster_sizes,
|
|
||||||
min_samples,
|
|
||||||
cluster_selection_epsilons,
|
|
||||||
cluster_selection_methods)
|
|
||||||
|
|
||||||
class hdbscan_grid_sweep(grid_sweep):
|
class hdbscan_grid_sweep(grid_sweep):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
inpath,
|
inpath,
|
||||||
@ -78,25 +53,6 @@ class hdbscan_grid_sweep(grid_sweep):
|
|||||||
cluster_selection_method):
|
cluster_selection_method):
|
||||||
return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
|
return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
|
||||||
|
|
||||||
|
|
||||||
class _hdbscan_lsi_grid_sweep(grid_sweep):
|
|
||||||
def __init__(self,
|
|
||||||
inpath,
|
|
||||||
outpath,
|
|
||||||
lsi_dim,
|
|
||||||
*args,
|
|
||||||
**kwargs):
|
|
||||||
|
|
||||||
self.lsi_dim = lsi_dim
|
|
||||||
self.jobtype = hdbscan_lsi_job
|
|
||||||
super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def namer(self, *args, **kwargs):
|
|
||||||
s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
|
|
||||||
s += f"_lsi-{self.lsi_dim}"
|
|
||||||
return s
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class hdbscan_clustering_result(clustering_result):
|
class hdbscan_clustering_result(clustering_result):
|
||||||
min_cluster_size:int
|
min_cluster_size:int
|
||||||
@ -104,10 +60,6 @@ class hdbscan_clustering_result(clustering_result):
|
|||||||
cluster_selection_epsilon:float
|
cluster_selection_epsilon:float
|
||||||
cluster_selection_method:str
|
cluster_selection_method:str
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class hdbscan_job(clustering_job):
|
class hdbscan_job(clustering_job):
|
||||||
def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
|
def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
|
||||||
super().__init__(infile,
|
super().__init__(infile,
|
||||||
@ -148,121 +100,29 @@ class hdbscan_job(clustering_job):
|
|||||||
cluster_selection_method=self.cluster_selection_method)
|
cluster_selection_method=self.cluster_selection_method)
|
||||||
return self.result
|
return self.result
|
||||||
|
|
||||||
class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
|
def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
|
||||||
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
|
"""Run hdbscan clustering once or more with different parameters.
|
||||||
super().__init__(
|
|
||||||
infile,
|
|
||||||
outpath,
|
|
||||||
name,
|
|
||||||
*args,
|
|
||||||
**kwargs)
|
|
||||||
super().set_lsi_dims(lsi_dims)
|
|
||||||
|
|
||||||
def get_info(self):
|
|
||||||
partial_result = super().get_info()
|
|
||||||
self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
|
|
||||||
lsi_dimensions=self.lsi_dims)
|
|
||||||
return self.result
|
|
||||||
|
|
||||||
# def select_hdbscan_clustering(inpath,
|
|
||||||
# outpath,
|
|
||||||
# outfile=None,
|
|
||||||
# min_cluster_sizes=[2],
|
|
||||||
# min_samples=[1],
|
|
||||||
# cluster_selection_epsilons=[0],
|
|
||||||
# cluster_selection_methods=['eom'],
|
|
||||||
# lsi_dimensions='all'
|
|
||||||
# ):
|
|
||||||
|
|
||||||
# inpath = Path(inpath)
|
|
||||||
# outpath = Path(outpath)
|
|
||||||
# outpath.mkdir(exist_ok=True, parents=True)
|
|
||||||
|
|
||||||
# if lsi_dimensions is None:
|
Usage:
|
||||||
# lsi_paths = [inpath]
|
hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">
|
||||||
# elif lsi_dimensions == 'all':
|
|
||||||
# lsi_paths = list(inpath.glob("*"))
|
|
||||||
|
|
||||||
# else:
|
Keyword arguments:
|
||||||
# lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
|
savefile: path to save the metadata and diagnostics
|
||||||
|
inpath: path to feather data containing a labeled matrix of subreddit similarities.
|
||||||
# if lsi_dimensions is not None:
|
outpath: path to output fit hdbscan clusterings.
|
||||||
# lsi_nums = [p.stem for p in lsi_paths]
|
min_cluster_sizes: one or more integers indicating the minimum cluster size
|
||||||
# else:
|
min_samples: one or more integers indicating the minimum number of samples used in the algorithm
|
||||||
# lsi_nums = [None]
|
cluster_selection_epsilons: one or more similarity thresholds for the transition from dbscan to hdbscan
|
||||||
# grid = list(product(lsi_nums,
|
cluster_selection_methods: one or more of "eom" or "leaf"; "eom" gives larger clusters.
|
||||||
# min_cluster_sizes,
|
"""
|
||||||
# min_samples,
|
obj = hdbscan_grid_sweep(inpath,
|
||||||
# cluster_selection_epsilons,
|
outpath,
|
||||||
# cluster_selection_methods))
|
map(int,min_cluster_sizes),
|
||||||
|
map(int,min_samples),
|
||||||
# # fix the output file names
|
map(float,cluster_selection_epsilons),
|
||||||
# names = list(map(lambda t:'_'.join(map(str,t)),grid))
|
cluster_selection_methods)
|
||||||
|
obj.run()
|
||||||
# grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]
|
obj.save(savefile)
|
||||||
|
|
||||||
# with Pool(int(cpu_count()/4)) as pool:
|
|
||||||
# mods = starmap(hdbscan_clustering, grid)
|
|
||||||
|
|
||||||
# res = pd.DataFrame(mods)
|
|
||||||
# if outfile is None:
|
|
||||||
# outfile = outpath / "selection_data.csv"
|
|
||||||
|
|
||||||
# res.to_csv(outfile)
|
|
||||||
|
|
||||||
# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
|
|
||||||
# subreddits, mat = read_similarity_mat(similarities)
|
|
||||||
# mat = sim_to_dist(mat)
|
|
||||||
# clustering = _hdbscan_clustering(mat,
|
|
||||||
# min_cluster_size=min_cluster_size,
|
|
||||||
# min_samples=min_samples,
|
|
||||||
# cluster_selection_epsilon=cluster_selection_epsilon,
|
|
||||||
# cluster_selection_method=cluster_selection_method,
|
|
||||||
# metric='precomputed',
|
|
||||||
# core_dist_n_jobs=cpu_count()
|
|
||||||
# )
|
|
||||||
|
|
||||||
# cluster_data = process_clustering_result(clustering, subreddits)
|
|
||||||
# isolates = clustering.labels_ == -1
|
|
||||||
# scoremat = mat[~isolates][:,~isolates]
|
|
||||||
# score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
|
|
||||||
# cluster_data.to_feather(output)
|
|
||||||
# silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
|
|
||||||
# silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp})
|
|
||||||
# silsampout = output.parent / ("silhouette_samples" + output.name)
|
|
||||||
# silhouette_samp.to_feather(silsampout)
|
|
||||||
|
|
||||||
# result = hdbscan_clustering_result(outpath=output,
|
|
||||||
# silhouette_samples=silsampout,
|
|
||||||
# silhouette_score=score,
|
|
||||||
# name=name,
|
|
||||||
# min_cluster_size=min_cluster_size,
|
|
||||||
# min_samples=min_samples,
|
|
||||||
# cluster_selection_epsilon=cluster_selection_epsilon,
|
|
||||||
# cluster_selection_method=cluster_selection_method,
|
|
||||||
# lsi_dimensions=lsi_dim,
|
|
||||||
# n_isolates=isolates.sum(),
|
|
||||||
# n_clusters=len(set(clustering.labels_))
|
|
||||||
# )
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# return(result)
|
|
||||||
|
|
||||||
# # for all runs we should try cluster_selection_epsilon = None
|
|
||||||
# # for terms we should try cluster_selection_epsilon around 0.56-0.66
|
|
||||||
# # for authors we should try cluster_selection_epsilon around 0.98-0.99
|
|
||||||
# def _hdbscan_clustering(mat, *args, **kwargs):
|
|
||||||
# print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
|
|
||||||
|
|
||||||
# print(mat)
|
|
||||||
# clusterer = hdbscan.HDBSCAN(*args,
|
|
||||||
# **kwargs,
|
|
||||||
# )
|
|
||||||
|
|
||||||
# clustering = clusterer.fit(mat.astype('double'))
|
|
||||||
|
|
||||||
# return(clustering)
|
|
||||||
|
|
||||||
def KNN_distances_plot(mat,outname,k=2):
|
def KNN_distances_plot(mat,outname,k=2):
|
||||||
nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
|
nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
|
||||||
@ -293,10 +153,7 @@ def make_KNN_plots():
|
|||||||
KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
|
KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
fire.Fire{'grid_sweep':hdbscan_grid_sweep,
|
fire.Fire(run_hdbscan_grid_sweep)
|
||||||
'grid_sweep_lsi':hdbscan_lsi_grid_sweep
|
|
||||||
'cluster':hdbscan_job,
|
|
||||||
'cluster_lsi':hdbscan_lsi_job}
|
|
||||||
|
|
||||||
# test_select_hdbscan_clustering()
|
# test_select_hdbscan_clustering()
|
||||||
#fire.Fire(select_hdbscan_clustering)
|
#fire.Fire(select_hdbscan_clustering)
|
||||||
|
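A usage sketch for the new hdbscan entry point (output paths are placeholders; the grid values come from hdbscan_selection_grid in the Makefile above).

from hdbscan_clustering import run_hdbscan_grid_sweep

run_hdbscan_grid_sweep(
    savefile="out/hdbscan/selection_data.csv",   # placeholder
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather",
    outpath="out/hdbscan",                        # placeholder
    min_cluster_sizes=[2, 3, 4, 5],
    min_samples=[2, 3, 4, 5],
    cluster_selection_epsilons=[0, 0.01, 0.05, 0.1, 0.15, 0.2],
    cluster_selection_methods=['eom', 'leaf'])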
clustering/hdbscan_clustering_lsi.py (new file, 101 lines)
@ -0,0 +1,101 @@
|
|||||||
|
from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result
|
||||||
|
from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin
|
||||||
|
from grid_sweep import grid_sweep
|
||||||
|
import fire
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
|
||||||
|
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
|
||||||
|
super().__init__(
|
||||||
|
infile,
|
||||||
|
outpath,
|
||||||
|
name,
|
||||||
|
*args,
|
||||||
|
**kwargs)
|
||||||
|
super().set_lsi_dims(lsi_dims)
|
||||||
|
|
||||||
|
def get_info(self):
|
||||||
|
partial_result = super().get_info()
|
||||||
|
self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
|
||||||
|
lsi_dimensions=self.lsi_dims)
|
||||||
|
return self.result
|
||||||
|
|
||||||
|
class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
|
||||||
|
def __init__(self,
|
||||||
|
inpath,
|
||||||
|
lsi_dims,
|
||||||
|
outpath,
|
||||||
|
min_cluster_sizes,
|
||||||
|
min_samples,
|
||||||
|
cluster_selection_epsilons,
|
||||||
|
cluster_selection_methods
|
||||||
|
):
|
||||||
|
|
||||||
|
super().__init__(hdbscan_lsi_job,
|
||||||
|
_hdbscan_lsi_grid_sweep,
|
||||||
|
inpath,
|
||||||
|
lsi_dims,
|
||||||
|
outpath,
|
||||||
|
min_cluster_sizes,
|
||||||
|
min_samples,
|
||||||
|
cluster_selection_epsilons,
|
||||||
|
cluster_selection_methods)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class _hdbscan_lsi_grid_sweep(grid_sweep):
|
||||||
|
def __init__(self,
|
||||||
|
inpath,
|
||||||
|
outpath,
|
||||||
|
lsi_dim,
|
||||||
|
*args,
|
||||||
|
**kwargs):
|
||||||
|
print(args)
|
||||||
|
print(kwargs)
|
||||||
|
|
||||||
|
self.lsi_dim = lsi_dim
|
||||||
|
self.jobtype = hdbscan_lsi_job
|
||||||
|
super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def namer(self, *args, **kwargs):
|
||||||
|
s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
|
||||||
|
s += f"_lsi-{self.lsi_dim}"
|
||||||
|
return s
|
||||||
|
|
||||||
|
def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'],lsi_dimensions='all'):
|
||||||
|
"""Run hdbscan clustering once or more with different parameters.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf"> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
|
||||||
|
|
||||||
|
Keyword arguments:
|
||||||
|
savefile: path to save the metadata and diagnostics
|
||||||
|
inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
|
||||||
|
outpath: path to output fit clusterings.
|
||||||
|
min_cluster_sizes: one or more integers indicating the minimum cluster size
|
||||||
|
min_samples: one or more integers indicating the minimum number of samples used in the algorithm
|
||||||
|
cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
|
||||||
|
cluster_selection_methods: one or more of "eom" or "leaf"; "eom" gives larger clusters.
|
||||||
|
lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
|
||||||
|
"""
|
||||||
|
|
||||||
|
obj = hdbscan_lsi_grid_sweep(inpath,
|
||||||
|
lsi_dimensions,
|
||||||
|
outpath,
|
||||||
|
map(int,min_cluster_sizes),
|
||||||
|
map(int,min_samples),
|
||||||
|
map(float,cluster_selection_epsilons),
|
||||||
|
cluster_selection_methods
|
||||||
|
)
|
||||||
|
|
||||||
|
obj.run(10)
|
||||||
|
obj.save(savefile)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
fire.Fire(run_hdbscan_lsi_grid_sweep)
|
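And the corresponding LSI variant (paths are placeholders; lsi_dimensions='all' sweeps every dimension directory found under inpath).

from hdbscan_clustering_lsi import run_hdbscan_lsi_grid_sweep

run_hdbscan_lsi_grid_sweep(
    savefile="out/hdbscan_lsi/selection_data.csv",   # placeholder
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k_LSI",
    outpath="out/hdbscan_lsi",                        # placeholder
    min_cluster_sizes=[2, 3],
    min_samples=[2, 3],
    cluster_selection_epsilons=[0, 0.1],
    cluster_selection_methods=['eom'],
    lsi_dimensions='all')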
@ -1,11 +1,9 @@
|
|||||||
from sklearn.cluster import KMeans
|
from sklearn.cluster import KMeans
|
||||||
import fire
|
import fire
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from multiprocessing import cpu_count
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
|
from clustering_base import clustering_result, clustering_job
|
||||||
from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep
|
from grid_sweep import grid_sweep
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class kmeans_clustering_result(clustering_result):
|
class kmeans_clustering_result(clustering_result):
|
||||||
@ -13,10 +11,6 @@ class kmeans_clustering_result(clustering_result):
|
|||||||
n_init:int
|
n_init:int
|
||||||
max_iter:int
|
max_iter:int
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
|
|
||||||
pass
|
|
||||||
|
|
||||||
class kmeans_job(clustering_job):
|
class kmeans_job(clustering_job):
|
||||||
def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
|
def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
|
||||||
super().__init__(infile,
|
super().__init__(infile,
|
||||||
@ -45,28 +39,13 @@ class kmeans_job(clustering_job):
|
|||||||
def get_info(self):
|
def get_info(self):
|
||||||
result = super().get_info()
|
result = super().get_info()
|
||||||
self.result = kmeans_clustering_result(**result.__dict__,
|
self.result = kmeans_clustering_result(**result.__dict__,
|
||||||
n_init=n_init,
|
n_init=self.n_init,
|
||||||
max_iter=max_iter)
|
max_iter=self.max_iter)
|
||||||
return self.result
|
return self.result
|
||||||
|
|
||||||
|
|
||||||
class kmeans_lsi_job(kmeans_job, lsi_mixin):
|
|
||||||
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
|
|
||||||
super().__init__(infile,
|
|
||||||
outpath,
|
|
||||||
name,
|
|
||||||
*args,
|
|
||||||
**kwargs)
|
|
||||||
super().set_lsi_dims(lsi_dims)
|
|
||||||
|
|
||||||
def get_info(self):
|
|
||||||
result = super().get_info()
|
|
||||||
self.result = kmeans_clustering_result_lsi(**result.__dict__,
|
|
||||||
lsi_dimensions=self.lsi_dims)
|
|
||||||
return self.result
|
|
||||||
|
|
||||||
|
|
||||||
class kmeans_grid_sweep(grid_sweep):
|
class kmeans_grid_sweep(grid_sweep):
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
inpath,
|
inpath,
|
||||||
outpath,
|
outpath,
|
||||||
@ -80,49 +59,7 @@ class kmeans_grid_sweep(grid_sweep):
|
|||||||
max_iter):
|
max_iter):
|
||||||
return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
|
return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
|
||||||
|
|
||||||
class _kmeans_lsi_grid_sweep(grid_sweep):
|
|
||||||
def __init__(self,
|
|
||||||
inpath,
|
|
||||||
outpath,
|
|
||||||
lsi_dim,
|
|
||||||
*args,
|
|
||||||
**kwargs):
|
|
||||||
self.lsi_dim = lsi_dim
|
|
||||||
self.jobtype = kmeans_lsi_job
|
|
||||||
super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
|
|
||||||
|
|
||||||
def namer(self, *args, **kwargs):
|
|
||||||
s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
|
|
||||||
s += f"_lsi-{self.lsi_dim}"
|
|
||||||
return s
|
|
||||||
|
|
||||||
class kmeans_lsi_grid_sweep(lsi_grid_sweep):
|
|
||||||
def __init__(self,
|
|
||||||
inpath,
|
|
||||||
lsi_dims,
|
|
||||||
outpath,
|
|
||||||
n_clusters,
|
|
||||||
n_inits,
|
|
||||||
max_iters
|
|
||||||
):
|
|
||||||
|
|
||||||
super().__init__(kmeans_lsi_job,
|
|
||||||
_kmeans_lsi_grid_sweep,
|
|
||||||
inpath,
|
|
||||||
lsi_dims,
|
|
||||||
outpath,
|
|
||||||
n_clusters,
|
|
||||||
n_inits,
|
|
||||||
max_iters)
|
|
||||||
|
|
||||||
def test_select_kmeans_clustering():
|
def test_select_kmeans_clustering():
|
||||||
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
|
|
||||||
# "test_hdbscan_author30k",
|
|
||||||
# min_cluster_sizes=[2],
|
|
||||||
# min_samples=[1,2],
|
|
||||||
# cluster_selection_epsilons=[0,0.05,0.1,0.15],
|
|
||||||
# cluster_selection_methods=['eom','leaf'],
|
|
||||||
# lsi_dimensions='all')
|
|
||||||
inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
|
inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
|
||||||
outpath = "test_kmeans";
|
outpath = "test_kmeans";
|
||||||
n_clusters=[200,300,400];
|
n_clusters=[200,300,400];
|
||||||
@ -139,10 +76,30 @@ def test_select_kmeans_clustering():
|
|||||||
gs.run(20)
|
gs.run(20)
|
||||||
gs.save("test_hdbscan/lsi_sweep.csv")
|
gs.save("test_hdbscan/lsi_sweep.csv")
|
||||||
|
|
||||||
|
def run_kmeans_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000]):
|
||||||
|
"""Run kmeans clustering once or more with different parameters.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>
|
||||||
|
|
||||||
|
Keyword arguments:
|
||||||
|
savefile: path to save the metadata and diagnostics
|
||||||
|
inpath: path to feather data containing a labeled matrix of subreddit similarities.
|
||||||
|
outpath: path to output fit kmeans clusterings.
|
||||||
|
n_clusters: one or more numbers of kmeans clusters to select.
|
||||||
|
n_inits: one or more numbers of different initializations to use for each clustering.
|
||||||
|
max_iters: one or more different maximum iteration counts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
obj = kmeans_grid_sweep(inpath,
|
||||||
|
outpath,
|
||||||
|
map(int,n_clusters),
|
||||||
|
map(int,n_inits),
|
||||||
|
map(int,max_iters))
|
||||||
|
|
||||||
|
|
||||||
|
obj.run(1)
|
||||||
|
obj.save(savefile)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
fire.Fire(run_kmeans_grid_sweep)
|
||||||
fire.Fire{'grid_sweep':kmeans_grid_sweep,
|
|
||||||
'grid_sweep_lsi':kmeans_lsi_grid_sweep
|
|
||||||
'cluster':kmeans_job,
|
|
||||||
'cluster_lsi':kmeans_lsi_job}
|
|
||||||
|
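Beyond the grid entry point, the job classes can also be used one at a time. A hedged sketch (placeholder paths, hypothetical run name) of fitting a single kmeans_job:

from kmeans_clustering import kmeans_job

job = kmeans_job(infile="similarities/subreddit_comment_terms_10k.feather",  # placeholder
                 outpath="out/kmeans",                                       # placeholder
                 name="nclusters-500_nit-10_maxit-3000",
                 n_clusters=500, n_init=10, max_iter=3000)
result = job.get_info()   # fits the model and returns a kmeans_clustering_result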
clustering/kmeans_clustering_lsi.py (new file, 93 lines)
@ -0,0 +1,93 @@
|
|||||||
|
import fire
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from kmeans_clustering import kmeans_job, kmeans_clustering_result, kmeans_grid_sweep
|
||||||
|
from lsi_base import lsi_mixin, lsi_result_mixin, lsi_grid_sweep
|
||||||
|
from grid_sweep import grid_sweep
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class kmeans_lsi_job(kmeans_job, lsi_mixin):
|
||||||
|
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
|
||||||
|
super().__init__(infile,
|
||||||
|
outpath,
|
||||||
|
name,
|
||||||
|
*args,
|
||||||
|
**kwargs)
|
||||||
|
super().set_lsi_dims(lsi_dims)
|
||||||
|
|
||||||
|
def get_info(self):
|
||||||
|
result = super().get_info()
|
||||||
|
self.result = kmeans_clustering_result_lsi(**result.__dict__,
|
||||||
|
lsi_dimensions=self.lsi_dims)
|
||||||
|
return self.result
|
||||||
|
|
||||||
|
class _kmeans_lsi_grid_sweep(grid_sweep):
|
||||||
|
def __init__(self,
|
||||||
|
inpath,
|
||||||
|
outpath,
|
||||||
|
lsi_dim,
|
||||||
|
*args,
|
||||||
|
**kwargs):
|
||||||
|
print(args)
|
||||||
|
print(kwargs)
|
||||||
|
self.lsi_dim = lsi_dim
|
||||||
|
self.jobtype = kmeans_lsi_job
|
||||||
|
super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
|
||||||
|
|
||||||
|
def namer(self, *args, **kwargs):
|
||||||
|
s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
|
||||||
|
s += f"_lsi-{self.lsi_dim}"
|
||||||
|
return s
|
||||||
|
|
||||||
|
class kmeans_lsi_grid_sweep(lsi_grid_sweep):
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
inpath,
|
||||||
|
lsi_dims,
|
||||||
|
outpath,
|
||||||
|
n_clusters,
|
||||||
|
n_inits,
|
||||||
|
max_iters
|
||||||
|
):
|
||||||
|
|
||||||
|
super().__init__(kmeans_lsi_job,
|
||||||
|
_kmeans_lsi_grid_sweep,
|
||||||
|
inpath,
|
||||||
|
lsi_dims,
|
||||||
|
outpath,
|
||||||
|
n_clusters,
|
||||||
|
n_inits,
|
||||||
|
max_iters)
|
||||||
|
|
||||||
|
def run_kmeans_lsi_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000], lsi_dimensions="all"):
|
||||||
|
"""Run kmeans clustering once or more with different parameters.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
kmeans_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --lsi_dimensions=<"all"|csv number of LSI dimensions to use> --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>
|
||||||
|
|
||||||
|
Keyword arguments:
|
||||||
|
savefile: path to save the metadata and diagnostics
|
||||||
|
inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
|
||||||
|
outpath: path to output fit kmeans clusterings.
|
||||||
|
lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
|
||||||
|
n_clusters: one or more numbers of kmeans clusters to select.
|
||||||
|
n_inits: one or more numbers of different initializations to use for each clustering.
|
||||||
|
max_iters: one or more different maximum iteration counts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
obj = kmeans_lsi_grid_sweep(inpath,
|
||||||
|
lsi_dimensions,
|
||||||
|
outpath,
|
||||||
|
list(map(int,n_clusters)),
|
||||||
|
list(map(int,n_inits)),
|
||||||
|
list(map(int,max_iters))
|
||||||
|
)
|
||||||
|
|
||||||
|
obj.run(1)
|
||||||
|
obj.save(savefile)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
fire.Fire(run_kmeans_lsi_grid_sweep)
|
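Usage sketch for the kmeans LSI entry point, mirroring the Makefile's kmeans_selection_grid (output paths are placeholders).

from kmeans_clustering_lsi import run_kmeans_lsi_grid_sweep

run_kmeans_lsi_grid_sweep(
    savefile="out/kmeans_lsi/selection_data.csv",   # placeholder
    inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k_LSI",
    outpath="out/kmeans_lsi",                        # placeholder
    n_clusters=[100, 500, 1000, 1250, 1500, 1750, 2000],
    n_inits=[10],
    max_iters=[3000],
    lsi_dimensions="all")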
clustering/lsi_base.py (new file, 28 lines)
@ -0,0 +1,28 @@
|
|||||||
|
from clustering_base import clustering_job, clustering_result
|
||||||
|
from grid_sweep import grid_sweep
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from itertools import chain
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
class lsi_mixin():
|
||||||
|
def set_lsi_dims(self, lsi_dims):
|
||||||
|
self.lsi_dims = lsi_dims
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class lsi_result_mixin:
|
||||||
|
lsi_dimensions:int
|
||||||
|
|
||||||
|
class lsi_grid_sweep(grid_sweep):
|
||||||
|
def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs):
|
||||||
|
self.jobtype = jobtype
|
||||||
|
self.subsweep = subsweep
|
||||||
|
inpath = Path(inpath)
|
||||||
|
if lsi_dimensions == 'all':
|
||||||
|
lsi_paths = list(inpath.glob("*"))
|
||||||
|
else:
|
||||||
|
lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions]
|
||||||
|
|
||||||
|
lsi_nums = [p.stem for p in lsi_paths]
|
||||||
|
self.hasrun = False
|
||||||
|
self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
|
||||||
|
self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
|