
Use Latent semantic indexing and hdbscan

Nate E TeBlunthuis
2021-05-02 23:39:55 -07:00
parent 36b24ee933
commit 7df8436067
14 changed files with 835 additions and 373 deletions
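For context: the LSI inputs this commit starts consuming (directories like subreddit_comment_authors-tf_30k_LSI/ holding one <dim>.feather per dimensionality) come from an upstream latent semantic indexing step that is not part of this diff. A minimal sketch of such a step, assuming a subreddit-by-term tf-idf matrix; the function name, paths, and dims are hypothetical, not this repo's pipeline:

# Hedged LSI sketch: truncated SVD embedding, then cosine similarities,
# written per dimensionality like the <dim>.feather files consumed below.
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

def write_lsi_similarities(tfidf_mat, subreddits, outdir, dims=[500, 1000]):
    svd = TruncatedSVD(n_components=max(dims), random_state=1968)
    emb = svd.fit_transform(tfidf_mat)  # one row per subreddit
    for dim in dims:
        sims = cosine_similarity(emb[:, :dim])
        df = pd.DataFrame(sims, columns=list(subreddits))
        df.insert(0, '_subreddit', list(subreddits))  # matches read_similarity_mat's convention
        df.to_feather(f"{outdir}/{dim}.feather")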

View File

@@ -2,20 +2,41 @@
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
similarity_data=/gscratch/comdata/output/reddit_similarity
clustering_data=/gscratch/comdata/output/reddit_clustering
selection_grid="--max_iter=3000 --convergence_iter=15,30,100 --damping=0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.97,0.99 --preference_quantile=0.1,0.3,0.5,0.7,0.9"
kmeans_selection_grid="--max_iter=3000 --n_init=[10] --n_clusters=[100,500,1000,1500,2000,2500,3000,2350,3500,3570,4000]"
#selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"

all:$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv
all:$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv

# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS

$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k $(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(selection_grid) -J 20

$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/kmeans $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(kmeans_selection_grid)

$(clustering_data)/subreddit_comment_terms_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv $(selection_grid) -J 20

$(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/kmeans $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(kmeans_selection_grid)

$(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(selection_grid) -J 20

$(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
	$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(kmeans_selection_grid)

affinity_selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"

$(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/affinity $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20

$(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/affinity $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20

$(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
	$(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/affinity $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20

clean:
	rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv
	rm -f $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv

.PHONY: clean

# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
#	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
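A note on the grid flags: selection.py hands its arguments to python-fire, which literal-evals flag values, so --n_clusters=[100,500] should arrive as a Python list and --damping=0.5,0.6 as a tuple. A toy sketch of that behavior; demo is a hypothetical stand-in for the selection subcommands, not a file in this repo:

# demo.py -- hypothetical; shows how fire parses the Makefile's grid flags.
import fire

def demo(n_clusters=[1000], n_init=[10], max_iter=3000):
    print(type(n_clusters), n_clusters)

if __name__ == "__main__":
    # python3 demo.py --n_clusters=[100,500] --n_init=[10]
    # expected: <class 'list'> [100, 500]
    fire.Fire(demo)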

View File

@@ -3,24 +3,23 @@
import sys
import pandas as pd
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AffinityPropagation, KMeans
import fire
from pathlib import Path
from multiprocessing import cpu_count
from dataclasses import dataclass
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat

def read_similarity_mat(similarities, use_threads=True):
    df = pd.read_feather(similarities, use_threads=use_threads)
    mat = np.array(df.drop('_subreddit', axis=1))
    n = mat.shape[0]
    mat[range(n), range(n)] = 1
    return (df._subreddit, mat)

def affinity_clustering(similarities, *args, **kwargs):
def affinity_clustering(similarities, output, *args, **kwargs):
    subreddits, mat = read_similarity_mat(similarities)
    return _affinity_clustering(mat, subreddits, *args, **kwargs)
    clustering = _affinity_clustering(mat, *args, **kwargs)
    cluster_data = process_clustering_result(clustering, subreddits)
    cluster_data['algorithm'] = 'affinity'
    return(cluster_data)

def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
    '''
    similarities: feather file with a dataframe of similarity scores
    similarities: matrix of similarity scores
    preference_quantile: parameter controlling how many clusters to make. Higher values = more clusters. 0.85 is a good value with 3000 subreddits.
    damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits-by-author data.
    '''
@@ -40,25 +39,32 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000,
                                     verbose=verbose,
                                     random_state=random_state).fit(mat)

    print(f"clustering took {clustering.n_iter_} iterations")
    clusters = clustering.labels_
    print(f"found {len(set(clusters))} clusters")
    cluster_data = pd.DataFrame({'subreddit': subreddits, 'cluster': clustering.labels_})
    cluster_sizes = cluster_data.groupby("cluster").count()
    print(f"the largest cluster has {cluster_sizes.subreddit.max()} members")
    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
    print(f"{(cluster_sizes.subreddit == 1).sum()} clusters have 1 member")
    sys.stdout.flush()
    cluster_data = process_clustering_result(clustering, subreddits)

    output = Path(output)
    output.parent.mkdir(parents=True, exist_ok=True)
    cluster_data.to_feather(output)
    print(f"saved {output}")
    return clustering

def kmeans_clustering(similarities, *args, **kwargs):
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    clustering = _kmeans_clustering(mat, *args, **kwargs)
    cluster_data = process_clustering_result(clustering, subreddits)
    return(cluster_data)

def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
    clustering = KMeans(n_clusters=n_clusters,
                        n_init=n_init,
                        max_iter=max_iter,
                        random_state=random_state,
                        verbose=verbose
                        ).fit(mat)
    return clustering

if __name__ == "__main__":
    fire.Fire(affinity_clustering)
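A note on the kmeans path above: sklearn's KMeans has no metric='precomputed' option, so fitting on sim_to_dist(mat) clusters each subreddit by its vector of distances to every other subreddit. A toy illustration of that behavior:

# Toy check: KMeans on a distance matrix treats each row as a feature vector.
import numpy as np
from sklearn.cluster import KMeans

sim = np.array([[1.0, 0.9, 0.1],
                [0.9, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
dist = 1 - sim
np.fill_diagonal(dist, 0)

labels = KMeans(n_clusters=2, n_init=10, random_state=1968).fit(dist).labels_
print(labels)  # the two mutually-similar rows should share a label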

View File

@@ -0,0 +1,49 @@
from pathlib import Path
import numpy as np
import pandas as pd
from dataclasses import dataclass

def sim_to_dist(mat):
    dist = 1-mat
    dist[dist < 0] = 0
    np.fill_diagonal(dist, 0)
    return dist

def process_clustering_result(clustering, subreddits):
    if hasattr(clustering, 'n_iter_'):
        print(f"clustering took {clustering.n_iter_} iterations")

    clusters = clustering.labels_
    print(f"found {len(set(clusters))} clusters")

    cluster_data = pd.DataFrame({'subreddit': subreddits, 'cluster': clustering.labels_})

    cluster_sizes = cluster_data.groupby("cluster").count().reset_index()
    print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster != -1].subreddit.max()} members")
    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
    print(f"{(cluster_sizes.subreddit == 1).sum()} clusters have 1 member")
    print(f"{cluster_sizes.loc[cluster_sizes.cluster == -1, 'subreddit'].sum()} subreddits are in cluster -1", flush=True)

    return cluster_data

@dataclass
class clustering_result:
    outpath: Path
    max_iter: int
    silhouette_score: float
    alt_silhouette_score: float
    name: str
    n_clusters: int

def read_similarity_mat(similarities, use_threads=True):
    df = pd.read_feather(similarities, use_threads=use_threads)
    mat = np.array(df.drop('_subreddit', axis=1))
    n = mat.shape[0]
    mat[range(n), range(n)] = 1
    return (df._subreddit, mat)
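A small usage sketch for these helpers; the mock object stands in for any fitted clusterer exposing labels_, with -1 marking HDBSCAN-style isolates:

# Minimal sketch exercising sim_to_dist and process_clustering_result.
import numpy as np
import pandas as pd
from types import SimpleNamespace
from clustering_base import sim_to_dist, process_clustering_result

sim = np.array([[1.0, 0.8, 0.0],
                [0.8, 1.0, 0.1],
                [0.0, 0.1, 1.0]])
dist = sim_to_dist(sim)  # 1 - sim, negatives clipped, zero diagonal

mock = SimpleNamespace(labels_=np.array([0, 0, -1]))  # hypothetical fitted clusterer
cluster_data = process_clustering_result(mock, pd.Series(['a', 'b', 'c']))
print(cluster_data)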

View File

@@ -0,0 +1,172 @@
from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
from itertools import product, starmap
import pandas as pd
from sklearn.metrics import silhouette_score, silhouette_samples
from pathlib import Path
from multiprocessing import Pool, cpu_count
import fire
from pyarrow.feather import write_feather

def test_select_hdbscan_clustering():
    select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
                              "test_hdbscan_author30k",
                              min_cluster_sizes=[2],
                              min_samples=[1,2],
                              cluster_selection_epsilons=[0,0.05,0.1,0.15],
                              cluster_selection_methods=['eom','leaf'],
                              lsi_dimensions='all')

# scratch defaults for interactive testing
inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI"
outpath = "test_hdbscan"
min_cluster_sizes = [2,3,4]
min_samples = [1,2,3]
cluster_selection_epsilons = [0,0.1,0.3,0.5]
cluster_selection_methods = ['eom']
lsi_dimensions = 'all'

@dataclass
class hdbscan_clustering_result(clustering_result):
    min_cluster_size: int
    min_samples: int
    cluster_selection_epsilon: float
    cluster_selection_method: str
    lsi_dimensions: int
    n_isolates: int
    silhouette_samples: str

def select_hdbscan_clustering(inpath,
                              outpath,
                              outfile=None,
                              min_cluster_sizes=[2],
                              min_samples=[1],
                              cluster_selection_epsilons=[0],
                              cluster_selection_methods=['eom'],
                              lsi_dimensions='all'
                              ):

    inpath = Path(inpath)
    outpath = Path(outpath)
    outpath.mkdir(exist_ok=True, parents=True)

    if lsi_dimensions == 'all':
        lsi_paths = list(inpath.glob("*"))
    else:
        lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]

    lsi_nums = [p.stem for p in lsi_paths]
    grid = list(product(lsi_nums,
                        min_cluster_sizes,
                        min_samples,
                        cluster_selection_epsilons,
                        cluster_selection_methods))

    # fix the output file names
    names = list(map(lambda t: '_'.join(map(str, t)), grid))
    grid = [(inpath/(str(t[0])+'.feather'), outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)]

    with Pool(int(cpu_count()/4)) as pool:
        mods = pool.starmap(hdbscan_clustering, grid)

    res = pd.DataFrame(mods)
    if outfile is None:
        outfile = outpath / "selection_data.csv"

    res.to_csv(outfile)

def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    clustering = _hdbscan_clustering(mat,
                                     min_cluster_size=min_cluster_size,
                                     min_samples=min_samples,
                                     cluster_selection_epsilon=cluster_selection_epsilon,
                                     cluster_selection_method=cluster_selection_method,
                                     metric='precomputed',
                                     core_dist_n_jobs=cpu_count()
                                     )

    cluster_data = process_clustering_result(clustering, subreddits)
    isolates = clustering.labels_ == -1
    scoremat = mat[~isolates][:, ~isolates]
    score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed')
    cluster_data.to_feather(output)

    silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed')
    silhouette_samp = pd.DataFrame({'subreddit': subreddits, 'score': silhouette_samp})
    silsampout = output.parent / ("silhouette_samples" + output.name)
    silhouette_samp.to_feather(silsampout)

    result = hdbscan_clustering_result(outpath=output,
                                       max_iter=None,
                                       silhouette_samples=silsampout,
                                       silhouette_score=score,
                                       alt_silhouette_score=score,
                                       name=name,
                                       min_cluster_size=min_cluster_size,
                                       min_samples=min_samples,
                                       cluster_selection_epsilon=cluster_selection_epsilon,
                                       cluster_selection_method=cluster_selection_method,
                                       lsi_dimensions=lsi_dim,
                                       n_isolates=isolates.sum(),
                                       n_clusters=len(set(clustering.labels_))
                                       )

    return(result)

# for all runs we should try cluster_selection_epsilon = None
# for terms we should try cluster_selection_epsilon around 0.56-0.66
# for authors we should try cluster_selection_epsilon around 0.98-0.99

def _hdbscan_clustering(mat, *args, **kwargs):
    print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
    print(mat)
    clusterer = hdbscan.HDBSCAN(*args,
                                **kwargs,
                                )
    clustering = clusterer.fit(mat.astype('double'))
    return(clustering)

def KNN_distances_plot(mat, outname, k=2):
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:, -1]
    df = pd.DataFrame({'dist': d2})
    df = df.sort_values("dist", ascending=False)
    df['idx'] = np.arange(0, d2.shape[0]) + 1
    p = pn.qplot(x='idx', y='dist', data=df, geom='line') + pn.scales.scale_y_continuous(minor_breaks=np.arange(0, 50)/50,
                                                                                         breaks=np.arange(0, 10)/10)
    p.save(outname, width=16, height=10)

def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='terms_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat, k=2, outname='authors-tf_knn_dist2.png')

if __name__ == "__main__":
    df = pd.read_csv("test_hdbscan/selection_data.csv")
    test_select_hdbscan_clustering()
    check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
    silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
    c = check_clusters.merge(silscores, on='subreddit')
#    fire.Fire(select_hdbscan_clustering)
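The core of the file above, in miniature: HDBSCAN over a precomputed distance matrix, then a silhouette score restricted to non-noise points (label != -1). A toy sketch, assuming two well-separated point groups:

# Toy sketch of the hdbscan path: precomputed distances in, labels out,
# silhouette computed only over the non-isolate rows/columns.
import hdbscan
import numpy as np
from sklearn.metrics import silhouette_score

pts = np.vstack([np.random.default_rng(0).normal(0, 0.05, (5, 2)),
                 np.random.default_rng(1).normal(3, 0.05, (5, 2))])
dist = np.sqrt(((pts[:, None, :] - pts[None, :, :]) ** 2).sum(-1))

clustering = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1,
                             metric='precomputed').fit(dist.astype('double'))
isolates = clustering.labels_ == -1
if (~isolates).sum() > 2 and len(set(clustering.labels_[~isolates])) > 1:
    print(silhouette_score(dist[~isolates][:, ~isolates],
                           clustering.labels_[~isolates], metric='precomputed'))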

View File

@@ -1,8 +1,8 @@
from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation
from functools import partial
from clustering import _affinity_clustering, read_similarity_mat
from dataclasses import dataclass
from clustering import _affinity_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result
from multiprocessing import Pool, cpu_count, Array, Process
from pathlib import Path
from itertools import product, starmap
@@ -12,40 +12,69 @@ import fire
import sys

# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.

@dataclass
class clustering_result:
    outpath: Path
class affinity_clustering_result(clustering_result):
    damping: float
    max_iter: int
    convergence_iter: int
    preference_quantile: float
    silhouette_score: float
    alt_silhouette_score: float
    name: str

def sim_to_dist(mat):
    dist = 1-mat
    dist[dist < 0] = 0
    np.fill_diagonal(dist, 0)
    return dist

def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    if name is None:
        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    outpath.parent.mkdir(parents=True, exist_ok=True)
    print(outpath)
    clustering = _affinity_clustering(mat, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
    cluster_data = process_clustering_result(clustering, subreddits)
    mat = sim_to_dist(clustering.affinity_matrix_)

    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        try:
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    res = affinity_clustering_result(outpath=outpath,
                                     damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     preference_quantile=preference_quantile,
                                     silhouette_score=score,
                                     alt_silhouette_score=alt_score,
                                     name=str(name),
                                     n_clusters=len(set(clustering.labels_)))

    return res

def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    if name is None:
        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    outpath.parent.mkdir(parents=True, exist_ok=True)
    print(outpath)
    clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
    mat = sim_to_dist(clustering.affinity_matrix_)
    score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed')
        try:
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    res = clustering_result(outpath=outpath,
                            damping=damping,
@@ -58,6 +87,7 @@ def do_clustering(damping, convergence_iter, preference_quantile, name, mat, sub
    return res

# alt similarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).

def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None):
@@ -86,7 +116,7 @@ def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max
    hyper_grid = product(damping, convergence_iter, preference_quantile)
    hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))

    _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)
    _do_clustering = partial(do_affinity_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)

    # similarities = Array('d', mat)
    # call pool.starmap
@@ -94,6 +124,7 @@ def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max
        clustering_data = pool.starmap(_do_clustering, hyper_grid)

    clustering_data = pd.DataFrame(list(clustering_data))
    clustering_data.to_csv(outinfo)
    return clustering_data
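The grid construction above is worth unpacking: product() enumerates the hyperparameter combinations, enumerate() appends a stringified index as the run name, and starmap unpacks each tuple positionally into the partial-bound worker. A toy version with a hypothetical worker:

# Sketch of the hyperparameter enumeration pattern used above.
from itertools import product, starmap
from functools import partial

def run(damping, convergence_iter, preference_quantile, name, outdir="demo"):  # hypothetical worker
    return f"{outdir}/{name}: damping={damping}, ci={convergence_iter}, pq={preference_quantile}"

hyper_grid = product([0.5, 0.9], [15, 30], [0.5])
hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))
print(list(starmap(partial(run, outdir="affinity"), hyper_grid)))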

View File

@@ -0,0 +1,92 @@
from sklearn.metrics import silhouette_score
from sklearn.cluster import AffinityPropagation
from functools import partial
from clustering import _kmeans_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result
from dataclasses import dataclass
from multiprocessing import Pool, cpu_count, Array, Process
from pathlib import Path
from itertools import product, starmap
import numpy as np
import pandas as pd
import fire
import sys
@dataclass
class kmeans_clustering_result(clustering_result):
    n_clusters: int
    n_init: int

# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.

def do_clustering(n_clusters, n_init, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
    if name is None:
        name = f"nclusters-{n_clusters}_ninit-{n_init}"
    print(name)
    sys.stdout.flush()
    outpath = outdir / (str(name) + ".feather")
    print(outpath)
    mat = sim_to_dist(mat)
    clustering = _kmeans_clustering(mat, outpath, n_clusters, n_init, max_iter, random_state, verbose)

    outpath.parent.mkdir(parents=True, exist_ok=True)
    cluster_data = process_clustering_result(clustering, subreddits)
    cluster_data.to_feather(outpath)

    try:
        score = silhouette_score(mat, clustering.labels_, metric='precomputed')
    except ValueError:
        score = None

    alt_score = None
    if alt_mat is not None:
        alt_distances = sim_to_dist(alt_mat)
        try:
            alt_score = silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
        except ValueError:
            alt_score = None

    res = kmeans_clustering_result(outpath=outpath,
                                   max_iter=max_iter,
                                   n_clusters=n_clusters,
                                   n_init=n_init,
                                   silhouette_score=score,
                                   alt_silhouette_score=alt_score,
                                   name=str(name))

    return res

# alt similarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).

def select_kmeans_clustering(similarities, outdir, outinfo, n_clusters=[1000], max_iter=100000, n_init=[10], random_state=1968, verbose=True, alt_similarities=None):

    n_clusters = list(map(int, n_clusters))
    n_init = list(map(int, n_init))

    if type(outdir) is str:
        outdir = Path(outdir)

    outdir.mkdir(parents=True, exist_ok=True)

    subreddits, mat = read_similarity_mat(similarities, use_threads=True)

    if alt_similarities is not None:
        _, alt_mat = read_similarity_mat(alt_similarities, use_threads=True)
    else:
        alt_mat = None

    # get list of tuples: the combinations of hyperparameters
    hyper_grid = product(n_clusters, n_init)
    hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))

    _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)

    # call starmap
    print("running clustering selection")
    clustering_data = starmap(_do_clustering, hyper_grid)
    clustering_data = pd.DataFrame(list(clustering_data))
    clustering_data.to_csv(outinfo)
    return clustering_data

if __name__ == "__main__":
    x = fire.Fire(select_kmeans_clustering)
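Calling the selector programmatically, bypassing fire, might look like this; the paths are placeholders, not real outputs:

# Hypothetical direct invocation of select_kmeans_clustering.
select_kmeans_clustering(
    similarities="subreddit_comment_authors_10k.feather",  # placeholder path
    outdir="kmeans_out",
    outinfo="kmeans_out/selection_data.csv",
    n_clusters=[100, 500],
    n_init=[10],
)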

clustering/selection.py Normal file
View File

@@ -0,0 +1,7 @@
import fire
from select_affinity import select_affinity_clustering
from select_kmeans import select_kmeans_clustering

if __name__ == "__main__":
    fire.Fire({"kmeans": select_kmeans_clustering,
               "affinity": select_affinity_clustering})