13
0

bugfixes in clustering selection.

This commit is contained in:
Nate E TeBlunthuis 2021-04-21 16:56:25 -07:00
parent ac06a8757a
commit 37dd0ef55f
3 changed files with 39 additions and 22 deletions

View File

@ -2,26 +2,29 @@
srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
similarity_data=/gscratch/comdata/output/reddit_similarity similarity_data=/gscratch/comdata/output/reddit_similarity
clustering_data=/gscratch/comdata/output/reddit_clustering clustering_data=/gscratch/comdata/output/reddit_clustering
selection_grid="--max_iter=10000 --convergence_iter=15,30,100 --preference_quantile=0.85 --damping=0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.97,0.99, --preference_quantile=0.1,0.3,0.5,0.7,0.9" selection_grid="--max_iter=3000 --convergence_iter=15,30,100 --damping=0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.97,0.99, --preference_quantile=0.1,0.3,0.5,0.7,0.9"
all:$(clustering_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_authors-tf_similarities_30k.feather $(clustering_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_authors-tf_similarities_10k.feather $(clustering_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_10k.feather #selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"
all:$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
$(clustering_data)/subreddit_comment_authors_10k.feather:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py $(clustering_data)/subreddit_comment_authors_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k $(selection_grid) -J 20 $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k $(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(selection_grid) -J 20
$(clustering_data)/subreddit_comment_terms_10k.feather:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k $(selection_grid) -J 20 $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv $(selection_grid) -J 20
$(clustering_data)/subreddit_authors-tf_similarities_10k.feather:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k $(selection_grid) -J 20 $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(selection_grid) -J 20
$(clustering_data)/subreddit_comment_authors_30k.feather:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py # $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 # $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
$(clustering_data)/subreddit_comment_terms_30k.feather:selection.py $(similarity_data)/subreddit_comment_terms_30k.feather clustering.py # $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_terms_30k.feather clustering.py
$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10 # $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
$(clustering_data)/subreddit_authors-tf_similarities_30k.feather:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather # $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather
$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8 # $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8 && touch $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_authors_100k.feather:clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather # $(clustering_data)/subreddit_comment_authors_100k.feather:clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather

View File

@ -24,7 +24,7 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000,
preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits. preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits.
damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits by author. damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits by author.
''' '''
print(f"damping:{damping}; convergenceIter:{convergence_iter}; preferenceQuantile:{preference_quantilne}") print(f"damping:{damping}; convergenceIter:{convergence_iter}; preferenceQuantile:{preference_quantile}")
preference = np.quantile(mat,preference_quantile) preference = np.quantile(mat,preference_quantile)

View File

@ -6,6 +6,7 @@ from dataclasses import dataclass
from multiprocessing import Pool, cpu_count, Array, Process from multiprocessing import Pool, cpu_count, Array, Process
from pathlib import Path from pathlib import Path
from itertools import product, starmap from itertools import product, starmap
import numpy as np
import pandas as pd import pandas as pd
import fire import fire
import sys import sys
@ -23,16 +24,28 @@ class clustering_result:
alt_silhouette_score:float alt_silhouette_score:float
name:str name:str
def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat):
def sim_to_dist(mat):
dist = 1-mat
dist[dist < 0] = 0
np.fill_diagonal(dist,0)
return dist
def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
if name is None: if name is None:
name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{convergence_iter}" name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
print(name) print(name)
sys.stdout.flush() sys.stdout.flush()
outpath = outdir / (str(name) + ".feather") outpath = outdir / (str(name) + ".feather")
print(outpath) print(outpath)
clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose) clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
score = silhouette_score(clustering.affinity_matrix_, clustering.labels_, metric='precomputed') mat = sim_to_dist(clustering.affinity_matrix_)
alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed')
score = silhouette_score(mat, clustering.labels_, metric='precomputed')
if alt_mat is not None:
alt_distances = sim_to_dist(alt_mat)
alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed')
res = clustering_result(outpath=outpath, res = clustering_result(outpath=outpath,
damping=damping, damping=damping,
@ -47,7 +60,7 @@ def do_clustering(damping, convergence_iter, preference_quantile, name, mat, sub
# alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering). # alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).
def select_affinity_clustering(similarities, outdir, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None): def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None):
damping = list(map(float,damping)) damping = list(map(float,damping))
convergence_iter = convergence_iter = list(map(int,convergence_iter)) convergence_iter = convergence_iter = list(map(int,convergence_iter))
@ -80,8 +93,9 @@ def select_affinity_clustering(similarities, outdir, damping=[0.9], max_iter=100
print("running clustering selection") print("running clustering selection")
clustering_data = pool.starmap(_do_clustering, hyper_grid) clustering_data = pool.starmap(_do_clustering, hyper_grid)
clustering_data = pd.DataFrame(list(clustering_data)) clustering_data = pd.DataFrame(list(clustering_data))
clustering_data.to_csv(outinfo)
return clustering_data return clustering_data
if __name__ == "__main__": if __name__ == "__main__":
fire.Fire(select_affinity_clustering) x = fire.Fire(select_affinity_clustering)