2023-05-24 00:18:19 +00:00
|
|
|
# Slurm launcher prefix used by the heavy recipes. The value ends in
# `/bin/bash -c`, so whatever a recipe writes after the expansion is handed to
# that bash invocation inside the srun allocation.
srun_singularity = srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40 /bin/bash -c

# Root under which the similarity data (clustering input) lives, relative to
# the directory make is run from.
similarity_data = ../../data/reddit_similarity

# Root under which clustering results are written, relative to the directory
# make is run from.
clustering_data = ../../data/reddit_clustering
|
2021-08-03 21:55:02 +00:00
|
|
|
# Parameter grids for the selection runs. Each value is appended verbatim to
# the command line of the corresponding *_clustering_lsi.py script.
kmeans_selection_grid = --max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]

hdbscan_selection_grid = --min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]

affinity_selection_grid = --dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
|
2020-12-25 06:38:04 +00:00
|
|
|
|
2021-05-10 20:46:49 +00:00
|
|
|
# Path handed to the clustering scripts as --inpath.
authors_tf_10k_input_lsi = ${similarity_data}/subreddit_comment_authors-tf_10k_LSI

# Base path for the clustering results; each algorithm writes into its own
# sub-directory (kmeans/, hdbscan/, affinity/) below it.
authors_tf_10k_output_lsi = ${clustering_data}/subreddit_comment_authors-tf_10k_LSI
|
2021-04-20 18:33:54 +00:00
|
|
|
|
2023-05-24 00:18:19 +00:00
|
|
|
# Top-level entry point; it only pulls in the authors_tf_10k LSI clustering runs.
all: authors_tf_10k_lsi
|
2021-05-10 20:46:49 +00:00
|
|
|
|
|
|
|
# Aggregate target: the selection tables produced by the three clustering
# algorithms (k-means, HDBSCAN, affinity propagation) on the LSI data.
authors_tf_10k_lsi: \
    $(authors_tf_10k_output_lsi)/kmeans/selection_data.csv \
    $(authors_tf_10k_output_lsi)/hdbscan/selection_data.csv \
    $(authors_tf_10k_output_lsi)/affinity/selection_data.csv
|
|
|
|
|
|
|
|
## LSI Models
|
|
|
|
# k-means selection run over $(kmeans_selection_grid); the recipe asks the
# script to save its table to the target path ($@) and to use the target's
# directory ($(@D)) as --outpath.
# kmeans_clustering_lsi.py is the script the recipe actually executes, so it is
# listed as a prerequisite: editing it must invalidate the selection table.
${authors_tf_10k_output_lsi}/kmeans/selection_data.csv: clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py kmeans_clustering_lsi.py
	$(srun_singularity) -c "source ~/.bashrc; python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=$(@D) --savefile=$@ $(kmeans_selection_grid)"
|
2021-05-10 20:46:49 +00:00
|
|
|
|
|
|
|
# Affinity-propagation selection run over $(affinity_selection_grid); the
# recipe asks the script to save its table to the target path ($@) and to use
# the target's directory ($(@D)) as --outpath.
# affinity_clustering_lsi.py is the script the recipe actually executes, so it
# is listed as a prerequisite: editing it must invalidate the selection table.
${authors_tf_10k_output_lsi}/affinity/selection_data.csv: clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py affinity_clustering_lsi.py
	$(srun_singularity) -c "source ~/.bashrc; python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=$(@D) --savefile=$@ $(affinity_selection_grid)"
|
2021-05-10 20:46:49 +00:00
|
|
|
|
|
|
|
# HDBSCAN selection run over $(hdbscan_selection_grid); the recipe asks the
# script to save its table to the target path ($@) and to use the target's
# directory ($(@D)) as --outpath.
# hdbscan_clustering_lsi.py is the script the recipe actually executes, so it
# is listed as a prerequisite: editing it must invalidate the selection table.
${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv: clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py hdbscan_clustering_lsi.py
	$(srun_singularity) -c "source ~/.bashrc; python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=$(@D) --savefile=$@ $(hdbscan_selection_grid)"
|
2021-05-10 20:46:49 +00:00
|
|
|
|
2021-08-03 21:55:02 +00:00
|
|
|
# Runs pick_best_clustering.py on the HDBSCAN selection table with the fixed
# thresholds given in the recipe. The selection table must remain the first
# prerequisite because the recipe passes it as $<; the output path is $@.
$(authors_tf_10k_output_lsi)/best_hdbscan.feather: \
    $(authors_tf_10k_output_lsi)/hdbscan/selection_data.csv \
    pick_best_clustering.py
	$(srun_singularity) -c "source ~/.bashrc; python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2"
|
2022-06-09 00:01:27 +00:00
|
|
|
|
2023-05-24 00:18:19 +00:00
|
|
|
# The LSI input is produced by the sibling ../similarities build, so delegate
# to its makefile. The rule has no prerequisites, so make only runs it when
# the input path does not exist yet.
$(authors_tf_10k_input_lsi):
	$(MAKE) --directory=../similarities
|
2022-06-09 00:01:27 +00:00
|
|
|
|
2023-05-24 00:18:19 +00:00
|
|
|
# Remove the three selection tables so the next build recomputes them.
# Other files under the output directories are left alone.
clean:
	rm -f $(authors_tf_10k_output_lsi)/affinity/selection_data.csv \
	      $(authors_tf_10k_output_lsi)/kmeans/selection_data.csv \
	      $(authors_tf_10k_output_lsi)/hdbscan/selection_data.csv
|
|
|
|
|
2023-05-24 00:18:19 +00:00
|
|
|
# Targets that name actions or groups rather than files. The special target is
# spelled with a leading dot; without it make sees an ordinary target called
# "PHONY" and a stray file named `clean` or `all` would make those goals look
# up to date.
.PHONY: all authors_tf_10k_lsi clean
|