update clustering scripts
parent 87ffaa6858
commit cf86c7492c
@@ -2,9 +2,9 @@
 srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
 similarity_data=/gscratch/comdata/output/reddit_similarity
 clustering_data=/gscratch/comdata/output/reddit_clustering
-kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]"
-hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf"
-affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]"
+kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
+hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
+affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]

 authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
 authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
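(Context for the bracketed grid values: the recipes below pass them straight to python-fire CLIs, and fire parses a bracketed flag value into a Python list. A minimal sketch of that behavior, assuming fire's default argument parsing; the demo.py name and sweep function are hypothetical:)

import fire

def sweep(min_cluster_sizes=[2], cluster_selection_methods=['eom']):
    # Invoked as: python3 demo.py --min_cluster_sizes=[2,3,4,5] --cluster_selection_methods=[eom,leaf]
    # fire delivers a list of ints and a list of strings here.
    print(min_cluster_sizes, cluster_selection_methods)

if __name__ == "__main__":
    fire.Fire(sweep)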
@@ -91,7 +91,11 @@ ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_inpu
+${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
+	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)

 ${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
 	$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2

+${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+	$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2

 clean_affinity:
 	rm -f ${authors_10k_output}/affinity/selection_data.csv
@@ -7,6 +7,7 @@ class grid_sweep:
     def __init__(self, jobtype, inpath, outpath, namer, *args):
         self.jobtype = jobtype
         self.namer = namer
+        print(*args)
         grid = list(product(*args))
         inpath = Path(inpath)
         outpath = Path(outpath)
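(For readers unfamiliar with the pattern: grid_sweep materializes one clustering job per combination of the parameter lists, via itertools.product. A toy illustration with hypothetical values:)

from itertools import product

min_cluster_sizes = [2, 3]
min_samples = [2, 3]
grid = list(product(min_cluster_sizes, min_samples))
# [(2, 2), (2, 3), (3, 2), (3, 3)] -- one job per parameter combination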
@@ -59,7 +59,7 @@ class _hdbscan_lsi_grid_sweep(grid_sweep):

         self.lsi_dim = lsi_dim
         self.jobtype = hdbscan_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)


     def namer(self, *args, **kwargs):
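(The [self.lsi_dim] wrapping matters because grid_sweep feeds its arguments to itertools.product, which requires every argument to be iterable; a bare int fails:)

from itertools import product

# product(850, [2, 3])         # TypeError: 'int' object is not iterable
list(product([850], [2, 3]))   # [(850, 2), (850, 3)] -- wrapping the scalar fixes it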
@@ -87,9 +87,9 @@ def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2]
     obj = hdbscan_lsi_grid_sweep(inpath,
                                  lsi_dimensions,
                                  outpath,
-                                 map(int,min_cluster_sizes),
-                                 map(int,min_samples),
-                                 map(float,cluster_selection_epsilons),
+                                 list(map(int,min_cluster_sizes)),
+                                 list(map(int,min_samples)),
+                                 list(map(float,cluster_selection_epsilons)),
                                  cluster_selection_methods
                                  )
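(The list(map(...)) change matters because a Python 3 map object is a one-shot iterator: anything that consumes it, such as the print(*args) added to grid_sweep.__init__ above, leaves it empty for the subsequent product() call. A minimal demonstration:)

sizes = map(int, ["2", "3"])
print(list(sizes))   # [2, 3]
print(list(sizes))   # [] -- the map iterator is exhausted after one pass
sizes = list(map(int, ["2", "3"]))   # a list can be iterated any number of times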
@@ -34,7 +34,7 @@ class _kmeans_lsi_grid_sweep(grid_sweep):
         print(kwargs)
         self.lsi_dim = lsi_dim
         self.jobtype = kmeans_lsi_job
-        super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs)
+        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)

     def namer(self, *args, **kwargs):
         s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
@@ -2,15 +2,15 @@ import fire
 import pandas as pd
 from pathlib import Path
 import shutil

-selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/affinity/selection_data.csv"
+selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv"

 outpath = 'test_best.feather'
 min_clusters=50; max_isolates=5000; min_cluster_size=2

 # pick the best clustering according to silhouette score subject to constraints
-def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
+def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
     df = pd.read_csv(selection_data,index_col=0)
-    df = df.sort_values("silhouette_score")
+    df = df.sort_values("silhouette_score",ascending=False)

     # not sure I fixed the bug underlying this fully or not.
     df['n_isolates_str'] = df.n_isolates.str.strip("[]")
@@ -18,11 +18,10 @@ def pick_best_clustering(selection_data, output, min_clusters, max_isolates):
     df.loc[df.n_isolates_0,'n_isolates'] = 0
     df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l))

-    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)].iloc[df.shape[1]]
+    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)].iloc[df.shape[1]]

     print(best_cluster.to_dict())
     best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")

     shutil.copy(best_path,output)

 if __name__ == "__main__":
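(With the new min_cluster_size parameter, a direct call mirroring the Makefile recipes above would look like this; the input and output paths are illustrative:)

pick_best_clustering("selection_data.csv", "best_clustering.feather",
                     min_clusters=50, max_isolates=5000, min_cluster_size=2)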
@@ -1,7 +1,38 @@
 import fire
 from select_affinity import select_affinity_clustering
 from select_kmeans import select_kmeans_clustering
+import pandas as pd
+import plotnine as pn
+from pathlib import Path
+from clustering.fit_tsne import fit_tsne
+from visualization.tsne_vis import build_visualization
+
+df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
+
+# plot silhouette_score as a function of isolates
+df = df.sort_values("silhouette_score")
+
+df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
+p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
+p.save("isolates_x_score.png")
+
+p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
+p.save("clusters_x_isolates.png")
+
+# the best result for hdbscan seems like this one: it has a decent number of clusters
+# I think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from omitted variables
+best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+best_leaf = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
+
+if not tsne_data.exists():
+    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
+             tsne_data)
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
+                    "./authors-tf_lsi850_best_eom.html")
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
+                    "./authors-tf_lsi850_best_leaf.html")

 if __name__ == "__main__":
     fire.Fire({"kmeans":select_kmeans_clustering,
                "affinity":select_affinity_clustering})
@@ -8,3 +8,9 @@ all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscrat
 /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
 	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum

+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather
+	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum
+
+/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
+	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum
@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum
-stop-all.sh
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
@@ -1,11 +1,12 @@
 import pandas as pd
 from pandas.core.groupby import DataFrameGroupBy as GroupBy
 from pathlib import Path
 import fire
 import numpy as np
 import sys
 sys.path.append("..")
+sys.path.append("../similarities")
-from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
+from similarities.similarities_helper import reindex_tfidf

 # this is the mean of the ratio of the overlap to the focal size.
 # mean shared membership per focal community member
@@ -13,10 +14,12 @@ from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_i

 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
     df = pd.read_feather(inpath)
-    df = df.drop('subreddit',1)
+    df = df.drop('_subreddit',1)
     np.fill_diagonal(df.values,0)
     df = agg(df, 0).reset_index()
     df = df.rename({0:'overlap_density'},axis='columns')
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     df.to_feather(outpath)
     return df
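(A toy illustration of what overlap_density computes, with hypothetical subreddit names and overlap values: zero the diagonal so a community's overlap with itself is excluded, then take column sums.)

import numpy as np
import pandas as pd

df = pd.DataFrame({'askreddit':   [0.0, 0.3, 0.1],
                   'python':      [0.3, 0.0, 0.5],
                   'learnpython': [0.1, 0.5, 0.0]})
np.fill_diagonal(df.values, 0)      # exclude self-overlap
density = pd.DataFrame.sum(df, 0)   # column sums: total overlap per community
print(density.rename('overlap_density'))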
@@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
     # exclude the diagonal
     df = df.loc[df.subreddit != df.variable]
     res = agg(df.groupby(['subreddit','week'])).reset_index()
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     res.to_feather(outpath)
     return res
@@ -1,4 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
-singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000
+singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py
 singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
@@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%"))
 df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))

 df = df.join(prop_nsfw,on='subreddit')
-df = df.filter(df.prop_nsfw < 0.5)
+#df = df.filter(df.prop_nsfw < 0.5)

 win = Window.orderBy(f.col('n_comments').desc())
 df = df.withColumn('comments_rank', f.rank().over(win))
@@ -26,4 +26,4 @@ df = df.toPandas()

 df = df.sort_values("n_comments")

-df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False)
+df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)