From a013f6718bf221fd870fcaff36d6724d736a0766 Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Wed, 24 Mar 2021 17:18:30 -0700 Subject: [PATCH 01/22] export timeseries functions --- __init__.py | 2 ++ timeseries/__init__.py | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 __init__.py create mode 100644 timeseries/__init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..dbb8061 --- /dev/null +++ b/__init__.py @@ -0,0 +1,2 @@ +from .timeseries import load_clusters, load_densities, build_cluster_timeseries + diff --git a/timeseries/__init__.py b/timeseries/__init__.py new file mode 100644 index 0000000..c023c66 --- /dev/null +++ b/timeseries/__init__.py @@ -0,0 +1,2 @@ +from .choose_clusters import load_clusters, load_densities +from .cluster_timeseries import build_cluster_timeseries From 36b24ee933b95424686cfeaa2b2bd9776f23f853 Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Fri, 30 Apr 2021 12:48:19 -0700 Subject: [PATCH 02/22] reindex tfidf in memory instead of using spark --- .../{selection.py => select_affinity.py} | 0 similarities/similarities_helper.py | 103 +++++++++++++----- 2 files changed, 78 insertions(+), 25 deletions(-) rename clustering/{selection.py => select_affinity.py} (100%) diff --git a/clustering/selection.py b/clustering/select_affinity.py similarity index 100% rename from clustering/selection.py rename to clustering/select_affinity.py diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index 57a36ca..fd532a9 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -6,7 +6,9 @@ from pyspark.mllib.linalg.distributed import CoordinateMatrix from tempfile import TemporaryDirectory import pyarrow import pyarrow.dataset as ds +from sklearn.metrics import pairwise_distances from scipy.sparse import csr_matrix, issparse +from sklearn.decomposition import TruncatedSVD import pandas as pd import numpy as np import pathlib @@ -17,7 +19,8 @@ class tf_weight(Enum): MaxTF = 1 Norm05 = 2 -infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet" +infile = "/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet" +cache_file = "/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet" def reindex_tfidf_time_interval(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): term = term_colname @@ -50,30 +53,57 @@ def reindex_tfidf_time_interval(infile, term_colname, min_df=None, max_df=None, subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1 return(tempdir, subreddit_names) -def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False): +# subreddits missing after this step don't have any terms that have a high enough idf +def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, tf_family=tf_weight.MaxTF): spark = SparkSession.builder.getOrCreate() conf = spark.sparkContext.getConf() print(exclude_phrases) - tfidf = spark.read.parquet(infile) + tfidf_ds = ds.dataset(infile) if included_subreddits is None: included_subreddits = select_topN_subreddits(topN) else: included_subreddits = set(open(included_subreddits)) - if exclude_phrases == True: - tfidf = tfidf.filter(~f.col(term_colname).contains("_")) + ds_filter = ds.field("subreddit").isin(included_subreddits) - 
print("creating temporary parquet with matrix indicies") - tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, max_df, included_subreddits) + if min_df is not None: + ds_filter &= ds.field("count") >= min_df - tfidf = spark.read.parquet(tempdir.name) - subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas() + if max_df is not None: + ds_filter &= ds.field("count") <= max_df + + term = term_colname + term_id = term + '_id' + term_id_new = term + '_id_new' + + df = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id',term_id,'relative_tf']).to_pandas() + + sub_ids = df.subreddit_id.drop_duplicates() + new_sub_ids = pd.DataFrame({'subreddit_id':old,'subreddit_id_new':new} for new, old in enumerate(sorted(sub_ids))) + df = df.merge(new_sub_ids,on='subreddit_id',how='inner',validate='many_to_one') + + new_count = df.groupby(term_id)[term_id].aggregate(new_count='count').reset_index() + df = df.merge(new_count,on=term_id,how='inner',validate='many_to_one') + + term_ids = df[term_id].drop_duplicates() + new_term_ids = pd.DataFrame({term_id:old,term_id_new:new} for new, old in enumerate(sorted(term_ids))) + + df = df.merge(new_term_ids, on=term_id, validate='many_to_one') + N_docs = sub_ids.shape[0] + + df['idf'] = np.log(N_docs/(1+df.new_count)) + 1 + + # agg terms by subreddit to make sparse tf/df vectors + if tf_family == tf_weight.MaxTF: + df["tf_idf"] = df.relative_tf * df.idf + else: # tf_fam = tf_weight.Norm05 + df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf + + subreddit_names = df.loc[:,['subreddit','subreddit_id_new']].drop_duplicates() subreddit_names = subreddit_names.sort_values("subreddit_id_new") - subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1 - spark.stop() - return (tempdir, subreddit_names) + return(df, subreddit_names) def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): @@ -82,13 +112,15 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non ''' if from_date is not None or to_date is not None: tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date) - + mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname) else: - tempdir, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False) + entries, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False) + mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1))) + + print("loading matrix") - print("loading matrix") # mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname) - mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname) + print(f'computing similarities on mat. 
mat.shape:{mat.shape}') print(f"size of mat is:{mat.data.nbytes}") sims = simfunc(mat) @@ -101,7 +133,7 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non print(f"len(subreddit_names.subreddit.values):{len(subreddit_names.subreddit.values)}") sims = pd.DataFrame(sims) sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1) - sims['subreddit'] = subreddit_names.subreddit.values + sims['_subreddit'] = subreddit_names.subreddit.values p = Path(outfile) @@ -110,7 +142,7 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet")) sims.to_feather(outfile) - tempdir.cleanup() +# tempdir.cleanup() def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'): term = term_colname @@ -135,10 +167,10 @@ def write_weekly_similarities(path, sims, week, names): sims['week'] = week p = pathlib.Path(path) if not p.is_dir(): - p.mkdir() + p.mkdir(exist_ok=True,parents=True) # reformat as a pairwise list - sims = sims.melt(id_vars=['subreddit','week'],value_vars=names.subreddit.values) + sims = sims.melt(id_vars=['_subreddit','week'],value_vars=names.subreddit.values) sims.to_parquet(p / week.isoformat()) def column_overlaps(mat): @@ -150,11 +182,29 @@ def column_overlaps(mat): return intersection / den +# n_components is the latent dimensionality. sklearn recommends 100. More might be better +# if algorithm is 'random' instead of 'arpack' then n_iter gives the number of iterations. +# this function takes the svd and then the column similarities of it +def lsi_column_similarities(tfidfmat,n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): + # first compute the lsi of the matrix + # then take the column similarities + svd = TruncatedSVD(n_components=n_components,random_state=random_state,algorithm='arpack') + mod = svd.fit(tfidfmat.T) + lsimat = mod.transform(tfidfmat.T) + sims = column_similarities(lsimat) + return sims + + def column_similarities(mat): - norm = np.matrix(np.power(mat.power(2).sum(axis=0),0.5,dtype=np.float32)) - mat = mat.multiply(1/norm) - sims = mat.T @ mat - return(sims) + return 1 - pairwise_distances(mat,metric='cosine') + # if issparse(mat): + # norm = np.matrix(np.power(mat.power(2).sum(axis=0),0.5,dtype=np.float32)) + # mat = mat.multiply(1/norm) + # else: + # norm = np.matrix(np.power(np.power(mat,2).sum(axis=0),0.5,dtype=np.float32)) + # mat = np.multiply(mat,1/norm) + # sims = mat.T @ mat + # return(sims) def prep_tfidf_entries_weekly(tfidf, term_colname, min_df, max_df, included_subreddits): @@ -202,7 +252,8 @@ def prep_tfidf_entries(tfidf, term_colname, min_df, max_df, included_subreddits) if min_df is None: min_df = 0.1 * len(included_subreddits) - tfidf = tfidf.filter(f.col('count') >= min_df) + + tfidf = tfidf.filter(f.col('count') >= min_df) if max_df is not None: tfidf = tfidf.filter(f.col('count') <= max_df) @@ -392,3 +443,5 @@ def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarit rankdf = pd.read_csv(path) included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values) return included_subreddits + + From 7df8436067dba9a9e6867424002d01593e4bcd25 Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Sun, 2 May 2021 23:39:55 -0700 Subject: [PATCH 03/22] Use Latent semantic indexing and hdbscan --- clustering/Makefile | 37 +- clustering/clustering.py | 60 +-- clustering/clustering_base.py | 49 +++ clustering/hdbscan_clustering.py | 
172 +++++++++ clustering/select_affinity.py | 69 +++- clustering/select_kmeans.py | 92 +++++ clustering/selection.py | 7 + similarities/Makefile | 133 ++++++- similarities/cosine_similarities.py | 3 +- similarities/job_script.sh | 4 +- similarities/lsi_similarities.py | 61 +++ similarities/similarities_helper.py | 426 +++++++++------------ similarities/tfidf.py | 5 +- similarities/weekly_cosine_similarities.py | 90 ++--- 14 files changed, 835 insertions(+), 373 deletions(-) create mode 100644 clustering/clustering_base.py create mode 100644 clustering/hdbscan_clustering.py create mode 100644 clustering/select_kmeans.py create mode 100644 clustering/selection.py create mode 100644 similarities/lsi_similarities.py diff --git a/clustering/Makefile b/clustering/Makefile index 338f0a6..d09cfd9 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -2,20 +2,41 @@ srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh similarity_data=/gscratch/comdata/output/reddit_similarity clustering_data=/gscratch/comdata/output/reddit_clustering -selection_grid="--max_iter=3000 --convergence_iter=15,30,100 --damping=0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.97,0.99, --preference_quantile=0.1,0.3,0.5,0.7,0.9" +kmeans_selection_grid="--max_iter=3000 --n_init=[10] --n_clusters=[100,500,1000,1500,2000,2500,3000,2350,3500,3570,4000]" #selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]" -all:$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv +all:$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv # $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS # $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS -$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py - $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k $(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(selection_grid) -J 20 +$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py + $(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/kmeans $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(kmeans_selection_grid) -$(clustering_data)/subreddit_comment_terms_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py - $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv $(selection_grid) 
-J 20 +$(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py + $(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/kmeans $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(kmeans_selection_grid) -$(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather - $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(selection_grid) -J 20 +$(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather + $(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(kmeans_selection_grid) + + +affinity_selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]" +$(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py + $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/affinity $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 + +$(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py + $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/affinity $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 + +$(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather + $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/affinity $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 + +clean: + rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv + rm -f $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv + rm -f $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv + rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv + rm -f $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv + rm -f $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv + +PHONY: clean # $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py # $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch 
$(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS diff --git a/clustering/clustering.py b/clustering/clustering.py index 153a5c9..85be3fe 100755 --- a/clustering/clustering.py +++ b/clustering/clustering.py @@ -3,24 +3,23 @@ import sys import pandas as pd import numpy as np -from sklearn.cluster import AffinityPropagation +from sklearn.cluster import AffinityPropagation, KMeans import fire from pathlib import Path +from multiprocessing import cpu_count +from dataclasses import dataclass +from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat -def read_similarity_mat(similarities, use_threads=True): - df = pd.read_feather(similarities, use_threads=use_threads) - mat = np.array(df.drop('_subreddit',1)) - n = mat.shape[0] - mat[range(n),range(n)] = 1 - return (df._subreddit,mat) - -def affinity_clustering(similarities, *args, **kwargs): +def affinity_clustering(similarities, output, *args, **kwargs): subreddits, mat = read_similarity_mat(similarities) - return _affinity_clustering(mat, subreddits, *args, **kwargs) + clustering = _affinity_clustering(mat, *args, **kwargs) + cluster_data = process_clustering_result(clustering, subreddits) + cluster_data['algorithm'] = 'affinity' + return(cluster_data) def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True): ''' - similarities: feather file with a dataframe of similarity scores + similarities: matrix of similarity scores preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits. damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits by author. 
''' @@ -40,25 +39,32 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, verbose=verbose, random_state=random_state).fit(mat) - - print(f"clustering took {clustering.n_iter_} iterations") - clusters = clustering.labels_ - - print(f"found {len(set(clusters))} clusters") - - cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_}) - - cluster_sizes = cluster_data.groupby("cluster").count() - print(f"the largest cluster has {cluster_sizes.subreddit.max()} members") - - print(f"the median cluster has {cluster_sizes.subreddit.median()} members") - - print(f"{(cluster_sizes.subreddit==1).sum()} clusters have 1 member") - - sys.stdout.flush() + cluster_data = process_clustering_result(clustering, subreddits) + output = Path(output) + output.parent.mkdir(parents=True,exist_ok=True) cluster_data.to_feather(output) print(f"saved {output}") return clustering +def kmeans_clustering(similarities, *args, **kwargs): + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + clustering = _kmeans_clustering(mat, *args, **kwargs) + cluster_data = process_clustering_result(clustering, subreddits) + return(cluster_data) + +def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True): + + clustering = KMeans(n_clusters=n_clusters, + n_init=n_init, + max_iter=max_iter, + random_state=random_state, + verbose=verbose + ).fit(mat) + + return clustering + + + if __name__ == "__main__": fire.Fire(affinity_clustering) diff --git a/clustering/clustering_base.py b/clustering/clustering_base.py new file mode 100644 index 0000000..1d86438 --- /dev/null +++ b/clustering/clustering_base.py @@ -0,0 +1,49 @@ +from pathlib import Path +import numpy as np +import pandas as pd +from dataclasses import dataclass + +def sim_to_dist(mat): + dist = 1-mat + dist[dist < 0] = 0 + np.fill_diagonal(dist,0) + return dist + +def process_clustering_result(clustering, subreddits): + + if hasattr(clustering,'n_iter_'): + print(f"clustering took {clustering.n_iter_} iterations") + + clusters = clustering.labels_ + + print(f"found {len(set(clusters))} clusters") + + cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_}) + + cluster_sizes = cluster_data.groupby("cluster").count().reset_index() + print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members") + + print(f"the median cluster has {cluster_sizes.subreddit.median()} members") + + print(f"{(cluster_sizes.subreddit==1).sum()} clusters have 1 member") + + print(f"{(cluster_sizes.loc[cluster_sizes.cluster==-1,['subreddit']])} subreddits are in cluster -1",flush=True) + + return cluster_data + + +@dataclass +class clustering_result: + outpath:Path + max_iter:int + silhouette_score:float + alt_silhouette_score:float + name:str + n_clusters:int + +def read_similarity_mat(similarities, use_threads=True): + df = pd.read_feather(similarities, use_threads=use_threads) + mat = np.array(df.drop('_subreddit',1)) + n = mat.shape[0] + mat[range(n),range(n)] = 1 + return (df._subreddit,mat) diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py new file mode 100644 index 0000000..888554a --- /dev/null +++ b/clustering/hdbscan_clustering.py @@ -0,0 +1,172 @@ +from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat +from dataclasses import dataclass +import hdbscan +from sklearn.neighbors import NearestNeighbors +import plotnine as 
pn +import numpy as np +from itertools import product, starmap +import pandas as pd +from sklearn.metrics import silhouette_score, silhouette_samples +from pathlib import Path +from multiprocessing import Pool, cpu_count +import fire +from pyarrow.feather import write_feather + +def test_select_hdbscan_clustering(): + select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", + "test_hdbscan_author30k", + min_cluster_sizes=[2], + min_samples=[1,2], + cluster_selection_epsilons=[0,0.05,0.1,0.15], + cluster_selection_methods=['eom','leaf'], + lsi_dimensions='all') + inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI" + outpath = "test_hdbscan"; + min_cluster_sizes=[2,3,4]; + min_samples=[1,2,3]; + cluster_selection_epsilons=[0,0.1,0.3,0.5]; + cluster_selection_methods=['eom']; + lsi_dimensions='all' + +@dataclass +class hdbscan_clustering_result(clustering_result): + min_cluster_size:int + min_samples:int + cluster_selection_epsilon:float + cluster_selection_method:str + lsi_dimensions:int + n_isolates:int + silhouette_samples:str + +def select_hdbscan_clustering(inpath, + outpath, + outfile=None, + min_cluster_sizes=[2], + min_samples=[1], + cluster_selection_epsilons=[0], + cluster_selection_methods=['eom'], + lsi_dimensions='all' + ): + + inpath = Path(inpath) + outpath = Path(outpath) + outpath.mkdir(exist_ok=True, parents=True) + + if lsi_dimensions == 'all': + lsi_paths = list(inpath.glob("*")) + + else: + lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] + + lsi_nums = [p.stem for p in lsi_paths] + grid = list(product(lsi_nums, + min_cluster_sizes, + min_samples, + cluster_selection_epsilons, + cluster_selection_methods)) + + # fix the output file names + names = list(map(lambda t:'_'.join(map(str,t)),grid)) + + grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)] + + with Pool(int(cpu_count()/4)) as pool: + mods = starmap(hdbscan_clustering, grid) + + res = pd.DataFrame(mods) + if outfile is None: + outfile = outpath / "selection_data.csv" + + res.to_csv(outfile) + +def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + clustering = _hdbscan_clustering(mat, + min_cluster_size=min_cluster_size, + min_samples=min_samples, + cluster_selection_epsilon=cluster_selection_epsilon, + cluster_selection_method=cluster_selection_method, + metric='precomputed', + core_dist_n_jobs=cpu_count() + ) + + cluster_data = process_clustering_result(clustering, subreddits) + isolates = clustering.labels_ == -1 + scoremat = mat[~isolates][:,~isolates] + score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed') + cluster_data.to_feather(output) + + silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed') + silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp}) + silsampout = output.parent / ("silhouette_samples" + output.name) + silhouette_samp.to_feather(silsampout) + + result = hdbscan_clustering_result(outpath=output, + max_iter=None, + silhouette_samples=silsampout, + silhouette_score=score, + alt_silhouette_score=score, + name=name, + min_cluster_size=min_cluster_size, + min_samples=min_samples, + cluster_selection_epsilon=cluster_selection_epsilon, + 
cluster_selection_method=cluster_selection_method, + lsi_dimensions=lsi_dim, + n_isolates=isolates.sum(), + n_clusters=len(set(clustering.labels_)) + ) + + + + return(result) + +# for all runs we should try cluster_selection_epsilon = None +# for terms we should try cluster_selection_epsilon around 0.56-0.66 +# for authors we should try cluster_selection_epsilon around 0.98-0.99 +def _hdbscan_clustering(mat, *args, **kwargs): + print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}") + + print(mat) + clusterer = hdbscan.HDBSCAN(*args, + **kwargs, + ) + + clustering = clusterer.fit(mat.astype('double')) + + return(clustering) + +def KNN_distances_plot(mat,outname,k=2): + nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) + distances, indices = nbrs.kneighbors(mat) + d2 = distances[:,-1] + df = pd.DataFrame({'dist':d2}) + df = df.sort_values("dist",ascending=False) + df['idx'] = np.arange(0,d2.shape[0]) + 1 + p = pn.qplot(x='idx',y='dist',data=df,geom='line') + pn.scales.scale_y_continuous(minor_breaks = np.arange(0,50)/50, + breaks = np.arange(0,10)/10) + p.save(outname,width=16,height=10) + +def make_KNN_plots(): + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + + KNN_distances_plot(mat,k=2,outname='terms_knn_dist2.png') + + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + KNN_distances_plot(mat,k=2,outname='authors_knn_dist2.png') + + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') + +if __name__ == "__main__": + df = pd.read_csv("test_hdbscan/selection_data.csv") + test_select_hdbscan_clustering() + check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") + silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") + c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) diff --git a/clustering/select_affinity.py b/clustering/select_affinity.py index 520857d..b8bd13a 100644 --- a/clustering/select_affinity.py +++ b/clustering/select_affinity.py @@ -1,8 +1,8 @@ from sklearn.metrics import silhouette_score from sklearn.cluster import AffinityPropagation from functools import partial -from clustering import _affinity_clustering, read_similarity_mat from dataclasses import dataclass +from clustering import _affinity_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result from multiprocessing import Pool, cpu_count, Array, Process from pathlib import Path from itertools import product, starmap @@ -12,40 +12,69 @@ import fire import sys # silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying. 
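# --- illustrative sketch, not part of the patch -----------------------------
# The selection code in this series scores clusterings with the silhouette
# coefficient on a precomputed distance matrix, first converting cosine
# similarities to distances (see sim_to_dist and
# silhouette_score(metric='precomputed')). This hypothetical helper shows that
# pattern in isolation; it is not a function from the repository.
import numpy as np
from sklearn.metrics import silhouette_score

def silhouette_from_similarities(sim_mat, labels):
    dist = 1 - sim_mat            # cosine similarity -> cosine distance
    dist[dist < 0] = 0            # clamp small negative values from rounding
    np.fill_diagonal(dist, 0)     # each point is at distance 0 from itself
    return silhouette_score(dist, labels, metric='precomputed')
# -----------------------------------------------------------------------------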
- @dataclass -class clustering_result: - outpath:Path +class affinity_clustering_result(clustering_result): damping:float - max_iter:int convergence_iter:int preference_quantile:float - silhouette_score:float - alt_silhouette_score:float - name:str - -def sim_to_dist(mat): - dist = 1-mat - dist[dist < 0] = 0 - np.fill_diagonal(dist,0) - return dist - -def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): +def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): if name is None: name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}" print(name) sys.stdout.flush() outpath = outdir / (str(name) + ".feather") + outpath.parent.mkdir(parents=True,exist_ok=True) + print(outpath) + clustering = _affinity_clustering(mat, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose) + cluster_data = process_clustering_result(clustering, subreddits) + mat = sim_to_dist(clustering.affinity_matrix_) + + try: + score = silhouette_score(mat, clustering.labels_, metric='precomputed') + except ValueError: + score = None + + if alt_mat is not None: + alt_distances = sim_to_dist(alt_mat) + try: + alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed') + except ValueError: + alt_score = None + + res = affinity_clustering_result(outpath=outpath, + damping=damping, + max_iter=max_iter, + convergence_iter=convergence_iter, + preference_quantile=preference_quantile, + silhouette_score=score, + alt_silhouette_score=score, + name=str(name)) + + return res + +def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): + if name is None: + name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}" + print(name) + sys.stdout.flush() + outpath = outdir / (str(name) + ".feather") + outpath.parent.mkdir(parents=True,exist_ok=True) print(outpath) clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose) mat = sim_to_dist(clustering.affinity_matrix_) - score = silhouette_score(mat, clustering.labels_, metric='precomputed') + try: + score = silhouette_score(mat, clustering.labels_, metric='precomputed') + except ValueError: + score = None if alt_mat is not None: alt_distances = sim_to_dist(alt_mat) - alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed') + try: + alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed') + except ValueError: + alt_score = None res = clustering_result(outpath=outpath, damping=damping, @@ -58,6 +87,7 @@ def do_clustering(damping, convergence_iter, preference_quantile, name, mat, sub return res + # alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering). 
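# --- illustrative sketch, not part of the patch -----------------------------
# select_affinity_clustering (below) sweeps a grid of hyperparameters and fans
# the combinations out to worker processes. The expansion looks roughly like
# this; the values here are example settings, not the Makefile defaults.
from itertools import product

damping = [0.85, 0.95, 0.99]
convergence_iter = [15, 30]
preference_quantile = [0.3, 0.5, 0.9]

hyper_grid = product(damping, convergence_iter, preference_quantile)
# append a stable run name to each combination, as the selection code does
hyper_grid = [t + (str(i),) for i, t in enumerate(hyper_grid)]
# each tuple is then passed to do_affinity_clustering via Pool.starmap and the
# resulting clustering_result records are collected into a selection CSV
# -----------------------------------------------------------------------------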
def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None): @@ -86,7 +116,7 @@ def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max hyper_grid = product(damping, convergence_iter, preference_quantile) hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid)) - _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat) + _do_clustering = partial(do_affinity_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat) # similarities = Array('d', mat) # call pool.starmap @@ -94,6 +124,7 @@ def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max clustering_data = pool.starmap(_do_clustering, hyper_grid) clustering_data = pd.DataFrame(list(clustering_data)) clustering_data.to_csv(outinfo) + return clustering_data diff --git a/clustering/select_kmeans.py b/clustering/select_kmeans.py new file mode 100644 index 0000000..b07a108 --- /dev/null +++ b/clustering/select_kmeans.py @@ -0,0 +1,92 @@ +from sklearn.metrics import silhouette_score +from sklearn.cluster import AffinityPropagation +from functools import partial +from clustering import _kmeans_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result +from dataclasses import dataclass +from multiprocessing import Pool, cpu_count, Array, Process +from pathlib import Path +from itertools import product, starmap +import numpy as np +import pandas as pd +import fire +import sys + +@dataclass +class kmeans_clustering_result(clustering_result): + n_clusters:int + n_init:int + + +# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying. + +def do_clustering(n_clusters, n_init, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): + if name is None: + name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}" + print(name) + sys.stdout.flush() + outpath = outdir / (str(name) + ".feather") + print(outpath) + mat = sim_to_dist(mat) + clustering = _kmeans_clustering(mat, outpath, n_clusters, n_init, max_iter, random_state, verbose) + + outpath.parent.mkdir(parents=True,exist_ok=True) + cluster_data.to_feather(outpath) + cluster_data = process_clustering_result(clustering, subreddits) + + try: + score = silhouette_score(mat, clustering.labels_, metric='precomputed') + except ValueError: + score = None + + if alt_mat is not None: + alt_distances = sim_to_dist(alt_mat) + try: + alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed') + except ValueError: + alt_score = None + + res = kmeans_clustering_result(outpath=outpath, + max_iter=max_iter, + n_clusters=n_clusters, + n_init = n_init, + silhouette_score=score, + alt_silhouette_score=score, + name=str(name)) + + return res + + +# alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering). 
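# --- illustrative sketch, not part of the patch -----------------------------
# The k-means path added in select_kmeans.py fits on distances derived from
# the similarity matrix rather than on raw feature vectors. A minimal
# end-to-end use of the helpers might look like this; the feather path and
# n_clusters value are example inputs, not project defaults.
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

df = pd.read_feather("subreddit_comment_authors_10k.feather")  # hypothetical path
subreddits = df._subreddit
mat = np.array(df.drop(columns=['_subreddit']))
dist = 1 - mat
dist[dist < 0] = 0
np.fill_diagonal(dist, 0)

clustering = KMeans(n_clusters=500, n_init=10, max_iter=100000,
                    random_state=1968).fit(dist)
cluster_data = pd.DataFrame({'subreddit': subreddits,
                             'cluster': clustering.labels_})
# -----------------------------------------------------------------------------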
+def select_kmeans_clustering(similarities, outdir, outinfo, n_clusters=[1000], max_iter=100000, n_init=10, random_state=1968, verbose=True, alt_similarities=None): + + n_clusters = list(map(int,n_clusters)) + n_init = list(map(int,n_init)) + + if type(outdir) is str: + outdir = Path(outdir) + + outdir.mkdir(parents=True,exist_ok=True) + + subreddits, mat = read_similarity_mat(similarities,use_threads=True) + + if alt_similarities is not None: + alt_mat = read_similarity_mat(alt_similarities,use_threads=True) + else: + alt_mat = None + + # get list of tuples: the combinations of hyperparameters + hyper_grid = product(n_clusters, n_init) + hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid)) + + _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat) + + # call starmap + print("running clustering selection") + clustering_data = starmap(_do_clustering, hyper_grid) + clustering_data = pd.DataFrame(list(clustering_data)) + clustering_data.to_csv(outinfo) + + return clustering_data + +if __name__ == "__main__": + x = fire.Fire(select_kmeans_clustering) diff --git a/clustering/selection.py b/clustering/selection.py new file mode 100644 index 0000000..d2fa6de --- /dev/null +++ b/clustering/selection.py @@ -0,0 +1,7 @@ +import fire +from select_affinity import select_affinity_clustering +from select_kmeans import select_kmeans_clustering + +if __name__ == "__main__": + fire.Fire({"kmeans":select_kmeans_clustering, + "affinity":select_affinity_clustering}) diff --git a/similarities/Makefile b/similarities/Makefile index 0ec0342..cfe8a49 100644 --- a/similarities/Makefile +++ b/similarities/Makefile @@ -1,25 +1,130 @@ -all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms.parquet +#all: /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_130k.parquet +srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh +srun_singularity_huge=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity_huge.sh +base_data=/gscratch/comdata/output/ +similarity_data=${base_data}/reddit_similarity +tfidf_data=${similarity_data}/tfidf +tfidf_weekly_data=${similarity_data}/tfidf_weekly +similarity_weekly_data=${similarity_data}/weekly +lsi_components=[10,50,100,200,300,400,500,600,700,850,1000,1500] + +lsi_similarities: ${similarity_data}/subreddit_comment_terms_10k_LSI ${similarity_data}/subreddit_comment_authors-tf_10k_LSI ${similarity_data}/subreddit_comment_authors_10k_LSI ${similarity_data}/subreddit_comment_terms_30k_LSI ${similarity_data}/subreddit_comment_authors-tf_30k_LSI ${similarity_data}/subreddit_comment_authors_30k_LSI + +all: ${tfidf_data}/comment_terms_100k.parquet ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_100k.parquet 
${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather ${similarity_data}/subreddit_comment_terms_100k.feather ${similarity_data}/subreddit_comment_authors_100k.feather ${similarity_data}/subreddit_comment_authors-tf_100k.feather ${similarity_weekly_data}/comment_terms.parquet + +#${tfidf_weekly_data}/comment_terms_100k.parquet ${tfidf_weekly_data}/comment_authors_100k.parquet ${tfidf_weekly_data}/comment_terms_30k.parquet ${tfidf_weekly_data}/comment_authors_30k.parquet ${similarity_weekly_data}/comment_terms_100k.parquet ${similarity_weekly_data}/comment_authors_100k.parquet ${similarity_weekly_data}/comment_terms_30k.parquet ${similarity_weekly_data}/comment_authors_30k.parquet + +# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_weekly_130k.parquet # all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet +${similarity_weekly_data}/comment_terms.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_terms.parquet + ${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=10000 --outfile=${similarity_weekly_data}/comment_terms.parquet -# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet -# start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.feather +${similarity_data}/subreddit_comment_terms_10k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k.feather --topN=10000 -/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv - start_spark_and_run.sh 1 tfidf.py terms --topN=10000 +${similarity_data}/subreddit_comment_terms_10k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py + ${srun_singularity} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=200 -/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet 
/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv - start_spark_and_run.sh 1 tfidf.py authors --topN=10000 +${similarity_data}/subreddit_comment_terms_30k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py + ${srun_singularity} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=200 -/gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet - start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather +${similarity_data}/subreddit_comment_terms_30k.feather: ${tfidf_data}/comment_terms_30k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k.feather --topN=30000 -/gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet - start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather +${similarity_data}/subreddit_comment_authors_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k.feather --topN=30000 -# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet +${similarity_data}/subreddit_comment_authors_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k.feather --topN=10000 + +${similarity_data}/subreddit_comment_authors_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py + ${srun_singularity} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=2 + +${similarity_data}/subreddit_comment_authors_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py + ${srun_singularity} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=2 + +${similarity_data}/subreddit_comment_authors-tf_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k.feather --topN=30000 + +${similarity_data}/subreddit_comment_authors-tf_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k.feather --topN=10000 + +${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py + ${srun_singularity} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 
--n_components=${lsi_components} --min_df=2 + +${similarity_data}/subreddit_comment_authors-tf_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py + ${srun_singularity} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=2 + +${similarity_data}/subreddit_comment_terms_100k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_100k.feather --topN=100000 + +${similarity_data}/subreddit_comment_authors_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_100k.feather --topN=100000 + +${similarity_data}/subreddit_comment_authors-tf_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py + ${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_100k.feather --topN=100000 + +${tfidf_data}/comment_terms_100k.feather/: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv + mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 4 tfidf.py terms --topN=100000 --outpath=${tfidf_data}/comment_terms_100k.feather + +${tfidf_data}/comment_terms_30k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv + mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 4 tfidf.py terms --topN=30000 --outpath=${tfidf_data}/comment_terms_30k.feather + +${tfidf_data}/comment_terms_10k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv + mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 4 tfidf.py terms --topN=10000 --outpath=${tfidf_data}/comment_terms_10k.feather + +${tfidf_data}/comment_authors_100k.feather: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv + mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 4 tfidf.py authors --topN=100000 --outpath=${tfidf_data}/comment_authors_100k.feather + +${tfidf_data}/comment_authors_10k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv + mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 4 tfidf.py authors --topN=10000 --outpath=${tfidf_data}/comment_authors_10k.parquet + +${tfidf_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv + mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 4 tfidf.py authors --topN=30000 --outpath=${tfidf_data}/comment_authors_30k.parquet + +${tfidf_data}/tfidf_weekly/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv + start_spark_and_run.sh 4 tfidf.py terms_weekly --topN=100000 --outpath=${similarity_data}/tfidf_weekly/comment_authors_100k.parquet + +${tfidf_data}/tfidf_weekly/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_ppnum_comments.csv + start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=100000 --outpath=${tfidf_weekly_data}/comment_authors_100k.parquet + 
+${tfidf_weekly_data}/comment_terms_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv + start_spark_and_run.sh 4 tfidf.py terms_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet + +${tfidf_weekly_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv + start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet + +${similarity_weekly_data}/comment_terms_100k.parquet: weekly_cosine_similarities.py similarities_helper.py ${tfidf_weekly_data}/comment_terms_100k.parquet + ${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet + +${similarity_weekly_data}/comment_authors_100k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_authors_100k.parquet + ${srun_singularity} python3 weekly_cosine_similarities.py authors --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet + +${similarity_weekly_data}/comment_terms_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_terms_30k.parquet + ${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet + +${similarity_weekly_data}/comment_authors_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_authors_30k.parquet + ${srun_singularity} python3 weekly_cosine_similarities.py authors --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet + +# ${tfidf_weekly_data}/comment_authors_130k.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv +# start_spark_and_run.sh 1 tfidf.py authors_weekly --topN=130000 + +# /gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet +# start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather + +# /gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet +# start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather + +# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py ${tfidf_weekly_data}/comment_authors.parquet # start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet -/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: 
cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet - start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet +# /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet +# start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet diff --git a/similarities/cosine_similarities.py b/similarities/cosine_similarities.py index 38b1d7c..0c9c986 100644 --- a/similarities/cosine_similarities.py +++ b/similarities/cosine_similarities.py @@ -2,12 +2,13 @@ import pandas as pd import fire from pathlib import Path from similarities_helper import similarities, column_similarities +from functools import partial def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) - +# change so that these take in an input as an optional argument (for speed, but also for idf). def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): return cosine_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', diff --git a/similarities/job_script.sh b/similarities/job_script.sh index 03e77de..1f363cd 100755 --- a/similarities/job_script.sh +++ b/similarities/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -spark-submit --master spark://$(hostname):18899 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather -stop-all.sh +singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000 +singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh diff --git a/similarities/lsi_similarities.py b/similarities/lsi_similarities.py new file mode 100644 index 0000000..7ab7e8c --- /dev/null +++ b/similarities/lsi_similarities.py @@ -0,0 +1,61 @@ +import pandas as pd +import fire +from pathlib import Path +from similarities_helper import similarities, lsi_column_similarities +from functools import partial + +def lsi_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack'): + print(n_components,flush=True) + + simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm) + + return similarities(infile=infile, simfunc=simfunc, 
term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) + +# change so that these take in an input as an optional argument (for speed, but also for idf). +def term_lsi_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): + + return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', + 'term', + outfile, + min_df, + max_df, + included_subreddits, + topN, + from_date, + to_date, + n_components=n_components + ) + +def author_lsi_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): + return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', + 'author', + outfile, + min_df, + max_df, + included_subreddits, + topN, + from_date=from_date, + to_date=to_date, + n_components=n_components + ) + +def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): + return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', + 'author', + outfile, + min_df, + max_df, + included_subreddits, + topN, + from_date=from_date, + to_date=to_date, + tfidf_colname='relative_tf', + n_components=n_components + ) + + +if __name__ == "__main__": + fire.Fire({'term':term_lsi_similarities, + 'author':author_lsi_similarities, + 'author-tf':author_tf_similarities}) + diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index fd532a9..7f8a639 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -2,6 +2,7 @@ from pyspark.sql import SparkSession from pyspark.sql import Window from pyspark.sql import functions as f from enum import Enum +from multiprocessing import cpu_count, Pool from pyspark.mllib.linalg.distributed import CoordinateMatrix from tempfile import TemporaryDirectory import pyarrow @@ -19,46 +20,16 @@ class tf_weight(Enum): MaxTF = 1 Norm05 = 2 -infile = "/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet" +infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet" cache_file = "/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet" -def reindex_tfidf_time_interval(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - - spark = SparkSession.builder.getOrCreate() - conf = spark.sparkContext.getConf() - print(exclude_phrases) - tfidf_weekly = spark.read.parquet(infile) - - # create the time interval - if from_date is not None: - if type(from_date) is str: - from_date = datetime.fromisoformat(from_date) - - tfidf_weekly = tfidf_weekly.filter(tfidf_weekly.week >= from_date) - - if to_date is not None: - if type(to_date) is str: - to_date = datetime.fromisoformat(to_date) - tfidf_weekly = tfidf_weekly.filter(tfidf_weekly.week < to_date) - - tfidf = tfidf_weekly.groupBy(["subreddit","week", term_id, term]).agg(f.sum("tf").alias("tf")) - tfidf = 
_calc_tfidf(tfidf, term_colname, tf_weight.Norm05) - tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, max_df, included_subreddits) - tfidf = spark.read_parquet(tempdir.name) - subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas() - subreddit_names = subreddit_names.sort_values("subreddit_id_new") - subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1 - return(tempdir, subreddit_names) +def termauthor_tfidf(term_tfidf_callable, author_tfidf_callable): + # subreddits missing after this step don't have any terms that have a high enough idf -def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, tf_family=tf_weight.MaxTF): - spark = SparkSession.builder.getOrCreate() - conf = spark.sparkContext.getConf() - print(exclude_phrases) - +# try rewriting without merges +def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF): + print("loading tfidf", flush=True) tfidf_ds = ds.dataset(infile) if included_subreddits is None: @@ -74,94 +45,116 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre if max_df is not None: ds_filter &= ds.field("count") <= max_df + if week is not None: + ds_filter &= ds.field("week") == week + + if from_date is not None: + ds_filter &= ds.field("week") >= from_date + + if to_date is not None: + ds_filter &= ds.field("week") <= to_date + + term = term_colname + term_id = term + '_id' + term_id_new = term + '_id_new' + + projection = { + 'subreddit_id':ds.field('subreddit_id'), + term_id:ds.field(term_id), + 'relative_tf':ds.field("relative_tf").cast('float32') + } + + if not rescale_idf: + projection = { + 'subreddit_id':ds.field('subreddit_id'), + term_id:ds.field(term_id), + 'relative_tf':ds.field('relative_tf').cast('float32'), + 'tf_idf':ds.field('tf_idf').cast('float32')} + + tfidf_ds = ds.dataset(infile) + + df = tfidf_ds.to_table(filter=ds_filter,columns=projection) + + df = df.to_pandas(split_blocks=True,self_destruct=True) + print("assigning indexes",flush=True) + df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + grouped = df.groupby(term_id) + df[term_id_new] = grouped.ngroup() + + if rescale_idf: + print("computing idf", flush=True) + df['new_count'] = grouped[term_id].transform('count') + N_docs = df.subreddit_id_new.max() + 1 + df['idf'] = np.log(N_docs/(1+df.new_count),dtype='float32') + 1 + if tf_family == tf_weight.MaxTF: + df["tf_idf"] = df.relative_tf * df.idf + else: # tf_fam = tf_weight.Norm05 + df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf + + print("assigning names") + subreddit_names = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id']) + batches = subreddit_names.to_batches() + + with Pool(cpu_count()) as pool: + chunks = pool.imap_unordered(pull_names,batches) + subreddit_names = pd.concat(chunks,copy=False).drop_duplicates() + + subreddit_names = subreddit_names.set_index("subreddit_id") + new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() + new_ids = new_ids.set_index('subreddit_id') + subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() + subreddit_names = subreddit_names.drop("subreddit_id",1) + subreddit_names = subreddit_names.sort_values("subreddit_id_new") + return(df, subreddit_names) + +def pull_names(batch): + return(batch.to_pandas().drop_duplicates()) + +def 
similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): + ''' + tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities. + ''' + + def proc_sims(sims, outfile): + if issparse(sims): + sims = sims.todense() + + print(f"shape of sims:{sims.shape}") + print(f"len(subreddit_names.subreddit.values):{len(subreddit_names.subreddit.values)}",flush=True) + sims = pd.DataFrame(sims) + sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1) + sims['_subreddit'] = subreddit_names.subreddit.values + + p = Path(outfile) + + output_feather = Path(str(p).replace("".join(p.suffixes), ".feather")) + output_csv = Path(str(p).replace("".join(p.suffixes), ".csv")) + output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet")) + outfile.parent.mkdir(exist_ok=True, parents=True) + + sims.to_feather(outfile) + term = term_colname term_id = term + '_id' term_id_new = term + '_id_new' - df = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id',term_id,'relative_tf']).to_pandas() - - sub_ids = df.subreddit_id.drop_duplicates() - new_sub_ids = pd.DataFrame({'subreddit_id':old,'subreddit_id_new':new} for new, old in enumerate(sorted(sub_ids))) - df = df.merge(new_sub_ids,on='subreddit_id',how='inner',validate='many_to_one') - - new_count = df.groupby(term_id)[term_id].aggregate(new_count='count').reset_index() - df = df.merge(new_count,on=term_id,how='inner',validate='many_to_one') - - term_ids = df[term_id].drop_duplicates() - new_term_ids = pd.DataFrame({term_id:old,term_id_new:new} for new, old in enumerate(sorted(term_ids))) - - df = df.merge(new_term_ids, on=term_id, validate='many_to_one') - N_docs = sub_ids.shape[0] - - df['idf'] = np.log(N_docs/(1+df.new_count)) + 1 - - # agg terms by subreddit to make sparse tf/df vectors - if tf_family == tf_weight.MaxTF: - df["tf_idf"] = df.relative_tf * df.idf - else: # tf_fam = tf_weight.Norm05 - df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf - - subreddit_names = df.loc[:,['subreddit','subreddit_id_new']].drop_duplicates() - subreddit_names = subreddit_names.sort_values("subreddit_id_new") - return(df, subreddit_names) - - -def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): - ''' - tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities. 
- ''' - if from_date is not None or to_date is not None: - tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date) - mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname) - else: - entries, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False) - mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1))) + entries, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date) + mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new], entries.subreddit_id_new))) print("loading matrix") # mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname) print(f'computing similarities on mat. mat.shape:{mat.shape}') - print(f"size of mat is:{mat.data.nbytes}") + print(f"size of mat is:{mat.data.nbytes}",flush=True) sims = simfunc(mat) del mat - if issparse(sims): - sims = sims.todense() - - print(f"shape of sims:{sims.shape}") - print(f"len(subreddit_names.subreddit.values):{len(subreddit_names.subreddit.values)}") - sims = pd.DataFrame(sims) - sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1) - sims['_subreddit'] = subreddit_names.subreddit.values - - p = Path(outfile) - - output_feather = Path(str(p).replace("".join(p.suffixes), ".feather")) - output_csv = Path(str(p).replace("".join(p.suffixes), ".csv")) - output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet")) - - sims.to_feather(outfile) -# tempdir.cleanup() - -def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - - dataset = ds.dataset(path,format='parquet') - entries = dataset.to_table(columns=[tfidf_colname,'subreddit_id_new', term_id_new],filter=ds.field('week')==week).to_pandas() - return(csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1)))) - -def read_tfidf_matrix(path, term_colname, tfidf_colname='tf_idf'): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - dataset = ds.dataset(path,format='parquet') - print(f"tfidf_colname:{tfidf_colname}") - entries = dataset.to_table(columns=[tfidf_colname, 'subreddit_id_new',term_id_new]).to_pandas() - return(csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)))) - + if hasattr(sims,'__next__'): + for simmat, name in sims: + proc_sims(simmat, Path(outfile)/(str(name) + ".feather")) + else: + proc_sims(simmat, outfile) def write_weekly_similarities(path, sims, week, names): sims['week'] = week @@ -182,155 +175,62 @@ def column_overlaps(mat): return intersection / den +def test_lsi_sims(): + term = "term" + term_id = term + '_id' + term_id_new = term + '_id_new' + + t1 = time.perf_counter() + entries, subreddit_names = reindex_tfidf("/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k_repartitioned.parquet", + term_colname='term', + min_df=2000, + topN=10000 + ) + t2 = time.perf_counter() + print(f"first load took:{t2 - t1}s") + + entries, subreddit_names = 
reindex_tfidf("/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet", + term_colname='term', + min_df=2000, + topN=10000 + ) + t3=time.perf_counter() + + print(f"second load took:{t3 - t2}s") + + mat = csr_matrix((entries['tf_idf'],(entries[term_id_new], entries.subreddit_id_new))) + sims = list(lsi_column_similarities(mat, [10,50])) + sims_og = sims + sims_test = list(lsi_column_similarities(mat,[10,50],algorithm='randomized',n_iter=10)) + # n_components is the latent dimensionality. sklearn recommends 100. More might be better -# if algorithm is 'random' instead of 'arpack' then n_iter gives the number of iterations. +# if n_components is a list we'll return a list of similarities with different latent dimensionalities +# if algorithm is 'randomized' instead of 'arpack' then n_iter gives the number of iterations. # this function takes the svd and then the column similarities of it -def lsi_column_similarities(tfidfmat,n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): +def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized'): # first compute the lsi of the matrix # then take the column similarities - svd = TruncatedSVD(n_components=n_components,random_state=random_state,algorithm='arpack') + print("running LSI",flush=True) + + if type(n_components) is int: + n_components = [n_components] + + n_components = sorted(n_components,reverse=True) + + svd_components = n_components[0] + svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter) mod = svd.fit(tfidfmat.T) lsimat = mod.transform(tfidfmat.T) - sims = column_similarities(lsimat) - return sims + for n_dims in n_components: + sims = column_similarities(lsimat[:,np.arange(n_dims)]) + if len(n_components) > 1: + yield (sims, n_dims) + else: + return sims def column_similarities(mat): return 1 - pairwise_distances(mat,metric='cosine') - # if issparse(mat): - # norm = np.matrix(np.power(mat.power(2).sum(axis=0),0.5,dtype=np.float32)) - # mat = mat.multiply(1/norm) - # else: - # norm = np.matrix(np.power(np.power(mat,2).sum(axis=0),0.5,dtype=np.float32)) - # mat = np.multiply(mat,1/norm) - # sims = mat.T @ mat - # return(sims) - - -def prep_tfidf_entries_weekly(tfidf, term_colname, min_df, max_df, included_subreddits): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - - if min_df is None: - min_df = 0.1 * len(included_subreddits) - tfidf = tfidf.filter(f.col('count') >= min_df) - if max_df is not None: - tfidf = tfidf.filter(f.col('count') <= max_df) - - tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits)) - - # we might not have the same terms or subreddits each week, so we need to make unique ids for each week. 
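
# A minimal in-memory sketch of the same per-week re-indexing, assuming a pandas
# DataFrame `tfidf` with 'week', 'subreddit_id', and term-id columns; the frame,
# the column names, and the helper below are illustrative, not the exact schema.
import pandas as pd

def reindex_by_week(tfidf, term_id, term_id_new):
    out = []
    for week, grp in tfidf.groupby('week'):
        grp = grp.copy()
        # dense per-week ids (zero-based and in order of appearance here,
        # rather than 1-based and sorted as in the Spark version above)
        grp['subreddit_id_new'] = pd.factorize(grp['subreddit_id'])[0]
        grp[term_id_new] = pd.factorize(grp[term_id])[0]
        out.append(grp)
    return pd.concat(out)
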
- sub_ids = tfidf.select(['subreddit_id','week']).distinct() - sub_ids = sub_ids.withColumn("subreddit_id_new",f.row_number().over(Window.partitionBy('week').orderBy("subreddit_id"))) - tfidf = tfidf.join(sub_ids,['subreddit_id','week']) - - # only use terms in at least min_df included subreddits in a given week - new_count = tfidf.groupBy([term_id,'week']).agg(f.count(term_id).alias('new_count')) - tfidf = tfidf.join(new_count,[term_id,'week'],how='inner') - - # reset the term ids - term_ids = tfidf.select([term_id,'week']).distinct() - term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.partitionBy('week').orderBy(term_id))) - tfidf = tfidf.join(term_ids,[term_id,'week']) - - tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old") - tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float')) - - tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.') - - tfidf = tfidf.repartition('week') - - tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy') - return(tempdir) - - -def prep_tfidf_entries(tfidf, term_colname, min_df, max_df, included_subreddits): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - - if min_df is None: - min_df = 0.1 * len(included_subreddits) - - tfidf = tfidf.filter(f.col('count') >= min_df) - if max_df is not None: - tfidf = tfidf.filter(f.col('count') <= max_df) - - tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits)) - - # reset the subreddit ids - sub_ids = tfidf.select('subreddit_id').distinct() - sub_ids = sub_ids.withColumn("subreddit_id_new", f.row_number().over(Window.orderBy("subreddit_id"))) - tfidf = tfidf.join(sub_ids,'subreddit_id') - - # only use terms in at least min_df included subreddits - new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count')) - tfidf = tfidf.join(new_count,term_id,how='inner') - - # reset the term ids - term_ids = tfidf.select([term_id]).distinct() - term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.orderBy(term_id))) - tfidf = tfidf.join(term_ids,term_id) - - tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old") - tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float')) - - tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.') - - tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy') - return tempdir - - -# try computing cosine similarities using spark -def spark_cosine_similarities(tfidf, term_colname, min_df, included_subreddits, similarity_threshold): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - - if min_df is None: - min_df = 0.1 * len(included_subreddits) - - tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits)) - tfidf = tfidf.cache() - - # reset the subreddit ids - sub_ids = tfidf.select('subreddit_id').distinct() - sub_ids = sub_ids.withColumn("subreddit_id_new",f.row_number().over(Window.orderBy("subreddit_id"))) - tfidf = tfidf.join(sub_ids,'subreddit_id') - - # only use terms in at least min_df included subreddits - new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count')) - tfidf = tfidf.join(new_count,term_id,how='inner') - - # reset the term ids - term_ids = tfidf.select([term_id]).distinct() - term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.orderBy(term_id))) - tfidf = tfidf.join(term_ids,term_id) - - tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old") - tfidf = tfidf.withColumn("tf_idf", 
tfidf.relative_tf * tfidf.idf) - - # step 1 make an rdd of entires - # sorted by (dense) spark subreddit id - n_partitions = int(len(included_subreddits)*2 / 5) - - entries = tfidf.select(f.col(term_id_new)-1,f.col("subreddit_id_new")-1,"tf_idf").rdd.repartition(n_partitions) - - # put like 10 subredis in each partition - - # step 2 make it into a distributed.RowMatrix - coordMat = CoordinateMatrix(entries) - - coordMat = CoordinateMatrix(coordMat.entries.repartition(n_partitions)) - - # this needs to be an IndexedRowMatrix() - mat = coordMat.toRowMatrix() - - #goal: build a matrix of subreddit columns and tf-idfs rows - sim_dist = mat.columnSimilarities(threshold=similarity_threshold) - - return (sim_dist, tfidf) def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05): @@ -382,7 +282,9 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig else: # tf_fam = tf_weight.Norm05 df = df.withColumn("tf_idf", (0.5 + 0.5 * df.relative_tf) * df.idf) - return df + df = df.repartition(400,'subreddit','week') + dfwriter = df.write.partitionBy("week").sortBy("subreddit") + return dfwriter def _calc_tfidf(df, term_colname, tf_family): term = term_colname @@ -393,7 +295,7 @@ def _calc_tfidf(df, term_colname, tf_family): df = df.join(max_subreddit_terms, on='subreddit') - df = df.withColumn("relative_tf", df.tf / df.sr_max_tf) + df = df.withColumn("relative_tf", (df.tf / df.sr_max_tf)) # group by term. term is unique idf = df.groupby([term]).count() @@ -436,8 +338,9 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf')) df = _calc_tfidf(df, term_colname, tf_family) - - return df + df = df.repartition('subreddit') + dfwriter = df.write.sortBy("subreddit","tf") + return dfwriter def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"): rankdf = pd.read_csv(path) @@ -445,3 +348,18 @@ def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarit return included_subreddits +def repartition_tfidf(inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet", + outpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k_repartitioned.parquet"): + spark = SparkSession.builder.getOrCreate() + df = spark.read.parquet(inpath) + df = df.repartition(400,'subreddit') + df.write.parquet(outpath,mode='overwrite') + + +def repartition_tfidf_weekly(inpath="/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet", + outpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_repartitioned.parquet"): + spark = SparkSession.builder.getOrCreate() + df = spark.read.parquet(inpath) + df = df.repartition(400,'subreddit','week') + dfwriter = df.write.partitionBy("week") + dfwriter.parquet(outpath,mode='overwrite') diff --git a/similarities/tfidf.py b/similarities/tfidf.py index 30033a8..002e89f 100644 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -15,10 +15,9 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_ else: include_subs = select_topN_subreddits(topN) - df = func(df, include_subs, term_colname) - - df.write.parquet(outpath,mode='overwrite',compression='snappy') + dfwriter = func(df, include_subs, term_colname) + dfwriter.parquet(outpath,mode='overwrite',compression='snappy') spark.stop() def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits): diff 
--git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py index 044ee75..e24ceee 100644 --- a/similarities/weekly_cosine_similarities.py +++ b/similarities/weekly_cosine_similarities.py @@ -3,78 +3,78 @@ from pyspark.sql import SparkSession from pyspark.sql import Window import numpy as np import pyarrow +import pyarrow.dataset as ds import pandas as pd import fire -from itertools import islice +from itertools import islice, chain from pathlib import Path from similarities_helper import * from multiprocessing import Pool, cpu_count +from functools import partial -def _week_similarities(tempdir, term_colname, week): - print(f"loading matrix: {week}") - mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week) - print('computing similarities') - sims = column_similarities(mat) - del mat - names = subreddit_names.loc[subreddit_names.week == week] - sims = pd.DataFrame(sims.todense()) +def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, included_subreddits, topN, outdir:Path): + term = term_colname + term_id = term + '_id' + term_id_new = term + '_id_new' + print(f"loading matrix: {week}") + entries, subreddit_names = reindex_tfidf(infile = tfidf_path, + term_colname=term_colname, + min_df=min_df, + max_df=max_df, + included_subreddits=included_subreddits, + topN=topN, + week=week) + mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new], entries.subreddit_id_new))) + print('computing similarities') + sims = column_similarities(mat) + del mat + sims = pd.DataFrame(sims.todense()) + sims = sims.rename({i: sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1) + sims['_subreddit'] = names.subreddit.values + outfile = str(Path(outdir) / str(week)) + write_weekly_similarities(outfile, sims, week, names) - sims = sims.rename({i: sr for i, sr in enumerate(names.subreddit.values)}, axis=1) - sims['_subreddit'] = names.subreddit.values - - write_weekly_similarities(outfile, sims, week, names) +def pull_weeks(batch): + return set(batch.to_pandas()['week']) #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet') -def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, included_subreddits = None, topN = 500): - spark = SparkSession.builder.getOrCreate() - conf = spark.sparkContext.getConf() +def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, max_df=None, included_subreddits = None, topN = 500): print(outfile) - tfidf = spark.read.parquet(tfidf_path) - - if included_subreddits is None: - included_subreddits = select_topN_subreddits(topN) - else: - included_subreddits = set(open(included_subreddits)) + tfidf_ds = ds.dataset(tfidf_path) + tfidf_ds = tfidf_ds.to_table(columns=["week"]) + batches = tfidf_ds.to_batches() - print(f"computing weekly similarities for {len(included_subreddits)} subreddits") + with Pool(cpu_count()) as pool: + weeks = set(chain( * pool.imap_unordered(pull_weeks,batches))) - print("creating temporary parquet with matrix indicies") - tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, max_df=None, included_subreddits=included_subreddits) - - tfidf = spark.read.parquet(tempdir.name) - - # the ids can change each week. 
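
# A minimal sketch of what the per-week flow amounts to once one week's entries
# have been re-indexed in memory; `entries` and `subreddit_names` stand in for
# the values returned by reindex_tfidf(..., week=week), and `term_id_new` and
# `value_col` are illustrative parameter names.
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances

def week_subreddit_sims(entries, subreddit_names, term_id_new, value_col='tf_idf'):
    # rows are terms, columns are subreddits
    mat = csr_matrix((entries[value_col],
                      (entries[term_id_new], entries.subreddit_id_new)))
    # cosine similarity between subreddit columns
    sims = 1 - pairwise_distances(mat.T, metric='cosine')
    sims = pd.DataFrame(sims, columns=subreddit_names.subreddit.values)
    sims['_subreddit'] = subreddit_names.subreddit.values
    return sims
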
- subreddit_names = tfidf.select(['subreddit','subreddit_id_new','week']).distinct().toPandas() - subreddit_names = subreddit_names.sort_values("subreddit_id_new") - subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1 - spark.stop() - - weeks = sorted(list(subreddit_names.week.drop_duplicates())) + weeks = sorted(weeks) # do this step in parallel if we have the memory for it. # should be doable with pool.map - def week_similarities_helper(week): - _week_similarities(tempdir, term_colname, week) + print(f"computing weekly similarities") + week_similarities_helper = partial(_week_similarities,simfunc=column_similarities, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df,max_df=max_df,included_subreddits=included_subreddits,topN=topN) with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine? list(pool.map(week_similarities_helper,weeks)) -def author_cosine_similarities_weekly(outfile, min_df=2 , included_subreddits=None, topN=500): +def author_cosine_similarities_weekly(outfile, min_df=2, max_df=None, included_subreddits=None, topN=500): return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', outfile, 'author', min_df, + max_df, included_subreddits, topN) -def term_cosine_similarities_weekly(outfile, min_df=None, included_subreddits=None, topN=500): - return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', - outfile, - 'term', - min_df, - included_subreddits, - topN) +def term_cosine_similarities_weekly(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500): + return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', + outfile, + 'term', + min_df, + max_df, + included_subreddits, + topN) if __name__ == "__main__": fire.Fire({'authors':author_cosine_similarities_weekly, From e1c9d9af6fccf3f2de24d192f9678318ad04a4ea Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Mon, 3 May 2021 10:37:09 -0700 Subject: [PATCH 04/22] Remove 'exclude phrases' parameter. 
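
The flag was used to drop terms containing "_" before computing similarities,
and the pyarrow-based reindex_tfidf no longer applies it. If phrase terms still
need to be excluded, one option is to pre-filter the tf-idf parquet; a minimal
sketch, assuming a 'term' column and placeholder paths (the helper name is not
part of this change):

import pandas as pd

def drop_phrase_terms(inpath, outpath):
    df = pd.read_parquet(inpath)
    # treat terms containing "_" as phrases, matching the old exclude_phrases filter
    df = df[~df['term'].str.contains('_')]
    df.to_parquet(outpath)
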
--- similarities/cosine_similarities.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/similarities/cosine_similarities.py b/similarities/cosine_similarities.py index 0c9c986..8b85692 100644 --- a/similarities/cosine_similarities.py +++ b/similarities/cosine_similarities.py @@ -4,9 +4,9 @@ from pathlib import Path from similarities_helper import similarities, column_similarities from functools import partial -def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'): +def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): - return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) + return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) # change so that these take in an input as an optional argument (for speed, but also for idf). def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): @@ -18,7 +18,6 @@ def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subredd max_df, included_subreddits, topN, - exclude_phrases, from_date, to_date ) @@ -31,7 +30,6 @@ def author_cosine_similarities(outfile, min_df=2, max_df=None, included_subreddi max_df, included_subreddits, topN, - exclude_phrases=False, from_date=from_date, to_date=to_date ) @@ -44,7 +42,6 @@ def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=N max_df, included_subreddits, topN, - exclude_phrases=False, from_date=from_date, to_date=to_date, tfidf_colname='relative_tf' From 8d1df5b26ee80fee639e5b3ecd057fe8e72f166c Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Mon, 3 May 2021 11:28:48 -0700 Subject: [PATCH 05/22] refactor clustering.py into method-specific files. 
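
Each clustering method now lives in its own module: affinity propagation in
affinity_clustering.py (renamed from select_affinity.py), k-means in
kmeans_clustering.py (renamed from select_kmeans.py), and HDBSCAN in
hdbscan_clustering.py, with the k-means helpers moved out of clustering.py.
A rough usage sketch of the relocated k-means entry point; the module import,
paths, and cluster count below are placeholders, not part of this change:

from kmeans_clustering import kmeans_clustering

cluster_data = kmeans_clustering("path/to/subreddit_similarities.feather",
                                 None,          # output slot of _kmeans_clustering, unused in this version
                                 n_clusters=30)
cluster_data.to_feather("path/to/kmeans_30_clusters.feather")
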
--- ...ect_affinity.py => affinity_clustering.py} | 78 +++++++++---------- clustering/clustering.py | 20 +---- clustering/hdbscan_clustering.py | 13 ++-- ...{select_kmeans.py => kmeans_clustering.py} | 33 +++++--- 4 files changed, 69 insertions(+), 75 deletions(-) rename clustering/{select_affinity.py => affinity_clustering.py} (63%) rename clustering/{select_kmeans.py => kmeans_clustering.py} (77%) diff --git a/clustering/select_affinity.py b/clustering/affinity_clustering.py similarity index 63% rename from clustering/select_affinity.py rename to clustering/affinity_clustering.py index b8bd13a..287f7e2 100644 --- a/clustering/select_affinity.py +++ b/clustering/affinity_clustering.py @@ -18,7 +18,44 @@ class affinity_clustering_result(clustering_result): convergence_iter:int preference_quantile:float -def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): +def affinity_clustering(similarities, output, *args, **kwargs): + subreddits, mat = read_similarity_mat(similarities) + clustering = _affinity_clustering(mat, *args, **kwargs) + cluster_data = process_clustering_result(clustering, subreddits) + cluster_data['algorithm'] = 'affinity' + return(cluster_data) + +def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True): + ''' + similarities: matrix of similarity scores + preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits. + damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits by author. 
+ ''' + print(f"damping:{damping}; convergenceIter:{convergence_iter}; preferenceQuantile:{preference_quantile}") + + preference = np.quantile(mat,preference_quantile) + + print(f"preference is {preference}") + print("data loaded") + sys.stdout.flush() + clustering = AffinityPropagation(damping=damping, + max_iter=max_iter, + convergence_iter=convergence_iter, + copy=False, + preference=preference, + affinity='precomputed', + verbose=verbose, + random_state=random_state).fit(mat) + + cluster_data = process_clustering_result(clustering, subreddits) + output = Path(output) + output.parent.mkdir(parents=True,exist_ok=True) + cluster_data.to_feather(output) + print(f"saved {output}") + return clustering + + +def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): if name is None: name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}" print(name) @@ -53,41 +90,6 @@ def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, return res -def do_affinity_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): - if name is None: - name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}" - print(name) - sys.stdout.flush() - outpath = outdir / (str(name) + ".feather") - outpath.parent.mkdir(parents=True,exist_ok=True) - print(outpath) - clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose) - mat = sim_to_dist(clustering.affinity_matrix_) - - try: - score = silhouette_score(mat, clustering.labels_, metric='precomputed') - except ValueError: - score = None - - if alt_mat is not None: - alt_distances = sim_to_dist(alt_mat) - try: - alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed') - except ValueError: - alt_score = None - - res = clustering_result(outpath=outpath, - damping=damping, - max_iter=max_iter, - convergence_iter=convergence_iter, - preference_quantile=preference_quantile, - silhouette_score=score, - alt_silhouette_score=score, - name=str(name)) - - return res - - # alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering). 
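
# A minimal sketch of that check, assuming `clustering` is a fitted
# AffinityPropagation model and `alt_mat` is an alternative similarity matrix
# aligned to the same subreddits; both names are placeholders.
from sklearn.metrics import silhouette_score

def alt_silhouette(clustering, alt_mat):
    alt_distances = sim_to_dist(alt_mat)   # 1 - similarity, as used above
    try:
        return silhouette_score(alt_distances, clustering.labels_, metric='precomputed')
    except ValueError:                     # e.g. everything lands in a single cluster
        return None
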
def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None): @@ -116,7 +118,7 @@ def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max hyper_grid = product(damping, convergence_iter, preference_quantile) hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid)) - _do_clustering = partial(do_affinity_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat) + _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat) # similarities = Array('d', mat) # call pool.starmap @@ -124,8 +126,6 @@ def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max clustering_data = pool.starmap(_do_clustering, hyper_grid) clustering_data = pd.DataFrame(list(clustering_data)) clustering_data.to_csv(outinfo) - - return clustering_data if __name__ == "__main__": diff --git a/clustering/clustering.py b/clustering/clustering.py index 85be3fe..6ee7842 100755 --- a/clustering/clustering.py +++ b/clustering/clustering.py @@ -3,7 +3,7 @@ import sys import pandas as pd import numpy as np -from sklearn.cluster import AffinityPropagation, KMeans +from sklearn.cluster import AffinityPropagation import fire from pathlib import Path from multiprocessing import cpu_count @@ -46,24 +46,6 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, print(f"saved {output}") return clustering -def kmeans_clustering(similarities, *args, **kwargs): - subreddits, mat = read_similarity_mat(similarities) - mat = sim_to_dist(mat) - clustering = _kmeans_clustering(mat, *args, **kwargs) - cluster_data = process_clustering_result(clustering, subreddits) - return(cluster_data) - -def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True): - - clustering = KMeans(n_clusters=n_clusters, - n_init=n_init, - max_iter=max_iter, - random_state=random_state, - verbose=verbose - ).fit(mat) - - return clustering - if __name__ == "__main__": diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py index 888554a..4f4e0d6 100644 --- a/clustering/hdbscan_clustering.py +++ b/clustering/hdbscan_clustering.py @@ -28,6 +28,13 @@ def test_select_hdbscan_clustering(): cluster_selection_methods=['eom']; lsi_dimensions='all' + df = pd.read_csv("test_hdbscan/selection_data.csv") + test_select_hdbscan_clustering() + check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") + silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") + c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) + + @dataclass class hdbscan_clustering_result(clustering_result): min_cluster_size:int @@ -165,8 +172,4 @@ def make_KNN_plots(): KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') if __name__ == "__main__": - df = pd.read_csv("test_hdbscan/selection_data.csv") - test_select_hdbscan_clustering() - check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") - silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") - c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) + fire.Fire(select_hdbscan_clustering) diff --git 
a/clustering/select_kmeans.py b/clustering/kmeans_clustering.py similarity index 77% rename from clustering/select_kmeans.py rename to clustering/kmeans_clustering.py index b07a108..8822e9f 100644 --- a/clustering/select_kmeans.py +++ b/clustering/kmeans_clustering.py @@ -1,23 +1,32 @@ -from sklearn.metrics import silhouette_score -from sklearn.cluster import AffinityPropagation -from functools import partial -from clustering import _kmeans_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result -from dataclasses import dataclass -from multiprocessing import Pool, cpu_count, Array, Process -from pathlib import Path -from itertools import product, starmap -import numpy as np -import pandas as pd +from sklearn.cluster import KMeans import fire -import sys +from pathlib import Path +from multiprocessing import cpu_count +from dataclasses import dataclass +from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat @dataclass class kmeans_clustering_result(clustering_result): n_clusters:int n_init:int +def kmeans_clustering(similarities, *args, **kwargs): + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + clustering = _kmeans_clustering(mat, *args, **kwargs) + cluster_data = process_clustering_result(clustering, subreddits) + return(cluster_data) -# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying. +def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True): + + clustering = KMeans(n_clusters=n_clusters, + n_init=n_init, + max_iter=max_iter, + random_state=random_state, + verbose=verbose + ).fit(mat) + + return clustering def do_clustering(n_clusters, n_init, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): if name is None: From f05cb962e0388feaf38aaf84f222696ab8f5f398 Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Fri, 7 May 2021 22:33:26 -0700 Subject: [PATCH 06/22] refactor clustring in object oriented style --- clustering/affinity_clustering.py | 245 +++++++++++++---------- clustering/clustering_base.py | 149 ++++++++++++-- clustering/hdbscan_clustering.py | 319 +++++++++++++++++++++--------- clustering/kmeans_clustering.py | 185 ++++++++++------- 4 files changed, 612 insertions(+), 286 deletions(-) diff --git a/clustering/affinity_clustering.py b/clustering/affinity_clustering.py index 287f7e2..b4f8461 100644 --- a/clustering/affinity_clustering.py +++ b/clustering/affinity_clustering.py @@ -2,7 +2,8 @@ from sklearn.metrics import silhouette_score from sklearn.cluster import AffinityPropagation from functools import partial from dataclasses import dataclass -from clustering import _affinity_clustering, read_similarity_mat, sim_to_dist, process_clustering_result, clustering_result +from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat +from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep from multiprocessing import Pool, cpu_count, Array, Process from pathlib import Path from itertools import product, starmap @@ -17,116 +18,158 @@ class affinity_clustering_result(clustering_result): damping:float convergence_iter:int preference_quantile:float + preference:float + max_iter:int -def affinity_clustering(similarities, output, *args, **kwargs): - subreddits, mat = read_similarity_mat(similarities) - clustering = 
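
Clustering is reorganized around a clustering_job class that loads one
similarity matrix, runs a single parameterization, writes the cluster
assignments and per-subreddit silhouette samples, and reports a
clustering_result; grid_sweep enumerates hyperparameter combinations and runs
the jobs in a multiprocessing pool, and the *_lsi_* variants carry the LSI
dimensionality through lsi_mixin. A usage sketch modeled on the HDBSCAN test
helper further down; the module import and paths are placeholders:

from hdbscan_clustering import hdbscan_lsi_grid_sweep

gs = hdbscan_lsi_grid_sweep("path/to/comment_authors_LSI/",  # directory of <n_dims>.feather similarity files
                            "all",                           # sweep every LSI dimensionality found there
                            "path/to/hdbscan_sweep/",
                            [2, 3, 4],                       # min_cluster_sizes
                            [1, 2, 3],                       # min_samples
                            [0, 0.1, 0.3, 0.5],              # cluster_selection_epsilons
                            ['eom'])                         # cluster_selection_methods
gs.run(20)
gs.save("path/to/hdbscan_sweep/selection_data.csv")

Each job writes <name>.feather and silhouette_samples-<name>.feather under its
output directory, and save() collects one result row per job into the CSV.
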
_affinity_clustering(mat, *args, **kwargs) - cluster_data = process_clustering_result(clustering, subreddits) - cluster_data['algorithm'] = 'affinity' - return(cluster_data) +@dataclass +class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin): + pass -def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True): - ''' - similarities: matrix of similarity scores - preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits. - damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits by author. - ''' - print(f"damping:{damping}; convergenceIter:{convergence_iter}; preferenceQuantile:{preference_quantile}") +class affinity_job(clustering_job): + def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True): + super().__init__(infile, + outpath, + name, + call=self._affinity_clustering, + preference_quantile=preference_quantile, + damping=damping, + max_iter=max_iter, + convergence_iter=convergence_iter, + random_state=1968, + verbose=verbose) + self.damping=damping + self.max_iter=max_iter + self.convergence_iter=convergence_iter + self.preference_quantile=preference_quantile - preference = np.quantile(mat,preference_quantile) + def _affinity_clustering(self, mat, preference_quantile, *args, **kwargs): + mat = 1-mat + preference = np.quantile(mat, preference_quantile) + self.preference = preference + print(f"preference is {preference}") + print("data loaded") + sys.stdout.flush() + clustering = AffinityPropagation(*args, + preference=preference, + affinity='precomputed', + copy=False, + **kwargs).fit(mat) + return clustering - print(f"preference is {preference}") - print("data loaded") - sys.stdout.flush() - clustering = AffinityPropagation(damping=damping, - max_iter=max_iter, - convergence_iter=convergence_iter, - copy=False, - preference=preference, - affinity='precomputed', - verbose=verbose, - random_state=random_state).fit(mat) + def get_info(self): + result = super().get_info() + self.result=affinity_clustering_result(**result.__dict__, + damping=self.damping, + max_iter=self.max_iter, + convergence_iter=self.convergence_iter, + preference_quantile=self.preference_quantile, + preference=self.preference) - cluster_data = process_clustering_result(clustering, subreddits) - output = Path(output) - output.parent.mkdir(parents=True,exist_ok=True) - cluster_data.to_feather(output) - print(f"saved {output}") - return clustering + return self.result +class affinity_lsi_job(affinity_job, lsi_mixin): + def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): + super().__init__(infile, + outpath, + name, + *args, + **kwargs) + super().set_lsi_dims(lsi_dims) -def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): - if name is None: - name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}" - print(name) - sys.stdout.flush() - outpath = outdir / (str(name) + ".feather") - outpath.parent.mkdir(parents=True,exist_ok=True) - print(outpath) - clustering = _affinity_clustering(mat, outpath, damping, max_iter, convergence_iter, 
preference_quantile, random_state, verbose) - cluster_data = process_clustering_result(clustering, subreddits) - mat = sim_to_dist(clustering.affinity_matrix_) + def get_info(self): + result = super().get_info() + self.result = affinity_clustering_result_lsi(**result.__dict__, + lsi_dimensions=self.lsi_dims) + return self.result - try: - score = silhouette_score(mat, clustering.labels_, metric='precomputed') - except ValueError: - score = None +class affinity_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + *args, + **kwargs): - if alt_mat is not None: - alt_distances = sim_to_dist(alt_mat) - try: - alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed') - except ValueError: - alt_score = None + super().__init__(affinity_job, + _afffinity_grid_sweep, + inpath, + outpath, + self.namer, + *args, + **kwargs) + def namer(self, + damping, + max_iter, + convergence_iter, + preference_quantile): + + return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}" + +class _affinity_lsi_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + lsi_dim, + *args, + **kwargs): + self.lsi_dim = lsi_dim + self.jobtype = affinity_lsi_job + super().__init__(self.jobtype, + inpath, + outpath, + self.namer, + self.lsi_dim, + *args, + **kwargs) + + def namer(self, *args, **kwargs): + s = affinity_grid_sweep.namer(self, *args[1:], **kwargs) + s += f"_lsi-{self.lsi_dim}" + return s + +class affinity_lsi_grid_sweep(lsi_grid_sweep): + def __init__(self, + inpath, + lsi_dims, + outpath, + dampings=[0.9], + max_iters=[10000], + convergence_iters=[30], + preference_quantiles=[0.5]): + + super().__init__(affinity_lsi_job, + _affinity_lsi_grid_sweep, + inpath, + lsi_dims, + outpath, + dampings, + max_iters, + convergence_iters, + preference_quantiles) - res = affinity_clustering_result(outpath=outpath, - damping=damping, - max_iter=max_iter, - convergence_iter=convergence_iter, - preference_quantile=preference_quantile, - silhouette_score=score, - alt_silhouette_score=score, - name=str(name)) + + +def test_select_affinity_clustering(): + # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", + # "test_hdbscan_author30k", + # min_cluster_sizes=[2], + # min_samples=[1,2], + # cluster_selection_epsilons=[0,0.05,0.1,0.15], + # cluster_selection_methods=['eom','leaf'], + # lsi_dimensions='all') + inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/" + outpath = "test_affinity"; + dampings=[0.8,0.9] + max_iters=[100000] + convergence_iters=[15] + preference_quantiles=[0.5,0.7] + + gs = affinity_lsi_grid_sweep(inpath, 'all', outpath, dampings, max_iters, convergence_iters, preference_quantiles) + gs.run(20) + gs.save("test_affinity/lsi_sweep.csv") - return res - -# alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering). 
- -def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None): - - damping = list(map(float,damping)) - convergence_iter = convergence_iter = list(map(int,convergence_iter)) - preference_quantile = list(map(float,preference_quantile)) - - if type(outdir) is str: - outdir = Path(outdir) - - outdir.mkdir(parents=True,exist_ok=True) - - subreddits, mat = read_similarity_mat(similarities,use_threads=True) - - if alt_similarities is not None: - alt_mat = read_similarity_mat(alt_similarities,use_threads=True) - else: - alt_mat = None - - if J is None: - J = cpu_count() - pool = Pool(J) - - # get list of tuples: the combinations of hyperparameters - hyper_grid = product(damping, convergence_iter, preference_quantile) - hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid)) - - _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat) - - # similarities = Array('d', mat) - # call pool.starmap - print("running clustering selection") - clustering_data = pool.starmap(_do_clustering, hyper_grid) - clustering_data = pd.DataFrame(list(clustering_data)) - clustering_data.to_csv(outinfo) - return clustering_data if __name__ == "__main__": - x = fire.Fire(select_affinity_clustering) + fire.Fire{'grid_sweep':affinity_grid_sweep, + 'grid_sweep_lsi':affinity_lsi_grid_sweep + 'cluster':affinity_job, + 'cluster_lsi':affinity_lsi_job} diff --git a/clustering/clustering_base.py b/clustering/clustering_base.py index 1d86438..5492415 100644 --- a/clustering/clustering_base.py +++ b/clustering/clustering_base.py @@ -2,6 +2,9 @@ from pathlib import Path import numpy as np import pandas as pd from dataclasses import dataclass +from sklearn.metrics import silhouette_score, silhouette_samples +from itertools import product, chain +from multiprocessing import Pool, cpu_count def sim_to_dist(mat): dist = 1-mat @@ -9,41 +12,147 @@ def sim_to_dist(mat): np.fill_diagonal(dist,0) return dist -def process_clustering_result(clustering, subreddits): +class grid_sweep: + def __init__(self, jobtype, inpath, outpath, namer, *args): + self.jobtype = jobtype + self.namer = namer + grid = list(product(*args)) + inpath = Path(inpath) + outpath = Path(outpath) + self.hasrun = False + self.grid = [(inpath,outpath,namer(*g)) + g for g in grid] + self.jobs = [jobtype(*g) for g in self.grid] - if hasattr(clustering,'n_iter_'): - print(f"clustering took {clustering.n_iter_} iterations") + def run(self, cores=20): + if cores is not None and cores > 1: + with Pool(cores) as pool: + infos = pool.map(self.jobtype.get_info, self.jobs) + else: + infos = map(self.jobtype.get_info, self.jobs) - clusters = clustering.labels_ + self.infos = pd.DataFrame(infos) + self.hasrun = True - print(f"found {len(set(clusters))} clusters") + def save(self, outcsv): + if not self.hasrun: + self.run() + outcsv = Path(outcsv) + outcsv.parent.mkdir(parents=True, exist_ok=True) + self.infos.to_csv(outcsv) - cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_}) - cluster_sizes = cluster_data.groupby("cluster").count().reset_index() - print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members") +class lsi_grid_sweep(grid_sweep): + def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs): + self.jobtype = 
jobtype + self.subsweep = subsweep + inpath = Path(inpath) + if lsi_dimensions == 'all': + lsi_paths = list(inpath.glob("*")) + else: + lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] - print(f"the median cluster has {cluster_sizes.subreddit.median()} members") + lsi_nums = [p.stem for p in lsi_paths] + self.hasrun = False + self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] + self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) - print(f"{(cluster_sizes.subreddit==1).sum()} clusters have 1 member") - print(f"{(cluster_sizes.loc[cluster_sizes.cluster==-1,['subreddit']])} subreddits are in cluster -1",flush=True) +# this is meant to be an interface, not created directly +class clustering_job: + def __init__(self, infile, outpath, name, call, *args, **kwargs): + self.outpath = Path(outpath) + self.call = call + self.args = args + self.kwargs = kwargs + self.infile = Path(infile) + self.name = name + self.hasrun = False - return cluster_data + def run(self): + self.subreddits, self.mat = self.read_distance_mat(self.infile) + self.clustering = self.call(self.mat, *self.args, **self.kwargs) + self.cluster_data = self.process_clustering(self.clustering, self.subreddits) + self.score = self.silhouette() + self.outpath.mkdir(parents=True, exist_ok=True) + self.cluster_data.to_feather(self.outpath/(self.name + ".feather")) + self.hasrun = True + + def get_info(self): + if not self.hasrun: + self.run() + self.result = clustering_result(outpath=str(self.outpath.resolve()), + silhouette_score=self.score, + name=self.name, + n_clusters=self.n_clusters, + n_isolates=self.n_isolates, + silhouette_samples = str(self.silsampout.resolve()) + ) + return self.result + + def silhouette(self): + isolates = self.clustering.labels_ == -1 + scoremat = self.mat[~isolates][:,~isolates] + score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed') + silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed') + silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp}) + self.outpath.mkdir(parents=True, exist_ok=True) + self.silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather") + silhouette_samp.to_feather(self.silsampout) + return score + + def read_distance_mat(self, similarities, use_threads=True): + df = pd.read_feather(similarities, use_threads=use_threads) + mat = np.array(df.drop('_subreddit',1)) + n = mat.shape[0] + mat[range(n),range(n)] = 1 + return (df._subreddit,1-mat) + + def process_clustering(self, clustering, subreddits): + + if hasattr(clustering,'n_iter_'): + print(f"clustering took {clustering.n_iter_} iterations") + + clusters = clustering.labels_ + self.n_clusters = len(set(clusters)) + + print(f"found {self.n_clusters} clusters") + + cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_}) + + cluster_sizes = cluster_data.groupby("cluster").count().reset_index() + print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members") + + print(f"the median cluster has {cluster_sizes.subreddit.median()} members") + n_isolates1 = (cluster_sizes.subreddit==1).sum() + + print(f"{n_isolates1} clusters have 1 member") + + n_isolates2 = (cluster_sizes.loc[cluster_sizes.cluster==-1,['subreddit']]) + + print(f"{n_isolates2} subreddits are in cluster -1",flush=True) + + if n_isolates1 == 0: + self.n_isolates = n_isolates2 + 
else: + self.n_isolates = n_isolates1 + + return cluster_data + + +class lsi_mixin(): + def set_lsi_dims(self, lsi_dims): + self.lsi_dims = lsi_dims @dataclass class clustering_result: outpath:Path - max_iter:int silhouette_score:float - alt_silhouette_score:float name:str n_clusters:int + n_isolates:int + silhouette_samples:str -def read_similarity_mat(similarities, use_threads=True): - df = pd.read_feather(similarities, use_threads=use_threads) - mat = np.array(df.drop('_subreddit',1)) - n = mat.shape[0] - mat[range(n),range(n)] = 1 - return (df._subreddit,mat) +@dataclass +class lsi_result_mixin: + lsi_dimensions:int diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py index 4f4e0d6..f0ee703 100644 --- a/clustering/hdbscan_clustering.py +++ b/clustering/hdbscan_clustering.py @@ -1,10 +1,11 @@ from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat +from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep from dataclasses import dataclass import hdbscan from sklearn.neighbors import NearestNeighbors import plotnine as pn import numpy as np -from itertools import product, starmap +from itertools import product, starmap, chain import pandas as pd from sklearn.metrics import silhouette_score, silhouette_samples from pathlib import Path @@ -13,27 +14,88 @@ import fire from pyarrow.feather import write_feather def test_select_hdbscan_clustering(): - select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", - "test_hdbscan_author30k", - min_cluster_sizes=[2], - min_samples=[1,2], - cluster_selection_epsilons=[0,0.05,0.1,0.15], - cluster_selection_methods=['eom','leaf'], - lsi_dimensions='all') - inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI" + # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", + # "test_hdbscan_author30k", + # min_cluster_sizes=[2], + # min_samples=[1,2], + # cluster_selection_epsilons=[0,0.05,0.1,0.15], + # cluster_selection_methods=['eom','leaf'], + # lsi_dimensions='all') + inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/" outpath = "test_hdbscan"; min_cluster_sizes=[2,3,4]; min_samples=[1,2,3]; cluster_selection_epsilons=[0,0.1,0.3,0.5]; cluster_selection_methods=['eom']; lsi_dimensions='all' + gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods) + gs.run(20) + gs.save("test_hdbscan/lsi_sweep.csv") + # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom') + # job1.run() + # print(job1.get_info()) - df = pd.read_csv("test_hdbscan/selection_data.csv") - test_select_hdbscan_clustering() - check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") - silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") - c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) + # df = pd.read_csv("test_hdbscan/selection_data.csv") + # test_select_hdbscan_clustering() + # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") + # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") + # c = check_clusters.merge(silscores,on='subreddit')# 
fire.Fire(select_hdbscan_clustering) +class hdbscan_lsi_grid_sweep(lsi_grid_sweep): + def __init__(self, + inpath, + lsi_dims, + outpath, + min_cluster_sizes, + min_samples, + cluster_selection_epsilons, + cluster_selection_methods + ): + + super().__init__(hdbscan_lsi_job, + _hdbscan_lsi_grid_sweep, + inpath, + lsi_dims, + outpath, + min_cluster_sizes, + min_samples, + cluster_selection_epsilons, + cluster_selection_methods) + +class hdbscan_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + *args, + **kwargs): + + super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs) + + def namer(self, + min_cluster_size, + min_samples, + cluster_selection_epsilon, + cluster_selection_method): + return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}" + + +class _hdbscan_lsi_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + lsi_dim, + *args, + **kwargs): + + self.lsi_dim = lsi_dim + self.jobtype = hdbscan_lsi_job + super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) + + + def namer(self, *args, **kwargs): + s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs) + s += f"_lsi-{self.lsi_dim}" + return s @dataclass class hdbscan_clustering_result(clustering_result): @@ -41,107 +103,166 @@ class hdbscan_clustering_result(clustering_result): min_samples:int cluster_selection_epsilon:float cluster_selection_method:str - lsi_dimensions:int - n_isolates:int - silhouette_samples:str -def select_hdbscan_clustering(inpath, - outpath, - outfile=None, - min_cluster_sizes=[2], - min_samples=[1], - cluster_selection_epsilons=[0], - cluster_selection_methods=['eom'], - lsi_dimensions='all' - ): +@dataclass +class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin): + pass - inpath = Path(inpath) - outpath = Path(outpath) - outpath.mkdir(exist_ok=True, parents=True) +class hdbscan_job(clustering_job): + def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): + super().__init__(infile, + outpath, + name, + call=hdbscan_job._hdbscan_clustering, + min_cluster_size=min_cluster_size, + min_samples=min_samples, + cluster_selection_epsilon=cluster_selection_epsilon, + cluster_selection_method=cluster_selection_method + ) + + self.min_cluster_size = min_cluster_size + self.min_samples = min_samples + self.cluster_selection_epsilon = cluster_selection_epsilon + self.cluster_selection_method = cluster_selection_method +# self.mat = 1 - self.mat + + def _hdbscan_clustering(mat, *args, **kwargs): + print(f"running hdbscan clustering. args:{args}. 
kwargs:{kwargs}") + print(mat) + clusterer = hdbscan.HDBSCAN(metric='precomputed', + core_dist_n_jobs=cpu_count(), + *args, + **kwargs, + ) - if lsi_dimensions == 'all': - lsi_paths = list(inpath.glob("*")) + clustering = clusterer.fit(mat.astype('double')) + + return(clustering) - else: - lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] + def get_info(self): + result = super().get_info() + self.result = hdbscan_clustering_result(**result.__dict__, + min_cluster_size=self.min_cluster_size, + min_samples=self.min_samples, + cluster_selection_epsilon=self.cluster_selection_epsilon, + cluster_selection_method=self.cluster_selection_method) + return self.result - lsi_nums = [p.stem for p in lsi_paths] - grid = list(product(lsi_nums, - min_cluster_sizes, - min_samples, - cluster_selection_epsilons, - cluster_selection_methods)) +class hdbscan_lsi_job(hdbscan_job, lsi_mixin): + def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): + super().__init__( + infile, + outpath, + name, + *args, + **kwargs) + super().set_lsi_dims(lsi_dims) - # fix the output file names - names = list(map(lambda t:'_'.join(map(str,t)),grid)) + def get_info(self): + partial_result = super().get_info() + self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__, + lsi_dimensions=self.lsi_dims) + return self.result - grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)] +# def select_hdbscan_clustering(inpath, +# outpath, +# outfile=None, +# min_cluster_sizes=[2], +# min_samples=[1], +# cluster_selection_epsilons=[0], +# cluster_selection_methods=['eom'], +# lsi_dimensions='all' +# ): + +# inpath = Path(inpath) +# outpath = Path(outpath) +# outpath.mkdir(exist_ok=True, parents=True) + +# if lsi_dimensions is None: +# lsi_paths = [inpath] +# elif lsi_dimensions == 'all': +# lsi_paths = list(inpath.glob("*")) + +# else: +# lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] + +# if lsi_dimensions is not None: +# lsi_nums = [p.stem for p in lsi_paths] +# else: +# lsi_nums = [None] +# grid = list(product(lsi_nums, +# min_cluster_sizes, +# min_samples, +# cluster_selection_epsilons, +# cluster_selection_methods)) + +# # fix the output file names +# names = list(map(lambda t:'_'.join(map(str,t)),grid)) + +# grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)] - with Pool(int(cpu_count()/4)) as pool: - mods = starmap(hdbscan_clustering, grid) +# with Pool(int(cpu_count()/4)) as pool: +# mods = starmap(hdbscan_clustering, grid) - res = pd.DataFrame(mods) - if outfile is None: - outfile = outpath / "selection_data.csv" +# res = pd.DataFrame(mods) +# if outfile is None: +# outfile = outpath / "selection_data.csv" - res.to_csv(outfile) +# res.to_csv(outfile) -def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): - subreddits, mat = read_similarity_mat(similarities) - mat = sim_to_dist(mat) - clustering = _hdbscan_clustering(mat, - min_cluster_size=min_cluster_size, - min_samples=min_samples, - cluster_selection_epsilon=cluster_selection_epsilon, - cluster_selection_method=cluster_selection_method, - metric='precomputed', - core_dist_n_jobs=cpu_count() - ) +# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): +# subreddits, mat = 
read_similarity_mat(similarities) +# mat = sim_to_dist(mat) +# clustering = _hdbscan_clustering(mat, +# min_cluster_size=min_cluster_size, +# min_samples=min_samples, +# cluster_selection_epsilon=cluster_selection_epsilon, +# cluster_selection_method=cluster_selection_method, +# metric='precomputed', +# core_dist_n_jobs=cpu_count() +# ) - cluster_data = process_clustering_result(clustering, subreddits) - isolates = clustering.labels_ == -1 - scoremat = mat[~isolates][:,~isolates] - score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed') - cluster_data.to_feather(output) +# cluster_data = process_clustering_result(clustering, subreddits) +# isolates = clustering.labels_ == -1 +# scoremat = mat[~isolates][:,~isolates] +# score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed') +# cluster_data.to_feather(output) +# silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed') +# silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp}) +# silsampout = output.parent / ("silhouette_samples" + output.name) +# silhouette_samp.to_feather(silsampout) - silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed') - silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp}) - silsampout = output.parent / ("silhouette_samples" + output.name) - silhouette_samp.to_feather(silsampout) - - result = hdbscan_clustering_result(outpath=output, - max_iter=None, - silhouette_samples=silsampout, - silhouette_score=score, - alt_silhouette_score=score, - name=name, - min_cluster_size=min_cluster_size, - min_samples=min_samples, - cluster_selection_epsilon=cluster_selection_epsilon, - cluster_selection_method=cluster_selection_method, - lsi_dimensions=lsi_dim, - n_isolates=isolates.sum(), - n_clusters=len(set(clustering.labels_)) - ) +# result = hdbscan_clustering_result(outpath=output, +# silhouette_samples=silsampout, +# silhouette_score=score, +# name=name, +# min_cluster_size=min_cluster_size, +# min_samples=min_samples, +# cluster_selection_epsilon=cluster_selection_epsilon, +# cluster_selection_method=cluster_selection_method, +# lsi_dimensions=lsi_dim, +# n_isolates=isolates.sum(), +# n_clusters=len(set(clustering.labels_)) +# ) - return(result) +# return(result) -# for all runs we should try cluster_selection_epsilon = None -# for terms we should try cluster_selection_epsilon around 0.56-0.66 -# for authors we should try cluster_selection_epsilon around 0.98-0.99 -def _hdbscan_clustering(mat, *args, **kwargs): - print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}") +# # for all runs we should try cluster_selection_epsilon = None +# # for terms we should try cluster_selection_epsilon around 0.56-0.66 +# # for authors we should try cluster_selection_epsilon around 0.98-0.99 +# def _hdbscan_clustering(mat, *args, **kwargs): +# print(f"running hdbscan clustering. args:{args}. 
kwargs:{kwargs}") - print(mat) - clusterer = hdbscan.HDBSCAN(*args, - **kwargs, - ) +# print(mat) +# clusterer = hdbscan.HDBSCAN(*args, +# **kwargs, +# ) - clustering = clusterer.fit(mat.astype('double')) +# clustering = clusterer.fit(mat.astype('double')) - return(clustering) +# return(clustering) def KNN_distances_plot(mat,outname,k=2): nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) @@ -172,4 +293,10 @@ def make_KNN_plots(): KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') if __name__ == "__main__": - fire.Fire(select_hdbscan_clustering) + fire.Fire{'grid_sweep':hdbscan_grid_sweep, + 'grid_sweep_lsi':hdbscan_lsi_grid_sweep + 'cluster':hdbscan_job, + 'cluster_lsi':hdbscan_lsi_job} + +# test_select_hdbscan_clustering() + #fire.Fire(select_hdbscan_clustering) diff --git a/clustering/kmeans_clustering.py b/clustering/kmeans_clustering.py index 8822e9f..e41b88b 100644 --- a/clustering/kmeans_clustering.py +++ b/clustering/kmeans_clustering.py @@ -4,98 +4,145 @@ from pathlib import Path from multiprocessing import cpu_count from dataclasses import dataclass from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat +from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep + @dataclass class kmeans_clustering_result(clustering_result): n_clusters:int n_init:int + max_iter:int -def kmeans_clustering(similarities, *args, **kwargs): - subreddits, mat = read_similarity_mat(similarities) - mat = sim_to_dist(mat) - clustering = _kmeans_clustering(mat, *args, **kwargs) - cluster_data = process_clustering_result(clustering, subreddits) - return(cluster_data) +@dataclass +class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin): + pass -def _kmeans_clustering(mat, output, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True): +class kmeans_job(clustering_job): + def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True): + super().__init__(infile, + outpath, + name, + call=kmeans_job._kmeans_clustering, + n_clusters=n_clusters, + n_init=n_init, + max_iter=max_iter, + random_state=random_state, + verbose=verbose) - clustering = KMeans(n_clusters=n_clusters, - n_init=n_init, - max_iter=max_iter, - random_state=random_state, - verbose=verbose - ).fit(mat) + self.n_clusters=n_clusters + self.n_init=n_init + self.max_iter=max_iter - return clustering + def _kmeans_clustering(mat, *args, **kwargs): -def do_clustering(n_clusters, n_init, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False): - if name is None: - name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}" - print(name) - sys.stdout.flush() - outpath = outdir / (str(name) + ".feather") - print(outpath) - mat = sim_to_dist(mat) - clustering = _kmeans_clustering(mat, outpath, n_clusters, n_init, max_iter, random_state, verbose) + clustering = KMeans(*args, + **kwargs, + ).fit(mat) - outpath.parent.mkdir(parents=True,exist_ok=True) - cluster_data.to_feather(outpath) - cluster_data = process_clustering_result(clustering, subreddits) + return clustering - try: - score = silhouette_score(mat, clustering.labels_, metric='precomputed') - except ValueError: - score = None - if alt_mat is not None: - alt_distances = sim_to_dist(alt_mat) - try: - alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed') - except 
ValueError: - alt_score = None + def get_info(self): + result = super().get_info() + self.result = kmeans_clustering_result(**result.__dict__, + n_init=n_init, + max_iter=max_iter) + return self.result + + +class kmeans_lsi_job(kmeans_job, lsi_mixin): + def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): + super().__init__(infile, + outpath, + name, + *args, + **kwargs) + super().set_lsi_dims(lsi_dims) + + def get_info(self): + result = super().get_info() + self.result = kmeans_clustering_result_lsi(**result.__dict__, + lsi_dimensions=self.lsi_dims) + return self.result - res = kmeans_clustering_result(outpath=outpath, - max_iter=max_iter, - n_clusters=n_clusters, - n_init = n_init, - silhouette_score=score, - alt_silhouette_score=score, - name=str(name)) - return res +class kmeans_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + *args, + **kwargs): + super().__init__(kmeans_job, inpath, outpath, self.namer, *args, **kwargs) + def namer(self, + n_clusters, + n_init, + max_iter): + return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}" -# alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering). -def select_kmeans_clustering(similarities, outdir, outinfo, n_clusters=[1000], max_iter=100000, n_init=10, random_state=1968, verbose=True, alt_similarities=None): +class _kmeans_lsi_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + lsi_dim, + *args, + **kwargs): + self.lsi_dim = lsi_dim + self.jobtype = kmeans_lsi_job + super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) - n_clusters = list(map(int,n_clusters)) - n_init = list(map(int,n_init)) + def namer(self, *args, **kwargs): + s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs) + s += f"_lsi-{self.lsi_dim}" + return s - if type(outdir) is str: - outdir = Path(outdir) +class kmeans_lsi_grid_sweep(lsi_grid_sweep): + def __init__(self, + inpath, + lsi_dims, + outpath, + n_clusters, + n_inits, + max_iters + ): - outdir.mkdir(parents=True,exist_ok=True) + super().__init__(kmeans_lsi_job, + _kmeans_lsi_grid_sweep, + inpath, + lsi_dims, + outpath, + n_clusters, + n_inits, + max_iters) - subreddits, mat = read_similarity_mat(similarities,use_threads=True) +def test_select_kmeans_clustering(): + # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", + # "test_hdbscan_author30k", + # min_cluster_sizes=[2], + # min_samples=[1,2], + # cluster_selection_epsilons=[0,0.05,0.1,0.15], + # cluster_selection_methods=['eom','leaf'], + # lsi_dimensions='all') + inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/" + outpath = "test_kmeans"; + n_clusters=[200,300,400]; + n_init=[1,2,3]; + max_iter=[100000] - if alt_similarities is not None: - alt_mat = read_similarity_mat(alt_similarities,use_threads=True) - else: - alt_mat = None + gs = kmeans_lsi_grid_sweep(inpath, 'all', outpath, n_clusters, n_init, max_iter) + gs.run(1) - # get list of tuples: the combinations of hyperparameters - hyper_grid = product(n_clusters, n_init) - hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid)) + cluster_selection_epsilons=[0,0.1,0.3,0.5]; + cluster_selection_methods=['eom']; + lsi_dimensions='all' + gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods) + gs.run(20) + 
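# The plain (non-LSI) kmeans sweep follows the same construct/run/save pattern;
# a minimal sketch with a placeholder input path. Output file names come from
# kmeans_grid_sweep.namer, e.g. "nclusters-200_nit-1_maxit-100000.feather".
# gs = kmeans_grid_sweep("/path/to/subreddit_similarities.feather", "test_kmeans",
#                        [200,300,400], [1], [100000])
# gs.run(cores=4)
# gs.save("test_kmeans/selection_data.csv")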
gs.save("test_hdbscan/lsi_sweep.csv") - _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat) - - # call starmap - print("running clustering selection") - clustering_data = starmap(_do_clustering, hyper_grid) - clustering_data = pd.DataFrame(list(clustering_data)) - clustering_data.to_csv(outinfo) - - return clustering_data if __name__ == "__main__": - x = fire.Fire(select_kmeans_clustering) + + fire.Fire{'grid_sweep':kmeans_grid_sweep, + 'grid_sweep_lsi':kmeans_lsi_grid_sweep + 'cluster':kmeans_job, + 'cluster_lsi':kmeans_lsi_job} From 4cb7eeec80c5a9c8f49339acd378c515e290ed81 Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Mon, 10 May 2021 13:46:49 -0700 Subject: [PATCH 07/22] Refactor to make a decent api. --- clustering/Makefile | 171 +++++++++++++++++++---- clustering/affinity_clustering.py | 98 ++++--------- clustering/affinity_clustering_lsi.py | 99 +++++++++++++ clustering/clustering_base.py | 81 ++--------- clustering/grid_sweep.py | 32 +++++ clustering/hdbscan_clustering.py | 193 ++++---------------------- clustering/hdbscan_clustering_lsi.py | 101 ++++++++++++++ clustering/kmeans_clustering.py | 103 ++++---------- clustering/kmeans_clustering_lsi.py | 93 +++++++++++++ clustering/lsi_base.py | 28 ++++ 10 files changed, 591 insertions(+), 408 deletions(-) create mode 100644 clustering/affinity_clustering_lsi.py create mode 100644 clustering/grid_sweep.py create mode 100644 clustering/hdbscan_clustering_lsi.py create mode 100644 clustering/kmeans_clustering_lsi.py create mode 100644 clustering/lsi_base.py diff --git a/clustering/Makefile b/clustering/Makefile index d09cfd9..7e8cf39 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -2,41 +2,160 @@ srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh similarity_data=/gscratch/comdata/output/reddit_similarity clustering_data=/gscratch/comdata/output/reddit_clustering -kmeans_selection_grid="--max_iter=3000 --n_init=[10] --n_clusters=[100,500,1000,1500,2000,2500,3000,2350,3500,3570,4000]" -#selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]" -all:$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv -# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS -# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS +kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]" +hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf" +affinity_selection_grid="--dampings=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[30]" -$(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py - 
$(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/kmeans $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv $(kmeans_selection_grid) +authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather +authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI +authors_10k_output=$(clustering_data)/subreddit_comment_authors_10k +authors_10k_output_lsi=$(clustering_data)/subreddit_comment_authors_10k_LSI -$(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py - $(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/kmeans $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv $(kmeans_selection_grid) +authors_tf_10k_input=$(similarity_data)/subreddit_comment_authors-tf_10k.feather +authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI +authors_tf_10k_output=$(clustering_data)/subreddit_comment_authors-tf_10k +authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI -$(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather - $(srun_singularity) python3 selection.py kmeans $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv $(kmeans_selection_grid) +terms_10k_input=$(similarity_data)/subreddit_comment_terms_10k.feather +terms_10k_input_lsi=$(similarity_data)/subreddit_comment_terms_10k_LSI +terms_10k_output=$(clustering_data)/subreddit_comment_terms_10k +terms_10k_output_lsi=$(clustering_data)/subreddit_comment_terms_10k_LSI + +all:terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi + +terms_10k:${terms_10k_output}/kmeans/selection_data.csv ${terms_10k_output}/affinity/selection_data.csv ${terms_10k_output}/hdbscan/selection_data.csv + +authors_10k:${authors_10k_output}/kmeans/selection_data.csv ${authors_10k_output}/hdbscan/selection_data.csv ${authors_10k_output}/affinity/selection_data.csv + +authors_tf_10k:${authors_tf_10k_output}/kmeans/selection_data.csv ${authors_tf_10k_output}/hdbscan/selection_data.csv ${authors_tf_10k_output}/affinity/selection_data.csv + +terms_10k_lsi:${terms_10k_output_lsi}/kmeans/selection_data.csv ${terms_10k_output_lsi}/affinity/selection_data.csv ${terms_10k_output_lsi}/hdbscan/selection_data.csv + +authors_10k_lsi:${authors_10k_output_lsi}/kmeans/selection_data.csv ${authors_10k_output_lsi}/hdbscan/selection_data.csv ${authors_10k_output_lsi}/affinity/selection_data.csv + +authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv + +${authors_10k_output}/kmeans/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py kmeans_clustering.py + $(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/kmeans --savefile=${authors_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) + +${terms_10k_output}/kmeans/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py 
kmeans_clustering.py + $(srun_singularity) python3 kmeans_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/kmeans --savefile=${terms_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) + +${authors_tf_10k_output}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py kmeans_clustering.py + $(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/kmeans --savefile=${authors_tf_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) + +${authors_10k_output}/affinity/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py affinity_clustering.py + $(srun_singularity) python3 affinity_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/affinity --savefile=${authors_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) + +${terms_10k_output}/affinity/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py affinity_clustering.py + $(srun_singularity) python3 affinity_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/affinity --savefile=${terms_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) + +${authors_tf_10k_output}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py affinity_clustering.py + $(srun_singularity) python3 affinity_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/affinity --savefile=${authors_tf_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) + +${authors_10k_output}/hdbscan/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py hdbscan_clustering.py + $(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/hdbscan --savefile=${authors_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) + +${terms_10k_output}/hdbscan/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py hdbscan_clustering.py + $(srun_singularity) python3 hdbscan_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/hdbscan --savefile=${terms_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) + +${authors_tf_10k_output}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py hdbscan_clustering.py + $(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/hdbscan --savefile=${authors_tf_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) -affinity_selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]" -$(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py - $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k/affinity $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 +## LSI Models +${authors_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py kmeans_clustering.py + $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/kmeans --savefile=${authors_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) 
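# Example invocations (a sketch; assumes the similarity feather files referenced
# above exist and that the srun_singularity.sh wrapper is available on the cluster):
#   make terms_10k            # kmeans, affinity, and hdbscan selection_data.csv for the term similarities
#   make authors_10k_lsi -j3  # the same three sweeps over the LSI author similarities, run in parallel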
-$(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py - $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k/affinity $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 +${terms_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py kmeans_clustering.py + $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/kmeans --savefile=${terms_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) -$(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather - $(srun_singularity) python3 selection.py affinity $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k/affinity $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv $(affinity_selection_grid) -J 20 +${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py + $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) -clean: - rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/affinity/selection_data.csv - rm -f $(clustering_data)/subreddit_comment_authors_10k/affinity/selection_data.csv - rm -f $(clustering_data)/subreddit_comment_terms_10k/affinity/selection_data.csv - rm -f $(clustering_data)/subreddit_comment_authors-tf_10k/kmeans/selection_data.csv - rm -f $(clustering_data)/subreddit_comment_authors_10k/kmeans/selection_data.csv - rm -f $(clustering_data)/subreddit_comment_terms_10k/kmeans/selection_data.csv +${authors_10k_output_lsi}/affinity/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py affinity_clustering.py + $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/affinity --savefile=${authors_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) -PHONY: clean +${terms_10k_output_lsi}/affinity/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py affinity_clustering.py + $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/affinity --savefile=${terms_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) + +${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py + $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) + +${authors_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py hdbscan_clustering.py + $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/hdbscan --savefile=${authors_10k_output_lsi}/hdbscan/selection_data.csv 
$(hdbscan_selection_grid) + +${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py hdbscan_clustering.py + $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/hdbscan --savefile=${terms_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) + +${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py + $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) + + + +clean_affinity: + rm -f ${authors_10k_output}/affinity/selection_data.csv + rm -f ${authors_tf_10k_output}/affinity/selection_data.csv + rm -f ${terms_10k_output}/affinity/selection_data.csv + +clean_kmeans: + rm -f ${authors_10k_output}/kmeans/selection_data.csv + rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv + rm -f ${terms_10k_output}/kmeans/selection_data.csv + +clean_hdbscan: + rm -f ${authors_10k_output}/hdbscan/selection_data.csv + rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv + rm -f ${terms_10k_output}/hdbscan/selection_data.csv + +clean_authors: + rm -f ${authors_10k_output}/affinity/selection_data.csv + rm -f ${authors_10k_output}/kmeans/selection_data.csv + rm -f ${authors_10k_output}/hdbscan/selection_data.csv + +clean_authors_tf: + rm -f ${authors_tf_10k_output}/affinity/selection_data.csv + rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv + rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv + +clean_terms: + rm -f ${terms_10k_output}/affinity/selection_data.csv + rm -f ${terms_10k_output}/kmeans/selection_data.csv + rm -f ${terms_10k_output}/hdbscan/selection_data.csv + +clean_lsi_affinity: + rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv + rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv + rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv + +clean_lsi_kmeans: + rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv + rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv + rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv + +clean_lsi_hdbscan: + rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv + rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv + rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv + +clean_lsi_authors: + rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv + rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv + rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv + +clean_lsi_authors_tf: + rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv + rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv + rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv + +clean_lsi_terms: + rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv + rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv + rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv + +clean: clean_affinity clean_kmeans clean_hdbscan + +PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k # $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py # $(srun_singularity) python3 selection.py 
$(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS diff --git a/clustering/affinity_clustering.py b/clustering/affinity_clustering.py index b4f8461..d10628a 100644 --- a/clustering/affinity_clustering.py +++ b/clustering/affinity_clustering.py @@ -1,16 +1,12 @@ -from sklearn.metrics import silhouette_score from sklearn.cluster import AffinityPropagation -from functools import partial from dataclasses import dataclass -from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat -from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep -from multiprocessing import Pool, cpu_count, Array, Process +from clustering_base import clustering_result, clustering_job +from grid_sweep import grid_sweep from pathlib import Path from itertools import product, starmap -import numpy as np -import pandas as pd import fire import sys +import numpy as np # silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying. @dataclass @@ -21,10 +17,6 @@ class affinity_clustering_result(clustering_result): preference:float max_iter:int -@dataclass -class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin): - pass - class affinity_job(clustering_job): def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True): super().__init__(infile, @@ -67,21 +59,6 @@ class affinity_job(clustering_job): return self.result -class affinity_lsi_job(affinity_job, lsi_mixin): - def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): - super().__init__(infile, - outpath, - name, - *args, - **kwargs) - super().set_lsi_dims(lsi_dims) - - def get_info(self): - result = super().get_info() - self.result = affinity_clustering_result_lsi(**result.__dict__, - lsi_dimensions=self.lsi_dims) - return self.result - class affinity_grid_sweep(grid_sweep): def __init__(self, inpath, @@ -104,49 +81,29 @@ class affinity_grid_sweep(grid_sweep): return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}" -class _affinity_lsi_grid_sweep(grid_sweep): - def __init__(self, - inpath, - outpath, - lsi_dim, - *args, - **kwargs): - self.lsi_dim = lsi_dim - self.jobtype = affinity_lsi_job - super().__init__(self.jobtype, - inpath, - outpath, - self.namer, - self.lsi_dim, - *args, - **kwargs) - - def namer(self, *args, **kwargs): - s = affinity_grid_sweep.namer(self, *args[1:], **kwargs) - s += f"_lsi-{self.lsi_dim}" - return s - -class affinity_lsi_grid_sweep(lsi_grid_sweep): - def __init__(self, - inpath, - lsi_dims, - outpath, - dampings=[0.9], - max_iters=[10000], - convergence_iters=[30], - preference_quantiles=[0.5]): - - super().__init__(affinity_lsi_job, - _affinity_lsi_grid_sweep, - inpath, - lsi_dims, - outpath, - dampings, - max_iters, - convergence_iters, - preference_quantiles) +def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5]): + """Run affinity clustering once or more with different parameters. 
- + Usage: + affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters= --dampings= --preference_quantiles= + + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to feather data containing a labeled matrix of subreddit similarities. + outpath: path to output fit kmeans clusterings. + dampings:one or more numbers in [0.5, 1). damping parameter in affinity propagatin clustering. + preference_quantiles:one or more numbers in (0,1) for selecting the 'preference' parameter. + convergence_iters:one or more integers of number of iterations without improvement before stopping. + max_iters: one or more numbers of different maximum interations. + """ + obj = affinity_grid_sweep(inpath, + outpath, + map(float,dampings), + map(int,max_iters), + map(int,convergence_iters), + map(float,preference_quantiles)) + obj.run(1) + obj.save(savefile) def test_select_affinity_clustering(): # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", @@ -169,7 +126,4 @@ def test_select_affinity_clustering(): if __name__ == "__main__": - fire.Fire{'grid_sweep':affinity_grid_sweep, - 'grid_sweep_lsi':affinity_lsi_grid_sweep - 'cluster':affinity_job, - 'cluster_lsi':affinity_lsi_job} + fire.Fire(run_affinity_grid_sweep) diff --git a/clustering/affinity_clustering_lsi.py b/clustering/affinity_clustering_lsi.py new file mode 100644 index 0000000..f5c794e --- /dev/null +++ b/clustering/affinity_clustering_lsi.py @@ -0,0 +1,99 @@ +import fire +from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep +from grid_sweep import grid_sweep +from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin +from dataclasses import dataclass + +@dataclass +class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin): + pass + + +class affinity_lsi_job(affinity_job, lsi_mixin): + def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): + super().__init__(infile, + outpath, + name, + *args, + **kwargs) + super().set_lsi_dims(lsi_dims) + + def get_info(self): + result = super().get_info() + self.result = affinity_clustering_result_lsi(**result.__dict__, + lsi_dimensions=self.lsi_dims) + return self.result + +class affinity_lsi_grid_sweep(lsi_grid_sweep): + def __init__(self, + inpath, + lsi_dims, + outpath, + dampings=[0.9], + max_iters=[10000], + convergence_iters=[30], + preference_quantiles=[0.5]): + + super().__init__(affinity_lsi_job, + _affinity_lsi_grid_sweep, + inpath, + lsi_dims, + outpath, + dampings, + max_iters, + convergence_iters, + preference_quantiles) + + +class _affinity_lsi_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + lsi_dim, + *args, + **kwargs): + self.lsi_dim = lsi_dim + self.jobtype = affinity_lsi_job + super().__init__(self.jobtype, + inpath, + outpath, + self.namer, + self.lsi_dim, + *args, + **kwargs) + + def namer(self, *args, **kwargs): + s = affinity_grid_sweep.namer(self, *args[1:], **kwargs) + s += f"_lsi-{self.lsi_dim}" + return s + +def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all'): + """Run affinity clustering once or more with different parameters. 
+ + Usage: + affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters= --dampings= --preference_quantiles= --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. + + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities. + outpath: path to output fit kmeans clusterings. + dampings:one or more numbers in [0.5, 1). damping parameter in affinity propagatin clustering. + preference_quantiles:one or more numbers in (0,1) for selecting the 'preference' parameter. + convergence_iters:one or more integers of number of iterations without improvement before stopping. + max_iters: one or more numbers of different maximum interations. + lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. + """ + + obj = affinity_lsi_grid_sweep(inpath, + lsi_dimensions, + outpath, + map(float,dampings), + map(int,max_iters), + map(int,convergence_iters), + map(float,preference_quantiles)) + + obj.run(1) + obj.save(savefile) + +if __name__ == "__main__": + fire.Fire(run_affinity_lsi_grid_sweep) diff --git a/clustering/clustering_base.py b/clustering/clustering_base.py index 5492415..1d24533 100644 --- a/clustering/clustering_base.py +++ b/clustering/clustering_base.py @@ -3,59 +3,6 @@ import numpy as np import pandas as pd from dataclasses import dataclass from sklearn.metrics import silhouette_score, silhouette_samples -from itertools import product, chain -from multiprocessing import Pool, cpu_count - -def sim_to_dist(mat): - dist = 1-mat - dist[dist < 0] = 0 - np.fill_diagonal(dist,0) - return dist - -class grid_sweep: - def __init__(self, jobtype, inpath, outpath, namer, *args): - self.jobtype = jobtype - self.namer = namer - grid = list(product(*args)) - inpath = Path(inpath) - outpath = Path(outpath) - self.hasrun = False - self.grid = [(inpath,outpath,namer(*g)) + g for g in grid] - self.jobs = [jobtype(*g) for g in self.grid] - - def run(self, cores=20): - if cores is not None and cores > 1: - with Pool(cores) as pool: - infos = pool.map(self.jobtype.get_info, self.jobs) - else: - infos = map(self.jobtype.get_info, self.jobs) - - self.infos = pd.DataFrame(infos) - self.hasrun = True - - def save(self, outcsv): - if not self.hasrun: - self.run() - outcsv = Path(outcsv) - outcsv.parent.mkdir(parents=True, exist_ok=True) - self.infos.to_csv(outcsv) - - -class lsi_grid_sweep(grid_sweep): - def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs): - self.jobtype = jobtype - self.subsweep = subsweep - inpath = Path(inpath) - if lsi_dimensions == 'all': - lsi_paths = list(inpath.glob("*")) - else: - lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] - - lsi_nums = [p.stem for p in lsi_paths] - self.hasrun = False - self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] - self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) - # this is meant to be an interface, not created directly class clustering_job: @@ -86,19 +33,24 @@ class clustering_job: name=self.name, n_clusters=self.n_clusters, n_isolates=self.n_isolates, - silhouette_samples = str(self.silsampout.resolve()) + silhouette_samples = self.silsampout ) return self.result def silhouette(self): isolates = self.clustering.labels_ == -1 scoremat = self.mat[~isolates][:,~isolates] - score = 
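# The guarded silhouette computation below can be read as a standalone helper:
# mask out the isolates (label -1), score only the remaining rows/columns of the
# precomputed distance matrix, and return None when everything was noise. A
# minimal sketch; the function name and the usage line are illustrative only.
import numpy as np
from sklearn.metrics import silhouette_score

def silhouette_excluding_isolates(dist_mat, labels):
    """Silhouette score over a precomputed distance matrix, ignoring points
    labeled -1 (HDBSCAN noise / isolates). Returns None if nothing remains."""
    labels = np.asarray(labels)
    keep = labels != -1
    scoremat = dist_mat[keep][:, keep]
    if scoremat.shape[0] == 0:
        return None
    return silhouette_score(scoremat, labels[keep], metric='precomputed')

# e.g. silhouette_excluding_isolates(1 - sim_mat, clustering.labels_)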
silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed') - silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed') - silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp}) - self.outpath.mkdir(parents=True, exist_ok=True) - self.silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather") - silhouette_samp.to_feather(self.silsampout) + if scoremat.shape[0] > 0: + score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed') + silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed') + silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp}) + self.outpath.mkdir(parents=True, exist_ok=True) + silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather") + self.silsampout = silsampout.resolve() + silhouette_samp.to_feather(self.silsampout) + else: + score = None + self.silsampout = None return score def read_distance_mat(self, similarities, use_threads=True): @@ -139,11 +91,6 @@ class clustering_job: return cluster_data - -class lsi_mixin(): - def set_lsi_dims(self, lsi_dims): - self.lsi_dims = lsi_dims - @dataclass class clustering_result: outpath:Path @@ -152,7 +99,3 @@ class clustering_result: n_clusters:int n_isolates:int silhouette_samples:str - -@dataclass -class lsi_result_mixin: - lsi_dimensions:int diff --git a/clustering/grid_sweep.py b/clustering/grid_sweep.py new file mode 100644 index 0000000..636dcbc --- /dev/null +++ b/clustering/grid_sweep.py @@ -0,0 +1,32 @@ +from pathlib import Path +from multiprocessing import Pool, cpu_count +from itertools import product, chain +import pandas as pd + +class grid_sweep: + def __init__(self, jobtype, inpath, outpath, namer, *args): + self.jobtype = jobtype + self.namer = namer + grid = list(product(*args)) + inpath = Path(inpath) + outpath = Path(outpath) + self.hasrun = False + self.grid = [(inpath,outpath,namer(*g)) + g for g in grid] + self.jobs = [jobtype(*g) for g in self.grid] + + def run(self, cores=20): + if cores is not None and cores > 1: + with Pool(cores) as pool: + infos = pool.map(self.jobtype.get_info, self.jobs) + else: + infos = map(self.jobtype.get_info, self.jobs) + + self.infos = pd.DataFrame(infos) + self.hasrun = True + + def save(self, outcsv): + if not self.hasrun: + self.run() + outcsv = Path(outcsv) + outcsv.parent.mkdir(parents=True, exist_ok=True) + self.infos.to_csv(outcsv) diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py index f0ee703..e533808 100644 --- a/clustering/hdbscan_clustering.py +++ b/clustering/hdbscan_clustering.py @@ -1,5 +1,5 @@ -from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat -from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep +from clustering_base import clustering_result, clustering_job +from grid_sweep import grid_sweep from dataclasses import dataclass import hdbscan from sklearn.neighbors import NearestNeighbors @@ -7,11 +7,8 @@ import plotnine as pn import numpy as np from itertools import product, starmap, chain import pandas as pd -from sklearn.metrics import silhouette_score, silhouette_samples -from pathlib import Path -from multiprocessing import Pool, cpu_count +from multiprocessing import cpu_count import fire -from pyarrow.feather import write_feather def test_select_hdbscan_clustering(): # 
select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", @@ -40,28 +37,6 @@ def test_select_hdbscan_clustering(): # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") # c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) - -class hdbscan_lsi_grid_sweep(lsi_grid_sweep): - def __init__(self, - inpath, - lsi_dims, - outpath, - min_cluster_sizes, - min_samples, - cluster_selection_epsilons, - cluster_selection_methods - ): - - super().__init__(hdbscan_lsi_job, - _hdbscan_lsi_grid_sweep, - inpath, - lsi_dims, - outpath, - min_cluster_sizes, - min_samples, - cluster_selection_epsilons, - cluster_selection_methods) - class hdbscan_grid_sweep(grid_sweep): def __init__(self, inpath, @@ -78,25 +53,6 @@ class hdbscan_grid_sweep(grid_sweep): cluster_selection_method): return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}" - -class _hdbscan_lsi_grid_sweep(grid_sweep): - def __init__(self, - inpath, - outpath, - lsi_dim, - *args, - **kwargs): - - self.lsi_dim = lsi_dim - self.jobtype = hdbscan_lsi_job - super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) - - - def namer(self, *args, **kwargs): - s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs) - s += f"_lsi-{self.lsi_dim}" - return s - @dataclass class hdbscan_clustering_result(clustering_result): min_cluster_size:int @@ -104,10 +60,6 @@ class hdbscan_clustering_result(clustering_result): cluster_selection_epsilon:float cluster_selection_method:str -@dataclass -class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin): - pass - class hdbscan_job(clustering_job): def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): super().__init__(infile, @@ -148,121 +100,29 @@ class hdbscan_job(clustering_job): cluster_selection_method=self.cluster_selection_method) return self.result -class hdbscan_lsi_job(hdbscan_job, lsi_mixin): - def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): - super().__init__( - infile, - outpath, - name, - *args, - **kwargs) - super().set_lsi_dims(lsi_dims) - - def get_info(self): - partial_result = super().get_info() - self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__, - lsi_dimensions=self.lsi_dims) - return self.result - -# def select_hdbscan_clustering(inpath, -# outpath, -# outfile=None, -# min_cluster_sizes=[2], -# min_samples=[1], -# cluster_selection_epsilons=[0], -# cluster_selection_methods=['eom'], -# lsi_dimensions='all' -# ): - -# inpath = Path(inpath) -# outpath = Path(outpath) -# outpath.mkdir(exist_ok=True, parents=True) +def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']): + """Run hdbscan clustering once or more with different parameters. 
-# if lsi_dimensions is None: -# lsi_paths = [inpath] -# elif lsi_dimensions == 'all': -# lsi_paths = list(inpath.glob("*")) + Usage: + hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes= --min_samples= --cluster_selection_epsilons= --cluster_selection_methods= -# else: -# lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] - -# if lsi_dimensions is not None: -# lsi_nums = [p.stem for p in lsi_paths] -# else: -# lsi_nums = [None] -# grid = list(product(lsi_nums, -# min_cluster_sizes, -# min_samples, -# cluster_selection_epsilons, -# cluster_selection_methods)) - -# # fix the output file names -# names = list(map(lambda t:'_'.join(map(str,t)),grid)) - -# grid = [(inpath/(str(t[0])+'.feather'),outpath/(name + '.feather'), t[0], name) + t[1:] for t, name in zip(grid, names)] - -# with Pool(int(cpu_count()/4)) as pool: -# mods = starmap(hdbscan_clustering, grid) - -# res = pd.DataFrame(mods) -# if outfile is None: -# outfile = outpath / "selection_data.csv" - -# res.to_csv(outfile) - -# def hdbscan_clustering(similarities, output, lsi_dim, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'): -# subreddits, mat = read_similarity_mat(similarities) -# mat = sim_to_dist(mat) -# clustering = _hdbscan_clustering(mat, -# min_cluster_size=min_cluster_size, -# min_samples=min_samples, -# cluster_selection_epsilon=cluster_selection_epsilon, -# cluster_selection_method=cluster_selection_method, -# metric='precomputed', -# core_dist_n_jobs=cpu_count() -# ) - -# cluster_data = process_clustering_result(clustering, subreddits) -# isolates = clustering.labels_ == -1 -# scoremat = mat[~isolates][:,~isolates] -# score = silhouette_score(scoremat, clustering.labels_[~isolates], metric='precomputed') -# cluster_data.to_feather(output) -# silhouette_samp = silhouette_samples(mat, clustering.labels_, metric='precomputed') -# silhouette_samp = pd.DataFrame({'subreddit':subreddits,'score':silhouette_samp}) -# silsampout = output.parent / ("silhouette_samples" + output.name) -# silhouette_samp.to_feather(silsampout) - -# result = hdbscan_clustering_result(outpath=output, -# silhouette_samples=silsampout, -# silhouette_score=score, -# name=name, -# min_cluster_size=min_cluster_size, -# min_samples=min_samples, -# cluster_selection_epsilon=cluster_selection_epsilon, -# cluster_selection_method=cluster_selection_method, -# lsi_dimensions=lsi_dim, -# n_isolates=isolates.sum(), -# n_clusters=len(set(clustering.labels_)) -# ) - - - -# return(result) - -# # for all runs we should try cluster_selection_epsilon = None -# # for terms we should try cluster_selection_epsilon around 0.56-0.66 -# # for authors we should try cluster_selection_epsilon around 0.98-0.99 -# def _hdbscan_clustering(mat, *args, **kwargs): -# print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}") - -# print(mat) -# clusterer = hdbscan.HDBSCAN(*args, -# **kwargs, -# ) - -# clustering = clusterer.fit(mat.astype('double')) - -# return(clustering) + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to feather data containing a labeled matrix of subreddit similarities. + outpath: path to output fit kmeans clusterings. 
+ min_cluster_sizes: one or more integers indicating the minumum cluster size + min_samples: one ore more integers indicating the minimum number of samples used in the algorithm + cluster_selection_epsilon: one or more similarity thresholds for transition from dbscan to hdbscan + cluster_selection_method: "eom" or "leaf" eom gives larger clusters. + """ + obj = hdbscan_grid_sweep(inpath, + outpath, + map(int,min_cluster_sizes), + map(int,min_samples), + map(float,cluster_selection_epsilons), + map(float,cluster_selection_methods)) + obj.run() + obj.save(savefile) def KNN_distances_plot(mat,outname,k=2): nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) @@ -293,10 +153,7 @@ def make_KNN_plots(): KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') if __name__ == "__main__": - fire.Fire{'grid_sweep':hdbscan_grid_sweep, - 'grid_sweep_lsi':hdbscan_lsi_grid_sweep - 'cluster':hdbscan_job, - 'cluster_lsi':hdbscan_lsi_job} + fire.Fire(run_hdbscan_grid_sweep) # test_select_hdbscan_clustering() #fire.Fire(select_hdbscan_clustering) diff --git a/clustering/hdbscan_clustering_lsi.py b/clustering/hdbscan_clustering_lsi.py new file mode 100644 index 0000000..73b5276 --- /dev/null +++ b/clustering/hdbscan_clustering_lsi.py @@ -0,0 +1,101 @@ +from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result +from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin +from grid_sweep import grid_sweep +import fire +from dataclasses import dataclass + +@dataclass +class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin): + pass + +class hdbscan_lsi_job(hdbscan_job, lsi_mixin): + def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): + super().__init__( + infile, + outpath, + name, + *args, + **kwargs) + super().set_lsi_dims(lsi_dims) + + def get_info(self): + partial_result = super().get_info() + self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__, + lsi_dimensions=self.lsi_dims) + return self.result + +class hdbscan_lsi_grid_sweep(lsi_grid_sweep): + def __init__(self, + inpath, + lsi_dims, + outpath, + min_cluster_sizes, + min_samples, + cluster_selection_epsilons, + cluster_selection_methods + ): + + super().__init__(hdbscan_lsi_job, + _hdbscan_lsi_grid_sweep, + inpath, + lsi_dims, + outpath, + min_cluster_sizes, + min_samples, + cluster_selection_epsilons, + cluster_selection_methods) + + + +class _hdbscan_lsi_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + lsi_dim, + *args, + **kwargs): + print(args) + print(kwargs) + + self.lsi_dim = lsi_dim + self.jobtype = hdbscan_lsi_job + super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) + + + def namer(self, *args, **kwargs): + s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs) + s += f"_lsi-{self.lsi_dim}" + return s + +def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'],lsi_dimensions='all'): + """Run hdbscan clustering once or more with different parameters. + + Usage: + hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes= --min_samples= --cluster_selection_epsilons= --cluster_selection_methods=[eom]> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. 
+ + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities. + outpath: path to output fit clusterings. + min_cluster_sizes: one or more integers indicating the minumum cluster size + min_samples: one ore more integers indicating the minimum number of samples used in the algorithm + cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan + cluster_selection_methods: one or more of "eom" or "leaf" eom gives larger clusters. + lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. + """ + + obj = hdbscan_lsi_grid_sweep(inpath, + lsi_dimensions, + outpath, + map(int,min_cluster_sizes), + map(int,min_samples), + map(float,cluster_selection_epsilons), + cluster_selection_methods + ) + + obj.run(10) + obj.save(savefile) + + +if __name__ == "__main__": + fire.Fire(run_hdbscan_lsi_grid_sweep) diff --git a/clustering/kmeans_clustering.py b/clustering/kmeans_clustering.py index e41b88b..211b666 100644 --- a/clustering/kmeans_clustering.py +++ b/clustering/kmeans_clustering.py @@ -1,11 +1,9 @@ from sklearn.cluster import KMeans import fire from pathlib import Path -from multiprocessing import cpu_count from dataclasses import dataclass -from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat -from clustering_base import lsi_result_mixin, lsi_mixin, clustering_job, grid_sweep, lsi_grid_sweep - +from clustering_base import clustering_result, clustering_job +from grid_sweep import grid_sweep @dataclass class kmeans_clustering_result(clustering_result): @@ -13,10 +11,6 @@ class kmeans_clustering_result(clustering_result): n_init:int max_iter:int -@dataclass -class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin): - pass - class kmeans_job(clustering_job): def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True): super().__init__(infile, @@ -45,28 +39,13 @@ class kmeans_job(clustering_job): def get_info(self): result = super().get_info() self.result = kmeans_clustering_result(**result.__dict__, - n_init=n_init, - max_iter=max_iter) + n_init=self.n_init, + max_iter=self.max_iter) return self.result -class kmeans_lsi_job(kmeans_job, lsi_mixin): - def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): - super().__init__(infile, - outpath, - name, - *args, - **kwargs) - super().set_lsi_dims(lsi_dims) - - def get_info(self): - result = super().get_info() - self.result = kmeans_clustering_result_lsi(**result.__dict__, - lsi_dimensions=self.lsi_dims) - return self.result - - class kmeans_grid_sweep(grid_sweep): + def __init__(self, inpath, outpath, @@ -80,49 +59,7 @@ class kmeans_grid_sweep(grid_sweep): max_iter): return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}" -class _kmeans_lsi_grid_sweep(grid_sweep): - def __init__(self, - inpath, - outpath, - lsi_dim, - *args, - **kwargs): - self.lsi_dim = lsi_dim - self.jobtype = kmeans_lsi_job - super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) - - def namer(self, *args, **kwargs): - s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs) - s += f"_lsi-{self.lsi_dim}" - return s - -class kmeans_lsi_grid_sweep(lsi_grid_sweep): - def __init__(self, - inpath, - lsi_dims, - outpath, - n_clusters, - n_inits, - max_iters - ): - - super().__init__(kmeans_lsi_job, - 
_kmeans_lsi_grid_sweep, - inpath, - lsi_dims, - outpath, - n_clusters, - n_inits, - max_iters) - def test_select_kmeans_clustering(): - # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", - # "test_hdbscan_author30k", - # min_cluster_sizes=[2], - # min_samples=[1,2], - # cluster_selection_epsilons=[0,0.05,0.1,0.15], - # cluster_selection_methods=['eom','leaf'], - # lsi_dimensions='all') inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/" outpath = "test_kmeans"; n_clusters=[200,300,400]; @@ -139,10 +76,30 @@ def test_select_kmeans_clustering(): gs.run(20) gs.save("test_hdbscan/lsi_sweep.csv") +def run_kmeans_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000]): + """Run kmeans clustering once or more with different parameters. + + Usage: + kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters= --n_inits= --max_iters= + + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to feather data containing a labeled matrix of subreddit similarities. + outpath: path to output fit kmeans clusterings. + n_clusters: one or more numbers of kmeans clusters to select. + n_inits: one or more numbers of different initializations to use for each clustering. + max_iters: one or more numbers of different maximum interations. + """ + + obj = kmeans_grid_sweep(inpath, + outpath, + map(int,n_clusters), + map(int,n_inits), + map(int,max_iters)) + + + obj.run(1) + obj.save(savefile) if __name__ == "__main__": - - fire.Fire{'grid_sweep':kmeans_grid_sweep, - 'grid_sweep_lsi':kmeans_lsi_grid_sweep - 'cluster':kmeans_job, - 'cluster_lsi':kmeans_lsi_job} + fire.Fire(run_kmeans_grid_sweep) diff --git a/clustering/kmeans_clustering_lsi.py b/clustering/kmeans_clustering_lsi.py new file mode 100644 index 0000000..20d582b --- /dev/null +++ b/clustering/kmeans_clustering_lsi.py @@ -0,0 +1,93 @@ +import fire +from dataclasses import dataclass +from kmeans_clustering import kmeans_job, kmeans_clustering_result, kmeans_grid_sweep +from lsi_base import lsi_mixin, lsi_result_mixin, lsi_grid_sweep +from grid_sweep import grid_sweep + +@dataclass +class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin): + pass + +class kmeans_lsi_job(kmeans_job, lsi_mixin): + def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs): + super().__init__(infile, + outpath, + name, + *args, + **kwargs) + super().set_lsi_dims(lsi_dims) + + def get_info(self): + result = super().get_info() + self.result = kmeans_clustering_result_lsi(**result.__dict__, + lsi_dimensions=self.lsi_dims) + return self.result + +class _kmeans_lsi_grid_sweep(grid_sweep): + def __init__(self, + inpath, + outpath, + lsi_dim, + *args, + **kwargs): + print(args) + print(kwargs) + self.lsi_dim = lsi_dim + self.jobtype = kmeans_lsi_job + super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) + + def namer(self, *args, **kwargs): + s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs) + s += f"_lsi-{self.lsi_dim}" + return s + +class kmeans_lsi_grid_sweep(lsi_grid_sweep): + + def __init__(self, + inpath, + lsi_dims, + outpath, + n_clusters, + n_inits, + max_iters + ): + + super().__init__(kmeans_lsi_job, + _kmeans_lsi_grid_sweep, + inpath, + lsi_dims, + outpath, + n_clusters, + n_inits, + max_iters) + +def run_kmeans_lsi_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000], 
lsi_dimensions="all"): + """Run kmeans clustering once or more with different parameters. + + Usage: + kmeans_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH d--lsi_dimensions=<"all"|csv number of LSI dimensions to use> --n_clusters= --n_inits= --max_iters= + + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities. + outpath: path to output fit kmeans clusterings. + lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. + n_clusters: one or more numbers of kmeans clusters to select. + n_inits: one or more numbers of different initializations to use for each clustering. + max_iters: one or more numbers of different maximum interations. + """ + + obj = kmeans_lsi_grid_sweep(inpath, + lsi_dimensions, + outpath, + list(map(int,n_clusters)), + list(map(int,n_inits)), + list(map(int,max_iters)) + ) + + obj.run(1) + obj.save(savefile) + + +if __name__ == "__main__": + fire.Fire(run_kmeans_lsi_grid_sweep) diff --git a/clustering/lsi_base.py b/clustering/lsi_base.py new file mode 100644 index 0000000..45cc49b --- /dev/null +++ b/clustering/lsi_base.py @@ -0,0 +1,28 @@ +from clustering_base import clustering_job, clustering_result +from grid_sweep import grid_sweep +from dataclasses import dataclass +from itertools import chain +from pathlib import Path + +class lsi_mixin(): + def set_lsi_dims(self, lsi_dims): + self.lsi_dims = lsi_dims + +@dataclass +class lsi_result_mixin: + lsi_dimensions:int + +class lsi_grid_sweep(grid_sweep): + def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs): + self.jobtype = jobtype + self.subsweep = subsweep + inpath = Path(inpath) + if lsi_dimensions == 'all': + lsi_paths = list(inpath.glob("*")) + else: + lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] + + lsi_nums = [p.stem for p in lsi_paths] + self.hasrun = False + self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] + self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) From 47ba04aa9715325a67fe17cee205230b042022fe Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Mon, 10 May 2021 18:24:22 -0700 Subject: [PATCH 08/22] add script for pulling cluster timeseries --- timeseries/cluster_timeseries.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/timeseries/cluster_timeseries.py b/timeseries/cluster_timeseries.py index 07507d7..91fa705 100644 --- a/timeseries/cluster_timeseries.py +++ b/timeseries/cluster_timeseries.py @@ -2,11 +2,11 @@ import pandas as pd import numpy as np from pyspark.sql import functions as f from pyspark.sql import SparkSession -from choose_clusters import load_clusters, load_densities +from .choose_clusters import load_clusters, load_densities import fire from pathlib import Path -def main(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather", +def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather", author_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather", term_densities_path="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather", author_densities_path="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", @@ -34,4 +34,4 @@ def 
main(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_ ts.write.parquet(output, mode='overwrite') if __name__ == "__main__": - fire.Fire(main) + fire.Fire(build_cluster_timeseries) From 582cf263eaec21a7c337400c5f601107318ab0f2 Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Thu, 13 May 2021 22:26:03 -0700 Subject: [PATCH 09/22] bug fix in affinity clustering --- clustering/Makefile | 2 +- clustering/affinity_clustering.py | 4 ++-- clustering/affinity_clustering_lsi.py | 6 +++--- clustering/clustering_base.py | 12 ++++++++---- clustering/fit_tsne.py | 4 ++-- clustering/lsi_base.py | 4 ++-- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/clustering/Makefile b/clustering/Makefile index 7e8cf39..69c6c15 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -4,7 +4,7 @@ similarity_data=/gscratch/comdata/output/reddit_similarity clustering_data=/gscratch/comdata/output/reddit_clustering kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]" hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf" -affinity_selection_grid="--dampings=[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[30]" +affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]" authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI diff --git a/clustering/affinity_clustering.py b/clustering/affinity_clustering.py index d10628a..737967e 100644 --- a/clustering/affinity_clustering.py +++ b/clustering/affinity_clustering.py @@ -81,7 +81,7 @@ class affinity_grid_sweep(grid_sweep): return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}" -def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5]): +def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5],n_cores=10): """Run affinity clustering once or more with different parameters. Usage: @@ -102,7 +102,7 @@ def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters map(int,max_iters), map(int,convergence_iters), map(float,preference_quantiles)) - obj.run(1) + obj.run(n_cores) obj.save(savefile) def test_select_affinity_clustering(): diff --git a/clustering/affinity_clustering_lsi.py b/clustering/affinity_clustering_lsi.py index f5c794e..983e861 100644 --- a/clustering/affinity_clustering_lsi.py +++ b/clustering/affinity_clustering_lsi.py @@ -58,7 +58,7 @@ class _affinity_lsi_grid_sweep(grid_sweep): inpath, outpath, self.namer, - self.lsi_dim, + [self.lsi_dim], *args, **kwargs) @@ -67,7 +67,7 @@ class _affinity_lsi_grid_sweep(grid_sweep): s += f"_lsi-{self.lsi_dim}" return s -def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all'): +def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all',n_cores=30): """Run affinity clustering once or more with different parameters. 
Usage: @@ -92,7 +92,7 @@ def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_i map(int,convergence_iters), map(float,preference_quantiles)) - obj.run(1) + obj.run(n_cores) obj.save(savefile) if __name__ == "__main__": diff --git a/clustering/clustering_base.py b/clustering/clustering_base.py index 1d24533..3778fc3 100644 --- a/clustering/clustering_base.py +++ b/clustering/clustering_base.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd from dataclasses import dataclass from sklearn.metrics import silhouette_score, silhouette_samples +from collections import Counter # this is meant to be an interface, not created directly class clustering_job: @@ -38,9 +39,11 @@ class clustering_job: return self.result def silhouette(self): - isolates = self.clustering.labels_ == -1 + counts = Counter(self.clustering.labels_) + singletons = [key for key, value in counts.items() if value == 1] + isolates = (self.clustering.labels_ == -1) | (np.isin(self.clustering.labels_,np.array(singletons))) scoremat = self.mat[~isolates][:,~isolates] - if scoremat.shape[0] > 0: + if self.n_clusters > 1: score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed') silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed') silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp}) @@ -80,8 +83,9 @@ class clustering_job: print(f"{n_isolates1} clusters have 1 member") - n_isolates2 = (cluster_sizes.loc[cluster_sizes.cluster==-1,['subreddit']]) - + n_isolates2 = cluster_sizes.loc[cluster_sizes.cluster==-1,:]['subreddit'].to_list() + if len(n_isolates2) > 0: + n_isloates2 = n_isolates2[0] print(f"{n_isolates2} subreddits are in cluster -1",flush=True) if n_isolates1 == 0: diff --git a/clustering/fit_tsne.py b/clustering/fit_tsne.py index c9f45f6..55d7239 100644 --- a/clustering/fit_tsne.py +++ b/clustering/fit_tsne.py @@ -17,7 +17,7 @@ def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=1000 df = pd.read_feather(similarities) n = df.shape[0] - mat = np.array(df.drop('subreddit',1),dtype=np.float64) + mat = np.array(df.drop('_subreddit',1),dtype=np.float64) mat[range(n),range(n)] = 1 mat[mat > 1] = 1 dist = 2*np.arccos(mat)/np.pi @@ -26,7 +26,7 @@ def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=1000 tsne_fit_whole = tsne_fit_model.fit_transform(dist) - plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':df.subreddit}) + plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], '_subreddit':df['_subreddit']}) plot_data.to_feather(output) diff --git a/clustering/lsi_base.py b/clustering/lsi_base.py index 45cc49b..f07bca6 100644 --- a/clustering/lsi_base.py +++ b/clustering/lsi_base.py @@ -20,9 +20,9 @@ class lsi_grid_sweep(grid_sweep): if lsi_dimensions == 'all': lsi_paths = list(inpath.glob("*")) else: - lsi_paths = [inpath / (dim + '.feather') for dim in lsi_dimensions] + lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions] - lsi_nums = [p.stem for p in lsi_paths] + lsi_nums = [int(p.stem) for p in lsi_paths] self.hasrun = False self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) From 0b95bea30eebe7660013a799bd09f4564d025ddc Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Thu, 13 May 2021 22:26:58 -0700 Subject: [PATCH 10/22] 
support isolates in visualization --- ngrams/tf_comments.py | 16 ++++++--------- similarities/Makefile | 4 ++-- similarities/similarities_helper.py | 7 ++----- visualization/tsne_vis.py | 32 ++++++++++++++++++++--------- 4 files changed, 32 insertions(+), 27 deletions(-) diff --git a/ngrams/tf_comments.py b/ngrams/tf_comments.py index f86548a..a40e5d9 100755 --- a/ngrams/tf_comments.py +++ b/ngrams/tf_comments.py @@ -13,10 +13,7 @@ from nltk.corpus import stopwords from nltk.util import ngrams import string from random import random - -# remove urls -# taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url -urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)") +from redditcleaner import clean # compute term frequencies for comments in each subreddit by week def weekly_tf(partition, mwe_pass = 'first'): @@ -95,8 +92,8 @@ def weekly_tf(partition, mwe_pass = 'first'): # lowercase text = text.lower() - # remove urls - text = urlregex.sub("", text) + # redditcleaner removes reddit markdown(newlines, quotes, bullet points, links, strikethrough, spoiler, code, superscript, table, headings) + text = clean(text) # sentence tokenize sentences = sent_tokenize(text) @@ -107,14 +104,13 @@ def weekly_tf(partition, mwe_pass = 'first'): # remove punctuation sentences = map(remove_punct, sentences) - - # remove sentences with less than 2 words - sentences = filter(lambda sentence: len(sentence) > 2, sentences) - # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase. # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms # here we take a 10 percent sample of sentences if mwe_pass == 'first': + + # remove sentences with less than 2 words + sentences = filter(lambda sentence: len(sentence) > 2, sentences) sentences = list(sentences) for sentence in sentences: if random() <= 0.1: diff --git a/similarities/Makefile b/similarities/Makefile index cfe8a49..f578fd5 100644 --- a/similarities/Makefile +++ b/similarities/Makefile @@ -1,7 +1,7 @@ #all: /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_130k.parquet srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh srun_singularity_huge=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity_huge.sh -base_data=/gscratch/comdata/output/ +base_data=/gscratch/comdata/output similarity_data=${base_data}/reddit_similarity tfidf_data=${similarity_data}/tfidf tfidf_weekly_data=${similarity_data}/tfidf_weekly @@ -97,7 +97,7 @@ ${tfidf_data}/tfidf_weekly/comment_authors_100k.parquet: /gscratch/comdata/outpu start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=100000 --outpath=${tfidf_weekly_data}/comment_authors_100k.parquet ${tfidf_weekly_data}/comment_terms_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv - start_spark_and_run.sh 4 tfidf.py terms_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet + start_spark_and_run.sh 2 tfidf.py terms_weekly --topN=30000 
--outpath=${tfidf_weekly_data}/comment_authors_30k.parquet ${tfidf_weekly_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index 7f8a639..e59563e 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -23,9 +23,6 @@ class tf_weight(Enum): infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet" cache_file = "/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet" -def termauthor_tfidf(term_tfidf_callable, author_tfidf_callable): - - # subreddits missing after this step don't have any terms that have a high enough idf # try rewriting without merges def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF): @@ -283,7 +280,7 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig df = df.withColumn("tf_idf", (0.5 + 0.5 * df.relative_tf) * df.idf) df = df.repartition(400,'subreddit','week') - dfwriter = df.write.partitionBy("week").sortBy("subreddit") + dfwriter = df.write.partitionBy("week") return dfwriter def _calc_tfidf(df, term_colname, tf_family): @@ -339,7 +336,7 @@ def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm df = _calc_tfidf(df, term_colname, tf_family) df = df.repartition('subreddit') - dfwriter = df.write.sortBy("subreddit","tf") + dfwriter = df.write return dfwriter def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"): diff --git a/visualization/tsne_vis.py b/visualization/tsne_vis.py index c39a740..eb6a6be 100644 --- a/visualization/tsne_vis.py +++ b/visualization/tsne_vis.py @@ -22,8 +22,12 @@ def base_plot(plot_data): # # subreddit_select = alt.selection_single(on='click',fields=['subreddit'],bind=subreddit_dropdown,name='subreddit_click') + base_scale = alt.Scale(scheme={"name":'category10', + "extent":[0,100], + "count":10}) + color = alt.condition(cluster_click_select , - alt.Color(field='color',type='nominal',scale=alt.Scale(scheme='category10')), + alt.Color(field='color',type='nominal',scale=base_scale), alt.value("lightgray")) @@ -84,6 +88,11 @@ def viewport_plot(plot_data): return chart def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4): + isolate_color = 101 + + cluster_sizes = clusters.groupby('cluster').count() + singletons = set(cluster_sizes.loc[cluster_sizes.subreddit == 1].reset_index().cluster) + tsne_data = tsne_data.merge(clusters,on='subreddit') centroids = tsne_data.groupby('cluster').agg({'x':np.mean,'y':np.mean}) @@ -120,15 +129,17 @@ def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4): color_assignments = np.repeat(-1,len(centroids)) for i in range(len(centroids)): - knn = indices[i] - knn_colors = color_assignments[knn] - available_colors = color_ids[list(set(color_ids) - set(knn_colors))] - - if(len(available_colors) > 0): - color_assignments[i] = available_colors[0] + if (centroids.iloc[i].name == -1) or (i in singletons): + color_assignments[i] = isolate_color else: - raise Exception("Can't color this many neighbors with this many colors") + knn = 
indices[i] + knn_colors = color_assignments[knn] + available_colors = color_ids[list(set(color_ids) - set(knn_colors))] + if(len(available_colors) > 0): + color_assignments[i] = available_colors[0] + else: + raise Exception("Can't color this many neighbors with this many colors") centroids = centroids.reset_index() colors = centroids.loc[:,['cluster']] @@ -143,12 +154,13 @@ def build_visualization(tsne_data, clusters, output): # clusters = "/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather" tsne_data = pd.read_feather(tsne_data) + tsne_data = tsne_data.rename(columns={'_subreddit':'subreddit'}) clusters = pd.read_feather(clusters) tsne_data = assign_cluster_colors(tsne_data,clusters,10,8) - # sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index() - # sr_per_cluster = sr_per_cluster.rename(columns={'subreddit':'cluster_size'}) + sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index() + sr_per_cluster = sr_per_cluster.rename(columns={'subreddit':'cluster_size'}) tsne_data = tsne_data.merge(sr_per_cluster,on='cluster') From 87ffaa6858919bd830694d60dd4fc7b1857b462a Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Fri, 14 May 2021 19:10:36 -0700 Subject: [PATCH 11/22] script for picking the best clustering given constraints --- clustering/pick_best_clustering.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 clustering/pick_best_clustering.py diff --git a/clustering/pick_best_clustering.py b/clustering/pick_best_clustering.py new file mode 100644 index 0000000..91c443e --- /dev/null +++ b/clustering/pick_best_clustering.py @@ -0,0 +1,29 @@ +import fire +import pandas as pd +from pathlib import Path +import shutil + +selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/affinity/selection_data.csv" + +outpath = 'test_best.feather' + +# pick the best clustering according to silhouette score subject to contraints +def pick_best_clustering(selection_data, output, min_clusters, max_isolates): + df = pd.read_csv(selection_data,index_col=0) + df = df.sort_values("silhouette_score") + + # not sure I fixed the bug underlying this fully or not. 
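+    # n_isolates can arrive from the selection csv as a stringified list
+    # (e.g. "[]" or "[1234]") rather than an int, so strip the brackets and
+    # coerce it back to an integer count, treating the empty case as zero isolates.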
+ df['n_isolates_str'] = df.n_isolates.str.strip("[]") + df['n_isolates_0'] = df['n_isolates_str'].apply(lambda l: len(l) == 0) + df.loc[df.n_isolates_0,'n_isolates'] = 0 + df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l)) + + best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)].iloc[df.shape[1]] + + print(best_cluster.to_dict()) + best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather") + + shutil.copy(best_path,output) + +if __name__ == "__main__": + fire.Fire(pick_best_clustering) From cf86c7492c27801638fcccabd2a39e3213e47cc9 Mon Sep 17 00:00:00 2001 From: Nate E TeBlunthuis Date: Tue, 3 Aug 2021 14:55:02 -0700 Subject: [PATCH 12/22] update clustering scripts --- clustering/Makefile | 10 +++-- clustering/grid_sweep.py | 1 + clustering/hdbscan_clustering_lsi.py | 8 ++-- clustering/kmeans_clustering_lsi.py | 2 +- clustering/pick_best_clustering.py | 11 +++--- clustering/selection.py | 43 +++++++++++++++++++--- density/Makefile | 6 +++ density/job_script.sh | 4 +- density/overlap_density.py | 9 ++++- similarities/job_script.sh | 2 +- similarities/top_subreddits_by_comments.py | 4 +- 11 files changed, 73 insertions(+), 27 deletions(-) diff --git a/clustering/Makefile b/clustering/Makefile index 69c6c15..9643f52 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -2,9 +2,9 @@ srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh similarity_data=/gscratch/comdata/output/reddit_similarity clustering_data=/gscratch/comdata/output/reddit_clustering -kmeans_selection_grid="--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]" -hdbscan_selection_grid="--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=eom,leaf" -affinity_selection_grid="--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]" +kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000] +hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] +affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15] authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI @@ -91,7 +91,11 @@ ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_inpu ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) +${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py + $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 +${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py + $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 
--max_isolates=5000 --min_cluster_size=2 clean_affinity: rm -f ${authors_10k_output}/affinity/selection_data.csv diff --git a/clustering/grid_sweep.py b/clustering/grid_sweep.py index 636dcbc..c0365d0 100644 --- a/clustering/grid_sweep.py +++ b/clustering/grid_sweep.py @@ -7,6 +7,7 @@ class grid_sweep: def __init__(self, jobtype, inpath, outpath, namer, *args): self.jobtype = jobtype self.namer = namer + print(*args) grid = list(product(*args)) inpath = Path(inpath) outpath = Path(outpath) diff --git a/clustering/hdbscan_clustering_lsi.py b/clustering/hdbscan_clustering_lsi.py index 73b5276..cbd44bd 100644 --- a/clustering/hdbscan_clustering_lsi.py +++ b/clustering/hdbscan_clustering_lsi.py @@ -59,7 +59,7 @@ class _hdbscan_lsi_grid_sweep(grid_sweep): self.lsi_dim = lsi_dim self.jobtype = hdbscan_lsi_job - super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) + super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs) def namer(self, *args, **kwargs): @@ -87,9 +87,9 @@ def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2] obj = hdbscan_lsi_grid_sweep(inpath, lsi_dimensions, outpath, - map(int,min_cluster_sizes), - map(int,min_samples), - map(float,cluster_selection_epsilons), + list(map(int,min_cluster_sizes)), + list(map(int,min_samples)), + list(map(float,cluster_selection_epsilons)), cluster_selection_methods ) diff --git a/clustering/kmeans_clustering_lsi.py b/clustering/kmeans_clustering_lsi.py index 20d582b..bb006f3 100644 --- a/clustering/kmeans_clustering_lsi.py +++ b/clustering/kmeans_clustering_lsi.py @@ -34,7 +34,7 @@ class _kmeans_lsi_grid_sweep(grid_sweep): print(kwargs) self.lsi_dim = lsi_dim self.jobtype = kmeans_lsi_job - super().__init__(self.jobtype, inpath, outpath, self.namer, self.lsi_dim, *args, **kwargs) + super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs) def namer(self, *args, **kwargs): s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs) diff --git a/clustering/pick_best_clustering.py b/clustering/pick_best_clustering.py index 91c443e..c541d23 100644 --- a/clustering/pick_best_clustering.py +++ b/clustering/pick_best_clustering.py @@ -2,15 +2,15 @@ import fire import pandas as pd from pathlib import Path import shutil - -selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/affinity/selection_data.csv" +selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv" outpath = 'test_best.feather' +min_clusters=50; max_isolates=5000; min_cluster_size=2 # pick the best clustering according to silhouette score subject to contraints -def pick_best_clustering(selection_data, output, min_clusters, max_isolates): +def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size): df = pd.read_csv(selection_data,index_col=0) - df = df.sort_values("silhouette_score") + df = df.sort_values("silhouette_score",ascending=False) # not sure I fixed the bug underlying this fully or not. 
df['n_isolates_str'] = df.n_isolates.str.strip("[]") @@ -18,11 +18,10 @@ def pick_best_clustering(selection_data, output, min_clusters, max_isolates): df.loc[df.n_isolates_0,'n_isolates'] = 0 df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l)) - best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)].iloc[df.shape[1]] + best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)].iloc[df.shape[1]] print(best_cluster.to_dict()) best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather") - shutil.copy(best_path,output) if __name__ == "__main__": diff --git a/clustering/selection.py b/clustering/selection.py index d2fa6de..81641db 100644 --- a/clustering/selection.py +++ b/clustering/selection.py @@ -1,7 +1,38 @@ -import fire -from select_affinity import select_affinity_clustering -from select_kmeans import select_kmeans_clustering +import pandas as pd +import plotnine as pn +from pathlib import Path +from clustering.fit_tsne import fit_tsne +from visualization.tsne_vis import build_visualization + +df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0) + +# plot silhouette_score as a function of isolates +df = df.sort_values("silhouette_score") + +df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1])) +p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point() +p.save("isolates_x_score.png") + +p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point() +p.save("clusters_x_isolates.png") + +# the best result for hdbscan seems like this one: it has a decent number of +# i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from ommitted variables +best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]] + +best_lsi = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]] + +tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather") + +if not tnse_data.exists(): + fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather", + tnse_data) + +build_visualization("./clustering/authors-tf_lsi850_tsne.feather", + Path(best_eom.outpath)/(best_eom['name']+'.feather'), + "./authors-tf_lsi850_best_eom.html") + +build_visualization("./clustering/authors-tf_lsi850_tsne.feather", + Path(best_leaf.outpath)/(best_leaf['name']+'.feather'), + "./authors-tf_lsi850_best_leaf.html") -if __name__ == "__main__": - fire.Fire({"kmeans":select_kmeans_clustering, - "affinity":select_affinity_clustering}) diff --git a/density/Makefile b/density/Makefile index d223399..90eba82 100644 --- a/density/Makefile +++ b/density/Makefile @@ -8,3 +8,9 @@ all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscrat /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" 
--outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum + +/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather + start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum + +/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather + start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum diff --git a/density/job_script.sh b/density/job_script.sh index 7dfac14..e411ba7 100755 --- a/density/job_script.sh +++ b/density/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum -stop-all.sh +singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum +singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh diff --git a/density/overlap_density.py b/density/overlap_density.py index 5a8e91a..2036824 100644 --- a/density/overlap_density.py +++ b/density/overlap_density.py @@ -1,11 +1,12 @@ import pandas as pd from pandas.core.groupby import DataFrameGroupBy as GroupBy +from pathlib import Path import fire import numpy as np import sys sys.path.append("..") sys.path.append("../similarities") -from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval +from similarities.similarities_helper import reindex_tfidf # this is the mean of the ratio of the overlap to the focal size. 
# mean shared membership per focal community member @@ -13,10 +14,12 @@ from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_i def overlap_density(inpath, outpath, agg = pd.DataFrame.sum): df = pd.read_feather(inpath) - df = df.drop('subreddit',1) + df = df.drop('_subreddit',1) np.fill_diagonal(df.values,0) df = agg(df, 0).reset_index() df = df.rename({0:'overlap_density'},axis='columns') + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok = True) df.to_feather(outpath) return df @@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum): # exclude the diagonal df = df.loc[df.subreddit != df.variable] res = agg(df.groupby(['subreddit','week'])).reset_index() + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok = True) res.to_feather(outpath) return res diff --git a/similarities/job_script.sh b/similarities/job_script.sh index 1f363cd..0c37103 100755 --- a/similarities/job_script.sh +++ b/similarities/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 lsi_similarities.py author --outfile=/gscratch/comdata/output//reddit_similarity/subreddit_comment_authors_10k_LSI.feather --topN=10000 +singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh diff --git a/similarities/top_subreddits_by_comments.py b/similarities/top_subreddits_by_comments.py index 1197b51..ff9293c 100644 --- a/similarities/top_subreddits_by_comments.py +++ b/similarities/top_subreddits_by_comments.py @@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%")) df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments")) df = df.join(prop_nsfw,on='subreddit') -df = df.filter(df.prop_nsfw < 0.5) +#df = df.filter(df.prop_nsfw < 0.5) win = Window.orderBy(f.col('n_comments').desc()) df = df.withColumn('comments_rank', f.rank().over(win)) @@ -26,4 +26,4 @@ df = df.toPandas() df = df.sort_values("n_comments") -df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv', index=False) +df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False) From 6e43294a41e030e557d7e612f1e6ddb063482689 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Tue, 3 Aug 2021 15:06:48 -0700 Subject: [PATCH 13/22] Updates to similarities code for smap project. 
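The LSI similarity entry points now take an explicit input path, accept a
list of latent dimensionalities, and pickle the fitted TruncatedSVD model
next to the output similarity matrices. As a rough sketch of the intended
call (the paths are just the example defaults from lsi_similarities.py, and
the import assumes you run from the similarities/ directory):

    from lsi_similarities import term_lsi_similarities

    # fits one SVD at the largest requested dimensionality, then writes one
    # similarity matrix per dimensionality under outfile, plus a pickled
    # <max_dim>_term_LSIMOD.pkl model in the same directory
    term_lsi_similarities(
        inpath="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_terms_compex.parquet/",
        outfile="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI",
        included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt",
        n_components=[10, 50, 100],
        algorithm="arpack",
    )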
--- similarities/cosine_similarities.py | 2 +- similarities/lsi_similarities.py | 48 ++++++--- similarities/similarities_helper.py | 118 +++++++++++++-------- similarities/tfidf.py | 21 ++-- similarities/weekly_cosine_similarities.py | 73 ++++++++----- 5 files changed, 171 insertions(+), 91 deletions(-) diff --git a/similarities/cosine_similarities.py b/similarities/cosine_similarities.py index 8b85692..98f1454 100644 --- a/similarities/cosine_similarities.py +++ b/similarities/cosine_similarities.py @@ -6,7 +6,7 @@ from functools import partial def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): - return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) + return similarities(inpath=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) # change so that these take in an input as an optional argument (for speed, but also for idf). def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None): diff --git a/similarities/lsi_similarities.py b/similarities/lsi_similarities.py index 7ab7e8c..eb89f55 100644 --- a/similarities/lsi_similarities.py +++ b/similarities/lsi_similarities.py @@ -1,20 +1,41 @@ import pandas as pd import fire from pathlib import Path -from similarities_helper import similarities, lsi_column_similarities +from similarities_helper import * +#from similarities_helper import similarities, lsi_column_similarities from functools import partial -def lsi_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack'): +inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_terms_compex.parquet/" +term_colname='term' +outfile='/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI' +n_components=[10,50,100] +included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" +n_iter=5 +random_state=1968 +algorithm='arpack' +topN = None +from_date=None +to_date=None +min_df=None +max_df=None +def lsi_similarities(inpath, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack',lsi_model=None): print(n_components,flush=True) - simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm) + + if lsi_model is None: + if type(n_components) == list: + lsi_model = Path(outfile) / f'{max(n_components)}_{term_colname}_LSIMOD.pkl' + else: + lsi_model = Path(outfile) / f'{n_components}_{term_colname}_LSIMOD.pkl' - return similarities(infile=infile, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, 
tfidf_colname=tfidf_colname) + simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm,lsi_model_save=lsi_model) + + return similarities(inpath=inpath, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname) # change so that these take in an input as an optional argument (for speed, but also for idf). -def term_lsi_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): +def term_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',outfile=None, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, algorithm='arpack', n_components=300,n_iter=5,random_state=1968): - return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', + res = lsi_similarities(inpath, 'term', outfile, min_df, @@ -23,11 +44,13 @@ def term_lsi_similarities(outfile, min_df=None, max_df=None, included_subreddits topN, from_date, to_date, - n_components=n_components + n_components=n_components, + algorithm = algorithm ) + return res -def author_lsi_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): - return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', +def author_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,algorithm='arpack',n_components=300,n_iter=5,random_state=1968): + return lsi_similarities(inpath, 'author', outfile, min_df, @@ -39,8 +62,8 @@ def author_lsi_similarities(outfile, min_df=2, max_df=None, included_subreddits= n_components=n_components ) -def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968,algorithm='arpack'): - return lsi_similarities('/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', +def author_tf_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968): + return lsi_similarities(inpath, 'author', outfile, min_df, @@ -50,7 +73,8 @@ def author_tf_similarities(outfile, min_df=2, max_df=None, included_subreddits=N from_date=from_date, to_date=to_date, tfidf_colname='relative_tf', - n_components=n_components + n_components=n_components, + algorithm=algorithm ) diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index e59563e..a4983b3 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -15,24 +15,53 @@ import numpy as np import pathlib from datetime import datetime from pathlib import Path +import pickle class tf_weight(Enum): MaxTF = 1 Norm05 = 2 -infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet" -cache_file = 
"/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet" +# infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet" +# cache_file = "/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet" # subreddits missing after this step don't have any terms that have a high enough idf # try rewriting without merges -def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF): - print("loading tfidf", flush=True) - tfidf_ds = ds.dataset(infile) + +# does reindex_tfidf, but without reindexing. +def reindex_tfidf(*args, **kwargs): + df, tfidf_ds, ds_filter = _pull_or_reindex_tfidf(*args, **kwargs, reindex=True) + + print("assigning names") + subreddit_names = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id']) + batches = subreddit_names.to_batches() + + with Pool(cpu_count()) as pool: + chunks = pool.imap_unordered(pull_names,batches) + subreddit_names = pd.concat(chunks,copy=False).drop_duplicates() + subreddit_names = subreddit_names.set_index("subreddit_id") + + new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() + new_ids = new_ids.set_index('subreddit_id') + subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() + subreddit_names = subreddit_names.drop("subreddit_id",1) + subreddit_names = subreddit_names.sort_values("subreddit_id_new") + return(df, subreddit_names) + +def pull_tfidf(*args, **kwargs): + df, _, _ = _pull_or_reindex_tfidf(*args, **kwargs, reindex=False) + return df + +def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True): + print(f"loading tfidf {infile}", flush=True) + if week is not None: + tfidf_ds = ds.dataset(infile, partitioning='hive') + else: + tfidf_ds = ds.dataset(infile) if included_subreddits is None: included_subreddits = select_topN_subreddits(topN) else: - included_subreddits = set(open(included_subreddits)) + included_subreddits = set(map(str.strip,open(included_subreddits))) ds_filter = ds.field("subreddit").isin(included_subreddits) @@ -68,15 +97,20 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre 'relative_tf':ds.field('relative_tf').cast('float32'), 'tf_idf':ds.field('tf_idf').cast('float32')} - tfidf_ds = ds.dataset(infile) - df = tfidf_ds.to_table(filter=ds_filter,columns=projection) df = df.to_pandas(split_blocks=True,self_destruct=True) print("assigning indexes",flush=True) - df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() - grouped = df.groupby(term_id) - df[term_id_new] = grouped.ngroup() + if reindex: + df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + else: + df['subreddit_id_new'] = df['subreddit_id'] + + if reindex: + grouped = df.groupby(term_id) + df[term_id_new] = grouped.ngroup() + else: + df[term_id_new] = df[term_id] if rescale_idf: print("computing idf", flush=True) @@ -88,26 +122,13 @@ def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subre else: # tf_fam = tf_weight.Norm05 df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf - print("assigning names") - subreddit_names = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id']) - batches = subreddit_names.to_batches() + return (df, tfidf_ds, 
ds_filter) - with Pool(cpu_count()) as pool: - chunks = pool.imap_unordered(pull_names,batches) - subreddit_names = pd.concat(chunks,copy=False).drop_duplicates() - - subreddit_names = subreddit_names.set_index("subreddit_id") - new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() - new_ids = new_ids.set_index('subreddit_id') - subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() - subreddit_names = subreddit_names.drop("subreddit_id",1) - subreddit_names = subreddit_names.sort_values("subreddit_id_new") - return(df, subreddit_names) def pull_names(batch): return(batch.to_pandas().drop_duplicates()) -def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): +def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'): ''' tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities. ''' @@ -127,7 +148,7 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non output_feather = Path(str(p).replace("".join(p.suffixes), ".feather")) output_csv = Path(str(p).replace("".join(p.suffixes), ".csv")) output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet")) - outfile.parent.mkdir(exist_ok=True, parents=True) + p.parent.mkdir(exist_ok=True, parents=True) sims.to_feather(outfile) @@ -135,7 +156,7 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non term_id = term + '_id' term_id_new = term + '_id_new' - entries, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date) + entries, subreddit_names = reindex_tfidf(inpath, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date) mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new], entries.subreddit_id_new))) print("loading matrix") @@ -144,6 +165,7 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non print(f'computing similarities on mat. mat.shape:{mat.shape}') print(f"size of mat is:{mat.data.nbytes}",flush=True) + # transform this to debug term tfidf sims = simfunc(mat) del mat @@ -151,7 +173,7 @@ def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=Non for simmat, name in sims: proc_sims(simmat, Path(outfile)/(str(name) + ".feather")) else: - proc_sims(simmat, outfile) + proc_sims(sims, outfile) def write_weekly_similarities(path, sims, week, names): sims['week'] = week @@ -204,7 +226,7 @@ def test_lsi_sims(): # if n_components is a list we'll return a list of similarities with different latent dimensionalities # if algorithm is 'randomized' instead of 'arpack' then n_iter gives the number of iterations. 
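# e.g. lsi_column_similarities(tfidfmat, n_components=[850, 600]) fits a single SVD at 850
# dimensions and then yields (similarity_matrix, n_dims) pairs for 850 and for 600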
# this function takes the svd and then the column similarities of it -def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized'): +def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None): # first compute the lsi of the matrix # then take the column similarities print("running LSI",flush=True) @@ -215,21 +237,32 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196 n_components = sorted(n_components,reverse=True) svd_components = n_components[0] - svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter) - mod = svd.fit(tfidfmat.T) + + if lsi_model_load is not None: + mod = pickle.load(open(lsi_model_load ,'rb')) + + else: + svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter) + mod = svd.fit(tfidfmat.T) + lsimat = mod.transform(tfidfmat.T) + if lsi_model_save is not None: + pickle.dump(mod, open(lsi_model_save,'wb')) + + sims_list = [] for n_dims in n_components: sims = column_similarities(lsimat[:,np.arange(n_dims)]) if len(n_components) > 1: yield (sims, n_dims) else: return sims - def column_similarities(mat): return 1 - pairwise_distances(mat,metric='cosine') - +# need to rewrite this so that subreddit ids and term ids are fixed over the whole thing. +# this affords taking the LSI similarities. +# fill all 0s if we don't have it. def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05): term = term_colname term_id = term + '_id' @@ -254,20 +287,21 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig idf = idf.withColumn('idf',f.log(idf.subreddits_in_week) / (1+f.col('count'))+1) # collect the dictionary to make a pydict of terms to indexes - terms = idf.select([term,'week']).distinct() # terms are distinct + terms = idf.select([term]).distinct() # terms are distinct - terms = terms.withColumn(term_id,f.row_number().over(Window.partitionBy('week').orderBy(term))) # term ids are distinct + terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct # make subreddit ids - subreddits = df.select(['subreddit','week']).distinct() - subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.partitionBy("week").orderBy("subreddit"))) + subreddits = df.select(['subreddit']).distinct() + subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit"))) - df = df.join(subreddits,on=['subreddit','week']) + # df = df.cache() + df = df.join(subreddits,on=['subreddit']) # map terms to indexes in the tfs and the idfs - df = df.join(terms,on=[term,'week']) # subreddit-term-id is unique + df = df.join(terms,on=[term]) # subreddit-term-id is unique - idf = idf.join(terms,on=[term,'week']) + idf = idf.join(terms,on=[term]) # join on subreddit/term to create tf/dfs indexed by term df = df.join(idf, on=[term_id, term,'week']) @@ -327,7 +361,7 @@ def _calc_tfidf(df, term_colname, tf_family): return df -def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05): +def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05): term = term_colname term_id = term + '_id' # aggregate counts by week. 
now subreddit-term is distinct diff --git a/similarities/tfidf.py b/similarities/tfidf.py index 002e89f..94dcbf5 100644 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -1,7 +1,7 @@ import fire from pyspark.sql import SparkSession from pyspark.sql import functions as f -from similarities_helper import build_tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits +from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits): spark = SparkSession.builder.getOrCreate() @@ -11,7 +11,7 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_ df = df.filter(~ f.col(term_colname).isin(exclude)) if included_subreddits is not None: - include_subs = list(open(included_subreddits)) + include_subs = set(map(str.strip,open(included_subreddits))) else: include_subs = select_topN_subreddits(topN) @@ -21,42 +21,45 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_ spark.stop() def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits): - return _tfidf_wrapper(build_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) + return _tfidf_wrapper(tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddits): return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet', - topN=25000): + topN=None, + included_subreddits=None): return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", outpath, topN, 'author', ['[deleted]','AutoModerator'], - included_subreddits=None + included_subreddits=included_subreddits ) def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet', - topN=25000): + topN=None, + included_subreddits=None): return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", outpath, topN, 'term', [], - included_subreddits=None + included_subreddits=included_subreddits ) def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', - topN=25000): + topN=None, + include_subreddits=None): return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", outpath, topN, 'author', ['[deleted]','AutoModerator'], - included_subreddits=None + included_subreddits=included_subreddits ) def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py index e24ceee..7cafcb9 100644 --- a/similarities/weekly_cosine_similarities.py +++ b/similarities/weekly_cosine_similarities.py @@ -8,32 +8,47 @@ import pandas as pd import fire from itertools import islice, chain from pathlib import Path -from similarities_helper import * +from similarities_helper import pull_tfidf, column_similarities, write_weekly_similarities +from scipy.sparse import csr_matrix from multiprocessing import Pool, cpu_count from functools import partial +# infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet" +# tfidf_path = infile +# min_df=None +# max_df = None +# topN=100 +# term_colname='author' +# outfile = 
'/gscratch/comdata/output/reddit_similarity/weekly/comment_authors_test.parquet' +# included_subreddits=None -def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, included_subreddits, topN, outdir:Path): +def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, included_subreddits, topN, outdir:Path, subreddit_names, nterms): term = term_colname term_id = term + '_id' term_id_new = term + '_id_new' print(f"loading matrix: {week}") - entries, subreddit_names = reindex_tfidf(infile = tfidf_path, - term_colname=term_colname, - min_df=min_df, - max_df=max_df, - included_subreddits=included_subreddits, - topN=topN, - week=week) - mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new], entries.subreddit_id_new))) + + entries = pull_tfidf(infile = tfidf_path, + term_colname=term_colname, + min_df=min_df, + max_df=max_df, + included_subreddits=included_subreddits, + topN=topN, + week=week.isoformat(), + rescale_idf=False) + + tfidf_colname='tf_idf' + # if the max subreddit id we found is less than the number of subreddit names then we have to fill in 0s + mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)),shape=(nterms,subreddit_names.shape[0])) + print('computing similarities') - sims = column_similarities(mat) + sims = simfunc(mat.T) del mat - sims = pd.DataFrame(sims.todense()) + sims = pd.DataFrame(sims) sims = sims.rename({i: sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1) - sims['_subreddit'] = names.subreddit.values + sims['_subreddit'] = subreddit_names.subreddit.values outfile = str(Path(outdir) / str(week)) - write_weekly_similarities(outfile, sims, week, names) + write_weekly_similarities(outfile, sims, week, subreddit_names) def pull_weeks(batch): return set(batch.to_pandas()['week']) @@ -41,25 +56,29 @@ def pull_weeks(batch): #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet') def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, max_df=None, included_subreddits = None, topN = 500): print(outfile) - tfidf_ds = ds.dataset(tfidf_path) - tfidf_ds = tfidf_ds.to_table(columns=["week"]) - batches = tfidf_ds.to_batches() - - with Pool(cpu_count()) as pool: - weeks = set(chain( * pool.imap_unordered(pull_weeks,batches))) - - weeks = sorted(weeks) # do this step in parallel if we have the memory for it. 
# should be doable with pool.map - print(f"computing weekly similarities") - week_similarities_helper = partial(_week_similarities,simfunc=column_similarities, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df,max_df=max_df,included_subreddits=included_subreddits,topN=topN) + spark = SparkSession.builder.getOrCreate() + df = spark.read.parquet(tfidf_path) + subreddit_names = df.select(['subreddit','subreddit_id']).distinct().toPandas() + subreddit_names = subreddit_names.sort_values("subreddit_id") + nterms = df.select(f.max(f.col(term_colname + "_id")).alias('max')).collect()[0].max + weeks = df.select(f.col("week")).distinct().toPandas().week.values + spark.stop() + + print(f"computing weekly similarities") + week_similarities_helper = partial(_week_similarities,simfunc=column_similarities, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df,max_df=max_df,included_subreddits=included_subreddits,topN=topN, subreddit_names=subreddit_names,nterms=nterms) + + pool = Pool(cpu_count()) + + list(pool.imap(week_similarities_helper,weeks)) + pool.close() + # with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine? - with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine? - list(pool.map(week_similarities_helper,weeks)) def author_cosine_similarities_weekly(outfile, min_df=2, max_df=None, included_subreddits=None, topN=500): - return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', + return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', outfile, 'author', min_df, From 541e125b28dbca5c06d2160a5cd59ce112657b2a Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 11 Aug 2021 22:48:33 -0700 Subject: [PATCH 14/22] lsi support for weekly similarities --- clustering/hdbscan_clustering.py | 6 +- clustering/hdbscan_clustering_lsi.py | 6 +- clustering/lsi_base.py | 3 +- clustering/pick_best_clustering.py | 13 ++-- similarities/similarities_helper.py | 8 ++- similarities/tfidf.py | 22 ++++--- similarities/weekly_cosine_similarities.py | 75 +++++++++++++++++----- 7 files changed, 95 insertions(+), 38 deletions(-) mode change 100644 => 100755 clustering/pick_best_clustering.py mode change 100644 => 100755 similarities/weekly_cosine_similarities.py diff --git a/clustering/hdbscan_clustering.py b/clustering/hdbscan_clustering.py index e533808..32cdf95 100644 --- a/clustering/hdbscan_clustering.py +++ b/clustering/hdbscan_clustering.py @@ -18,12 +18,12 @@ def test_select_hdbscan_clustering(): # cluster_selection_epsilons=[0,0.05,0.1,0.15], # cluster_selection_methods=['eom','leaf'], # lsi_dimensions='all') - inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/" + inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI" outpath = "test_hdbscan"; min_cluster_sizes=[2,3,4]; min_samples=[1,2,3]; cluster_selection_epsilons=[0,0.1,0.3,0.5]; - cluster_selection_methods=['eom']; + cluster_selection_methods=[1]; lsi_dimensions='all' gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods) gs.run(20) @@ -120,7 +120,7 @@ def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], mi map(int,min_cluster_sizes), map(int,min_samples), map(float,cluster_selection_epsilons), - 
map(float,cluster_selection_methods)) + cluster_selection_methods) obj.run() obj.save(savefile) diff --git a/clustering/hdbscan_clustering_lsi.py b/clustering/hdbscan_clustering_lsi.py index cbd44bd..a4c1efd 100644 --- a/clustering/hdbscan_clustering_lsi.py +++ b/clustering/hdbscan_clustering_lsi.py @@ -67,7 +67,7 @@ class _hdbscan_lsi_grid_sweep(grid_sweep): s += f"_lsi-{self.lsi_dim}" return s -def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'],lsi_dimensions='all'): +def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=[1],lsi_dimensions='all'): """Run hdbscan clustering once or more with different parameters. Usage: @@ -90,8 +90,8 @@ def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2] list(map(int,min_cluster_sizes)), list(map(int,min_samples)), list(map(float,cluster_selection_epsilons)), - cluster_selection_methods - ) + cluster_selection_methods) + obj.run(10) obj.save(savefile) diff --git a/clustering/lsi_base.py b/clustering/lsi_base.py index f07bca6..80b7101 100644 --- a/clustering/lsi_base.py +++ b/clustering/lsi_base.py @@ -18,10 +18,11 @@ class lsi_grid_sweep(grid_sweep): self.subsweep = subsweep inpath = Path(inpath) if lsi_dimensions == 'all': - lsi_paths = list(inpath.glob("*")) + lsi_paths = list(inpath.glob("*.feather")) else: lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions] + print(lsi_paths) lsi_nums = [int(p.stem) for p in lsi_paths] self.hasrun = False self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] diff --git a/clustering/pick_best_clustering.py b/clustering/pick_best_clustering.py old mode 100644 new mode 100755 index c541d23..e05e3ac --- a/clustering/pick_best_clustering.py +++ b/clustering/pick_best_clustering.py @@ -1,11 +1,12 @@ +#!/usr/bin/env python3 import fire import pandas as pd from pathlib import Path import shutil -selection_data="/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv" +selection_data="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/clustering/comment_authors_compex_LSI/selection_data.csv" outpath = 'test_best.feather' -min_clusters=50; max_isolates=5000; min_cluster_size=2 +min_clusters=50; max_isolates=7500; min_cluster_size=2 # pick the best clustering according to silhouette score subject to contraints def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size): @@ -18,11 +19,15 @@ def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min df.loc[df.n_isolates_0,'n_isolates'] = 0 df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l)) - best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)].iloc[df.shape[1]] + best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)] + best_cluster = best_cluster.iloc[0] + + best_lsi_dimensions = best_cluster.lsi_dimensions print(best_cluster.to_dict()) best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather") shutil.copy(best_path,output) - + print(f"lsi dimensions:{best_lsi_dimensions}") + if __name__ == "__main__": 
fire.Fire(pick_best_clustering) diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index 13845d1..d97e519 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -97,6 +97,7 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu 'relative_tf':ds.field('relative_tf').cast('float32'), 'tf_idf':ds.field('tf_idf').cast('float32')} + print(projection) df = tfidf_ds.to_table(filter=ds_filter,columns=projection) @@ -240,7 +241,6 @@ def test_lsi_sims(): def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None): # first compute the lsi of the matrix # then take the column similarities - print("running LSI",flush=True) if type(n_components) is int: n_components = [n_components] @@ -249,10 +249,14 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196 svd_components = n_components[0] - if lsi_model_load is not None: + if lsi_model_load is not None and Path(lsi_model_load).exists(): + print("loading LSI") mod = pickle.load(open(lsi_model_load ,'rb')) + lsi_model_save = lsi_model_load else: + print("running LSI",flush=True) + svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter) mod = svd.fit(tfidfmat.T) diff --git a/similarities/tfidf.py b/similarities/tfidf.py index 19d3013..01b0b20 100644 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -4,7 +4,7 @@ from pyspark.sql import functions as f from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits): - spark = SparkSession.builder.getOrCreate() + spark = SparkSession.builder.getOrCreate()y df = spark.read.parquet(inpath) @@ -26,11 +26,12 @@ def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits): def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddits): return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) -def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet', +def tfidf_authors(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", + outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet', topN=None, included_subreddits=None): - return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", + return tfidf(inpath, outpath, topN, 'author', @@ -38,11 +39,12 @@ def tfidf_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comm included_subreddits=included_subreddits ) -def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet', +def tfidf_terms(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", + outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet', topN=None, included_subreddits=None): - return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", + return tfidf(inpath, outpath, topN, 'term', @@ -50,11 +52,12 @@ def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/commen included_subreddits=included_subreddits ) -def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', +def 
tfidf_authors_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", + outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', topN=None, included_subreddits=None): - return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", + return tfidf_weekly(inpath, outpath, topN, 'author', @@ -62,12 +65,13 @@ def tfidf_authors_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfi included_subreddits=included_subreddits ) -def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', +def tfidf_terms_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", + outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', topN=None, included_subreddits=None): - return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", + return tfidf_weekly(inpath, outpath, topN, 'term', diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py old mode 100644 new mode 100755 index 7cafcb9..6ce30b8 --- a/similarities/weekly_cosine_similarities.py +++ b/similarities/weekly_cosine_similarities.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from pyspark.sql import functions as f from pyspark.sql import SparkSession from pyspark.sql import Window @@ -8,17 +9,18 @@ import pandas as pd import fire from itertools import islice, chain from pathlib import Path -from similarities_helper import pull_tfidf, column_similarities, write_weekly_similarities +from similarities_helper import pull_tfidf, column_similarities, write_weekly_similarities, lsi_column_similarities from scipy.sparse import csr_matrix from multiprocessing import Pool, cpu_count from functools import partial -# infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet" -# tfidf_path = infile -# min_df=None -# max_df = None -# topN=100 -# term_colname='author' +infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_10k.parquet" +tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet" +min_df=None +included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" +max_df = None +topN=100 +term_colname='author' # outfile = '/gscratch/comdata/output/reddit_similarity/weekly/comment_authors_test.parquet' # included_subreddits=None @@ -34,7 +36,7 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, - week=week.isoformat(), + week=week, rescale_idf=False) tfidf_colname='tf_idf' @@ -42,7 +44,7 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)),shape=(nterms,subreddit_names.shape[0])) print('computing similarities') - sims = simfunc(mat.T) + sims = simfunc(mat) del mat sims = pd.DataFrame(sims) sims = sims.rename({i: sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1) @@ -53,14 +55,28 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, def pull_weeks(batch): return set(batch.to_pandas()['week']) +# This requires a prefit LSI model, since we shouldn't fit different LSI models for every week. 
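The comment above records the key design constraint for weekly LSI similarities: the SVD should be fit once on the full-period tfidf matrix and then reused, so every week is projected into the same latent space. A minimal sketch of that fit-once, transform-many pattern, assuming a scipy sparse tfidf matrix; the model path and helper names here are illustrative, not the exact ones used in this repository:

import pickle
from pathlib import Path
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import pairwise_distances

def fit_or_load_lsi(tfidf_mat, n_components=100, model_path="lsi_model.pkl"):
    # fit the truncated SVD once on the whole-period matrix and persist it
    model_path = Path(model_path)
    if model_path.exists():
        with open(model_path, "rb") as f:
            return pickle.load(f)
    mod = TruncatedSVD(n_components=n_components, algorithm="randomized", random_state=1968)
    mod.fit(tfidf_mat.T)  # transpose so rows are subreddits
    with open(model_path, "wb") as f:
        pickle.dump(mod, f)
    return mod

def weekly_lsi_similarities(week_mat, mod):
    # project a single week into the latent space learned on the full period
    lsimat = mod.transform(week_mat.T)
    return 1 - pairwise_distances(lsimat, metric="cosine")

Refitting per week would give each week its own, incomparable latent dimensions; reusing one model keeps the weekly similarity matrices comparable over time.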
+def cosine_similarities_weekly_lsi(n_components=100, lsi_model=None, *args, **kwargs): + term_colname= kwargs.get('term_colname') + #lsi_model = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI/1000_term_LSIMOD.pkl" + + # simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm='randomized',lsi_model_load=lsi_model) + + simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=kwargs.get('n_iter'),random_state=kwargs.get('random_state'),algorithm=kwargs.get('algorithm'),lsi_model_load=lsi_model) + + return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs) + #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet') -def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, max_df=None, included_subreddits = None, topN = 500): +def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, max_df=None, included_subreddits = None, topN = 500, simfunc=column_similarities): print(outfile) # do this step in parallel if we have the memory for it. # should be doable with pool.map spark = SparkSession.builder.getOrCreate() df = spark.read.parquet(tfidf_path) + + # load subreddits + topN + subreddit_names = df.select(['subreddit','subreddit_id']).distinct().toPandas() subreddit_names = subreddit_names.sort_values("subreddit_id") nterms = df.select(f.max(f.col(term_colname + "_id")).alias('max')).collect()[0].max @@ -68,7 +84,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, spark.stop() print(f"computing weekly similarities") - week_similarities_helper = partial(_week_similarities,simfunc=column_similarities, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df,max_df=max_df,included_subreddits=included_subreddits,topN=topN, subreddit_names=subreddit_names,nterms=nterms) + week_similarities_helper = partial(_week_similarities,simfunc=simfunc, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df,max_df=max_df,included_subreddits=included_subreddits,topN=topN, subreddit_names=subreddit_names,nterms=nterms) pool = Pool(cpu_count()) @@ -77,8 +93,8 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, # with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine? 
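The driver above binds every constant argument of _week_similarities with functools.partial so the worker pool only has to pass the one value that varies, the week. A small stand-alone sketch of that pattern; the function body and paths are placeholders, not the real computation:

from functools import partial
from multiprocessing import Pool, cpu_count

def process_week(week, tfidf_path, term_colname, outdir):
    # stand-in for _week_similarities: load the week's tfidf, compute sims, write output
    return f"{outdir}/{week} ({term_colname} from {tfidf_path})"

if __name__ == "__main__":
    weeks = ["2020-01-06", "2020-01-13", "2020-01-20"]
    helper = partial(process_week,
                     tfidf_path="tfidf_weekly.parquet",
                     term_colname="author",
                     outdir="weekly_sims")
    with Pool(cpu_count()) as pool:
        print(list(pool.imap(helper, weeks)))

Because imap yields results lazily and in order, wrapping it in list() forces all weeks to be processed before the pool is closed.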
-def author_cosine_similarities_weekly(outfile, min_df=2, max_df=None, included_subreddits=None, topN=500): - return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', +def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500): + return cosine_similarities_weekly(infile, outfile, 'author', min_df, @@ -86,8 +102,8 @@ def author_cosine_similarities_weekly(outfile, min_df=2, max_df=None, included_s included_subreddits, topN) -def term_cosine_similarities_weekly(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500): - return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', +def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None): + return cosine_similarities_weekly(infile, outfile, 'term', min_df, @@ -95,6 +111,33 @@ def term_cosine_similarities_weekly(outfile, min_df=None, max_df=None, included_ included_subreddits, topN) + +def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=None,n_components=100,lsi_model=None): + return cosine_similarities_weekly_lsi(infile, + outfile, + 'author', + min_df, + max_df, + included_subreddits, + topN, + n_components=n_components, + lsi_model=lsi_model) + + +def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500,n_components=100,lsi_model=None): + return cosine_similarities_weekly_lsi(infile, + outfile, + 'term', + min_df, + max_df, + included_subreddits, + topN, + n_components=n_components, + lsi_model=lsi_model) + if __name__ == "__main__": fire.Fire({'authors':author_cosine_similarities_weekly, - 'terms':term_cosine_similarities_weekly}) + 'terms':term_cosine_similarities_weekly, + 'authors-lsi':author_cosine_similarities_weekly_lsi, + 'terms-lsi':term_cosine_similarities_weekly + }) From 98c1317af5da5aafd1e7acb31911ca4333312571 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Fri, 10 Dec 2021 21:23:32 -0800 Subject: [PATCH 15/22] update pushshift dumps. 
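The changes below broaden the comment dump patterns from RC_201* to RC_20*, quote the user-agent string passed to wget, and stop fetching the daily checksum list, while the downloaded archives are still checked against Pushshift's published sha256sum.txt by check_comments_shas.py. For reference, a minimal sketch of that kind of verification, assuming the usual "<sha256>  <filename>" manifest format; paths and names are illustrative:

import hashlib
from os import path

def sha256_of(fpath, chunk=1 << 20):
    # hash the file in chunks so large dump archives don't have to fit in memory
    h = hashlib.sha256()
    with open(fpath, "rb") as f:
        for block in iter(lambda: f.read(chunk), b""):
            h.update(block)
    return h.hexdigest()

def check_dumps(manifest_text, dumpdir):
    for line in manifest_text.splitlines():
        parts = line.split()
        if len(parts) != 2:
            continue
        digest, fname = parts
        fpath = path.join(dumpdir, fname)
        if path.exists(fpath) and sha256_of(fpath) != digest:
            print(f"mismatch: {fname}")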
--- dumps/check_comments_shas.py | 2 +- dumps/pull_pushshift_comments.sh | 8 ++++---- dumps/pull_pushshift_submissions.sh | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dumps/check_comments_shas.py b/dumps/check_comments_shas.py index 199261c..dd428be 100755 --- a/dumps/check_comments_shas.py +++ b/dumps/check_comments_shas.py @@ -6,7 +6,7 @@ from os import path import hashlib shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text -shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text +#shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text shasums = shasums1 + shasums2 dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments" diff --git a/dumps/pull_pushshift_comments.sh b/dumps/pull_pushshift_comments.sh index 40d82d8..ab309dd 100755 --- a/dumps/pull_pushshift_comments.sh +++ b/dumps/pull_pushshift_comments.sh @@ -1,12 +1,12 @@ #!/bin/bash -user_agent='nathante teblunthuis ' +user_agent='"nathante teblunthuis "' output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments' base_url='https://files.pushshift.io/reddit/comments/' -wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url -wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url -wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RC_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RC_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RC_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url ./check_comments_shas.py diff --git a/dumps/pull_pushshift_submissions.sh b/dumps/pull_pushshift_submissions.sh index 99d89be..36e6473 100755 --- a/dumps/pull_pushshift_submissions.sh +++ b/dumps/pull_pushshift_submissions.sh @@ -1,14 +1,14 @@ #!/bin/bash -user_agent='nathante teblunthuis ' +user_agent='"nathante teblunthuis "' output_dir='/gscratch/comdata/raw_data/reddit_dumps/submissions' base_url='https://files.pushshift.io/reddit/submissions/' -wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url -wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url -wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url -wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ -wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ -wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ +wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url +wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ +wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ +wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/ ./check_submission_shas.py From 7b130a30af863dfa727d80d9fea23648dcc9d5d8 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 19 Jan 2022 13:57:02 -0800 Subject: [PATCH 16/22] commit changes from smap project. 
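Among the changes below, the reindexed subreddit and term ids now start at 1 and the similarity code subtracts 1 when building its sparse matrices, in the weekly case with an explicit shape. A small, self-contained illustration of why the explicit shape plus the -1 shift keeps rows and columns aligned even when some ids are missing from a given slice of data (all numbers here are made up):

import numpy as np
from scipy.sparse import csr_matrix

nterms, nsubs = 5, 4                  # sizes of the global term / subreddit index
term_id_new = np.array([1, 2, 5])     # 1-based ids present in this slice
subreddit_id_new = np.array([1, 3, 3])
tf_idf = np.array([0.5, 1.0, 0.25])

mat = csr_matrix((tf_idf, (term_id_new - 1, subreddit_id_new - 1)), shape=(nterms, nsubs))
print(mat.toarray())  # subreddits 2 and 4 get all-zero columns instead of being dropped

Without the explicit shape, a slice that happens to lack the highest-numbered subreddit or term would produce a smaller matrix whose columns no longer line up with the global index.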
--- density/overlap_density.py | 6 +- similarities/lsi_similarities.py | 34 ++++----- similarities/similarities_helper.py | 88 +++++++++++----------- similarities/tfidf.py | 54 +++++++++---- similarities/weekly_cosine_similarities.py | 86 +++++++++++---------- timeseries/cluster_timeseries.py | 14 ++-- 6 files changed, 160 insertions(+), 122 deletions(-) diff --git a/density/overlap_density.py b/density/overlap_density.py index 2036824..ef0eb26 100644 --- a/density/overlap_density.py +++ b/density/overlap_density.py @@ -4,9 +4,9 @@ from pathlib import Path import fire import numpy as np import sys -sys.path.append("..") -sys.path.append("../similarities") -from similarities.similarities_helper import reindex_tfidf +# sys.path.append("..") +# sys.path.append("../similarities") +# from similarities.similarities_helper import pull_tfidf # this is the mean of the ratio of the overlap to the focal size. # mean shared membership per focal community member diff --git a/similarities/lsi_similarities.py b/similarities/lsi_similarities.py index eb89f55..565e53f 100644 --- a/similarities/lsi_similarities.py +++ b/similarities/lsi_similarities.py @@ -5,28 +5,28 @@ from similarities_helper import * #from similarities_helper import similarities, lsi_column_similarities from functools import partial -inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_terms_compex.parquet/" -term_colname='term' -outfile='/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI' -n_components=[10,50,100] -included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" -n_iter=5 -random_state=1968 -algorithm='arpack' -topN = None -from_date=None -to_date=None -min_df=None -max_df=None +# inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet" +# term_colname='authors' +# outfile='/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_test_compex_LSI' +# n_components=[10,50,100] +# included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" +# n_iter=5 +# random_state=1968 +# algorithm='randomized' +# topN = None +# from_date=None +# to_date=None +# min_df=None +# max_df=None + def lsi_similarities(inpath, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack',lsi_model=None): print(n_components,flush=True) - if lsi_model is None: if type(n_components) == list: - lsi_model = Path(outfile) / f'{max(n_components)}_{term_colname}_LSIMOD.pkl' + lsi_model = Path(outfile) / f'{max(n_components)}_{term_colname}s_LSIMOD.pkl' else: - lsi_model = Path(outfile) / f'{n_components}_{term_colname}_LSIMOD.pkl' + lsi_model = Path(outfile) / f'{n_components}_{term_colname}s_LSIMOD.pkl' simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm,lsi_model_save=lsi_model) @@ -62,7 +62,7 @@ def author_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/t n_components=n_components ) -def author_tf_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968): +def 
author_tf_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,algorithm='arpack',n_components=300,n_iter=5,random_state=1968): return lsi_similarities(inpath, 'author', outfile, diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index d97e519..202220c 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -43,7 +43,7 @@ def reindex_tfidf(*args, **kwargs): new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() new_ids = new_ids.set_index('subreddit_id') subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() - subreddit_names = subreddit_names.drop("subreddit_id",1) + subreddit_names = subreddit_names.drop("subreddit_id",axis=1) subreddit_names = subreddit_names.sort_values("subreddit_id_new") return(df, subreddit_names) @@ -51,8 +51,9 @@ def pull_tfidf(*args, **kwargs): df, _, _ = _pull_or_reindex_tfidf(*args, **kwargs, reindex=False) return df -def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True): - print(f"loading tfidf {infile}", flush=True) +def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=None, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True): + print(f"loading tfidf {infile}, week {week}, min_df {min_df}, max_df {max_df}", flush=True) + if week is not None: tfidf_ds = ds.dataset(infile, partitioning='hive') else: @@ -94,23 +95,23 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu projection = { 'subreddit_id':ds.field('subreddit_id'), term_id:ds.field(term_id), - 'relative_tf':ds.field('relative_tf').cast('float32'), 'tf_idf':ds.field('tf_idf').cast('float32')} - print(projection) - + print(projection, flush=True) + print(ds_filter, flush=True) df = tfidf_ds.to_table(filter=ds_filter,columns=projection) df = df.to_pandas(split_blocks=True,self_destruct=True) - print("assigning indexes",flush=True) + if reindex: - df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + print("assigning indexes",flush=True) + df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + 1 else: df['subreddit_id_new'] = df['subreddit_id'] if reindex: grouped = df.groupby(term_id) - df[term_id_new] = grouped.ngroup() + df[term_id_new] = grouped.ngroup() + 1 else: df[term_id_new] = df[term_id] @@ -126,17 +127,17 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu return (df, tfidf_ds, ds_filter) - with Pool(cpu_count()) as pool: - chunks = pool.imap_unordered(pull_names,batches) - subreddit_names = pd.concat(chunks,copy=False).drop_duplicates() + # with Pool(cpu_count()) as pool: + # chunks = pool.imap_unordered(pull_names,batches) + # subreddit_names = pd.concat(chunks,copy=False).drop_duplicates() - subreddit_names = subreddit_names.set_index("subreddit_id") - new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() - new_ids = new_ids.set_index('subreddit_id') - subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() - subreddit_names = subreddit_names.drop("subreddit_id",1) - subreddit_names = subreddit_names.sort_values("subreddit_id_new") - return(df, 
subreddit_names) + # subreddit_names = subreddit_names.set_index("subreddit_id") + # new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates() + # new_ids = new_ids.set_index('subreddit_id') + # subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index() + # subreddit_names = subreddit_names.drop("subreddit_id",1) + # subreddit_names = subreddit_names.sort_values("subreddit_id_new") + # return(df, subreddit_names) def pull_names(batch): return(batch.to_pandas().drop_duplicates()) @@ -170,7 +171,7 @@ def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=Non term_id_new = term + '_id_new' entries, subreddit_names = reindex_tfidf(inpath, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date) - mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new], entries.subreddit_id_new))) + mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1))) print("loading matrix") @@ -238,7 +239,8 @@ def test_lsi_sims(): # if n_components is a list we'll return a list of similarities with different latent dimensionalities # if algorithm is 'randomized' instead of 'arpack' then n_iter gives the number of iterations. # this function takes the svd and then the column similarities of it -def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None): +# lsi_model_load = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI/1000_term_LSIMOD.pkl" +def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model=None): # first compute the lsi of the matrix # then take the column similarities @@ -249,28 +251,24 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196 svd_components = n_components[0] - if lsi_model_load is not None and Path(lsi_model_load).exists(): - print("loading LSI") - mod = pickle.load(open(lsi_model_load ,'rb')) - lsi_model_save = lsi_model_load - - else: + if lsi_model is None: print("running LSI",flush=True) - svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter) mod = svd.fit(tfidfmat.T) + else: + mod = lsi_model lsimat = mod.transform(tfidfmat.T) if lsi_model_save is not None: + Path(lsi_model_save).parent.mkdir(exist_ok=True,parents=True) pickle.dump(mod, open(lsi_model_save,'wb')) - sims_list = [] + print(n_components) for n_dims in n_components: + print("computing similarities") sims = column_similarities(lsimat[:,np.arange(n_dims)]) - if len(n_components) > 1: - yield (sims, n_dims) - else: - return sims + yield (sims, n_dims) + def column_similarities(mat): @@ -326,11 +324,11 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig else: # tf_fam = tf_weight.Norm05 df = df.withColumn("tf_idf", (0.5 + 0.5 * df.relative_tf) * df.idf) - df = df.repartition(400,'subreddit','week') + df = df.repartition('week') dfwriter = df.write.partitionBy("week") return dfwriter -def _calc_tfidf(df, term_colname, tf_family): +def _calc_tfidf(df, term_colname, tf_family, min_df=None, max_df=None): term = term_colname term_id = term + '_id' @@ -348,7 +346,13 @@ def _calc_tfidf(df, term_colname, tf_family): idf = idf.withColumn('idf',f.log(N_docs/(1+f.col('count')))+1) # collect the dictionary to make a pydict of 
terms to indexes - terms = idf.select(term).distinct() # terms are distinct + terms = idf + if min_df is not None: + terms = terms.filter(f.col('count')>=min_df) + if max_df is not None: + terms = terms.filter(f.col('count')<=max_df) + + terms = terms.select(term).distinct() # terms are distinct terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct # make subreddit ids @@ -358,12 +362,12 @@ def _calc_tfidf(df, term_colname, tf_family): df = df.join(subreddits,on='subreddit') # map terms to indexes in the tfs and the idfs - df = df.join(terms,on=term) # subreddit-term-id is unique + df = df.join(terms,on=term,how='inner') # subreddit-term-id is unique - idf = idf.join(terms,on=term) + idf = idf.join(terms,on=term,how='inner') # join on subreddit/term to create tf/dfs indexed by term - df = df.join(idf, on=[term_id, term]) + df = df.join(idf, on=[term_id, term],how='inner') # agg terms by subreddit to make sparse tf/df vectors if tf_family == tf_weight.MaxTF: @@ -374,14 +378,14 @@ def _calc_tfidf(df, term_colname, tf_family): return df -def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05): +def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05, min_df=None, max_df=None): term = term_colname term_id = term + '_id' - # aggregate counts by week. now subreddit-term is distinct + df = df.filter(df.subreddit.isin(include_subs)) df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf')) - df = _calc_tfidf(df, term_colname, tf_family) + df = _calc_tfidf(df, term_colname, tf_family, min_df, max_df) df = df.repartition('subreddit') dfwriter = df.write return dfwriter diff --git a/similarities/tfidf.py b/similarities/tfidf.py index 01b0b20..3356299 100644 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -2,9 +2,12 @@ import fire from pyspark.sql import SparkSession from pyspark.sql import functions as f from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits +from functools import partial -def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits): - spark = SparkSession.builder.getOrCreate()y +inpath = '/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet' +# include_terms is a path to a parquet file that contains a column of term_colname + '_id' to include. 
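The comment above describes the included_terms mechanism: the weekly tfidf build can be restricted to the vocabulary of an existing static tfidf dataset, so the two stay index-compatible. A minimal sketch of that restriction as a left semi join, with made-up data in place of the real parquet inputs:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("askreddit", "cat", 10), ("askreddit", "zzz_rare", 1), ("science", "cat", 4)],
    ["subreddit", "term", "tf"])
vocab = spark.createDataFrame([("cat",), ("dog",)], ["term"])

# left semi join: keep only rows whose term is in the fixed vocabulary; adds no columns
df = df.join(vocab.select("term").distinct(), on="term", how="left_semi")
df.show()

A left semi join is preferable to an inner join here because it filters rows without duplicating them or pulling any columns from the vocabulary table.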
+def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=None, min_df=None, max_df=None): + spark = SparkSession.builder.getOrCreate() df = spark.read.parquet(inpath) @@ -15,50 +18,72 @@ def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_ else: include_subs = select_topN_subreddits(topN) - dfwriter = func(df, include_subs, term_colname) + include_subs = spark.sparkContext.broadcast(include_subs) + + # term_id = term_colname + "_id" + + if included_terms is not None: + terms_df = spark.read.parquet(included_terms) + terms_df = terms_df.select(term_colname).distinct() + df = df.join(terms_df, on=term_colname, how='left_semi') + + dfwriter = func(df, include_subs.value, term_colname) dfwriter.parquet(outpath,mode='overwrite',compression='snappy') spark.stop() -def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits): - return _tfidf_wrapper(tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) +def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits, min_df, max_df): + tfidf_func = partial(tfidf_dataset, max_df=max_df, min_df=min_df) + return _tfidf_wrapper(tfidf_func, inpath, outpath, topN, term_colname, exclude, included_subreddits) + +def tfidf_weekly(inpath, outpath, static_tfidf_path, topN, term_colname, exclude, included_subreddits): + return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=static_tfidf_path) -def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddits): - return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits) def tfidf_authors(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet', topN=None, - included_subreddits=None): + included_subreddits=None, + min_df=None, + max_df=None): return tfidf(inpath, outpath, topN, 'author', ['[deleted]','AutoModerator'], - included_subreddits=included_subreddits + included_subreddits=included_subreddits, + min_df=min_df, + max_df=max_df ) def tfidf_terms(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet', topN=None, - included_subreddits=None): + included_subreddits=None, + min_df=None, + max_df=None): return tfidf(inpath, outpath, topN, 'term', [], - included_subreddits=included_subreddits + included_subreddits=included_subreddits, + min_df=min_df, + max_df=max_df ) def tfidf_authors_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet", + static_tfidf_path="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet", outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', topN=None, - included_subreddits=None): + included_subreddits=None + ): return tfidf_weekly(inpath, outpath, + static_tfidf_path, topN, 'author', ['[deleted]','AutoModerator'], @@ -66,13 +91,16 @@ def tfidf_authors_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_ ) def tfidf_terms_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet", + static_tfidf_path="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet", outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', topN=None, - included_subreddits=None): + 
included_subreddits=None + ): return tfidf_weekly(inpath, outpath, + static_tfidf_path, topN, 'term', [], diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py index 6ce30b8..45327c7 100755 --- a/similarities/weekly_cosine_similarities.py +++ b/similarities/weekly_cosine_similarities.py @@ -13,18 +13,23 @@ from similarities_helper import pull_tfidf, column_similarities, write_weekly_si from scipy.sparse import csr_matrix from multiprocessing import Pool, cpu_count from functools import partial +import pickle -infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_10k.parquet" -tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet" -min_df=None -included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" -max_df = None -topN=100 -term_colname='author' -# outfile = '/gscratch/comdata/output/reddit_similarity/weekly/comment_authors_test.parquet' -# included_subreddits=None +# tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity_weekly/comment_authors_tfidf.parquet" +# #tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data//comment_authors_compex.parquet" +# min_df=2 +# included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" +# max_df = None +# topN=100 +# term_colname='author' +# # outfile = '/gscratch/comdata/output/reddit_similarity/weekly/comment_authors_test.parquet' +# # included_subreddits=None +outfile="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity_weekly/comment_authors.parquet"; infile="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_authors_tfidf.parquet"; included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt"; lsi_model="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI/2000_authors_LSIMOD.pkl"; n_components=1500; algorithm="randomized"; term_colname='author'; tfidf_path=infile; random_state=1968; -def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, included_subreddits, topN, outdir:Path, subreddit_names, nterms): +# static_tfidf = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet" +# dftest = spark.read.parquet(static_tfidf) + +def _week_similarities(week, simfunc, tfidf_path, term_colname, included_subreddits, outdir:Path, subreddit_names, nterms, topN=None, min_df=None, max_df=None): term = term_colname term_id = term + '_id' term_id_new = term + '_id_new' @@ -32,20 +37,19 @@ def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, entries = pull_tfidf(infile = tfidf_path, term_colname=term_colname, - min_df=min_df, - max_df=max_df, included_subreddits=included_subreddits, topN=topN, - week=week, + week=week.isoformat(), rescale_idf=False) tfidf_colname='tf_idf' # if the max subreddit id we found is less than the number of subreddit names then we have to fill in 0s mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)),shape=(nterms,subreddit_names.shape[0])) - print('computing similarities') + print(simfunc) sims = simfunc(mat) del mat + sims = next(sims)[0] sims = pd.DataFrame(sims) sims = sims.rename({i: sr for i, sr in 
enumerate(subreddit_names.subreddit.values)}, axis=1) sims['_subreddit'] = subreddit_names.subreddit.values @@ -56,18 +60,20 @@ def pull_weeks(batch): return set(batch.to_pandas()['week']) # This requires a prefit LSI model, since we shouldn't fit different LSI models for every week. -def cosine_similarities_weekly_lsi(n_components=100, lsi_model=None, *args, **kwargs): +def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kwargs): + print(args) + print(kwargs) term_colname= kwargs.get('term_colname') - #lsi_model = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI/1000_term_LSIMOD.pkl" + # lsi_model = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI/1000_author_LSIMOD.pkl" - # simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm='randomized',lsi_model_load=lsi_model) - - simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=kwargs.get('n_iter'),random_state=kwargs.get('random_state'),algorithm=kwargs.get('algorithm'),lsi_model_load=lsi_model) + lsi_model = pickle.load(open(lsi_model,'rb')) + #simfunc = partial(lsi_column_similarities,n_components=n_components,random_state=random_state,algorithm='randomized',lsi_model=lsi_model) + simfunc = partial(lsi_column_similarities,n_components=n_components,random_state=kwargs.get('random_state'),lsi_model=lsi_model) return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs) #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet') -def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, max_df=None, included_subreddits = None, topN = 500, simfunc=column_similarities): +def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=None,max_df=None): print(outfile) # do this step in parallel if we have the memory for it. # should be doable with pool.map @@ -84,12 +90,14 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, spark.stop() print(f"computing weekly similarities") - week_similarities_helper = partial(_week_similarities,simfunc=simfunc, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df,max_df=max_df,included_subreddits=included_subreddits,topN=topN, subreddit_names=subreddit_names,nterms=nterms) + week_similarities_helper = partial(_week_similarities,simfunc=simfunc, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=None, subreddit_names=subreddit_names,nterms=nterms) - pool = Pool(cpu_count()) - - list(pool.imap(week_similarities_helper,weeks)) - pool.close() + for week in weeks: + week_similarities_helper(week) + # pool = Pool(cpu_count()) + + # list(pool.imap(week_similarities_helper, weeks)) + # pool.close() # with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine? 
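Because lsi_column_similarities now always yields (similarity_matrix, n_dims) pairs instead of sometimes returning a bare matrix, the weekly code above consumes it with next(sims)[0]. A toy illustration of that calling convention; the generator body is a stand-in, not the real SVD computation:

import numpy as np

def toy_lsi_column_similarities(mat, n_components=(100, 50)):
    for n_dims in sorted(n_components, reverse=True):
        # stand-in for: project to n_dims with the fitted SVD, then take cosine similarities
        yield (np.eye(mat.shape[1]), n_dims)

mat = np.random.rand(20, 4)

# weekly path: one dimensionality, take the first yielded pair
sims = next(toy_lsi_column_similarities(mat, n_components=(100,)))[0]

# static path: iterate, writing one output per dimensionality
for simmat, n_dims in toy_lsi_column_similarities(mat):
    print(n_dims, simmat.shape)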
@@ -97,10 +105,11 @@ def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/ return cosine_similarities_weekly(infile, outfile, 'author', - min_df, max_df, included_subreddits, - topN) + topN, + min_df=2 +) def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None): return cosine_similarities_weekly(infile, @@ -112,32 +121,29 @@ def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/re topN) -def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=None,n_components=100,lsi_model=None): +def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None): return cosine_similarities_weekly_lsi(infile, outfile, 'author', - min_df, - max_df, - included_subreddits, - topN, + included_subreddits=included_subreddits, n_components=n_components, - lsi_model=lsi_model) + lsi_model=lsi_model + ) -def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500,n_components=100,lsi_model=None): +def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None): return cosine_similarities_weekly_lsi(infile, outfile, 'term', - min_df, - max_df, - included_subreddits, - topN, + included_subreddits=included_subreddits, n_components=n_components, - lsi_model=lsi_model) + lsi_model=lsi_model, + ) if __name__ == "__main__": fire.Fire({'authors':author_cosine_similarities_weekly, 'terms':term_cosine_similarities_weekly, 'authors-lsi':author_cosine_similarities_weekly_lsi, - 'terms-lsi':term_cosine_similarities_weekly + 'terms-lsi':term_cosine_similarities_weekly_lsi }) + diff --git a/timeseries/cluster_timeseries.py b/timeseries/cluster_timeseries.py index 91fa705..2286ab0 100644 --- a/timeseries/cluster_timeseries.py +++ b/timeseries/cluster_timeseries.py @@ -12,10 +12,6 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit author_densities_path="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", output="data/subreddit_timeseries.parquet"): - - clusters = load_clusters(term_clusters_path, author_clusters_path) - densities = load_densities(term_densities_path, author_densities_path) - spark = SparkSession.builder.getOrCreate() df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet") @@ -26,11 +22,15 @@ def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit ts = df.select(['subreddit','week','author']).distinct().groupby(['subreddit','week']).count() ts = ts.repartition('subreddit') - spk_clusters = spark.createDataFrame(clusters) + + if term_densities_path is not None and author_densities_path is not None: + densities = load_densities(term_densities_path, author_densities_path) + spk_densities = spark.createDataFrame(densities) + ts = ts.join(spk_densities, on='subreddit', how='inner') + clusters = load_clusters(term_clusters_path, author_clusters_path) + spk_clusters = 
spark.createDataFrame(clusters) ts = ts.join(spk_clusters, on='subreddit', how='inner') - spk_densities = spark.createDataFrame(densities) - ts = ts.join(spk_densities, on='subreddit', how='inner') ts.write.parquet(output, mode='overwrite') if __name__ == "__main__": From 197518a222a321a8027c3dc5a4121350c47d0779 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 6 Apr 2022 11:11:11 -0700 Subject: [PATCH 17/22] git-annex in --- datasets/checkpoint_parallelsql.sbatch | 26 ----- datasets/comments_2_parquet.sh | 4 +- datasets/comments_2_parquet_part1.py | 114 ++++++++++--------- datasets/comments_2_parquet_part2.py | 13 ++- datasets/helper.py | 5 +- datasets/job_script.sh | 4 +- datasets/submissions_2_parquet.sh | 4 +- datasets/submissions_2_parquet_part1.py | 14 +-- dumps/check_comments_shas.py | 2 +- ngrams/run_tf_jobs.sh | 4 +- ngrams/sort_tf_comments.py | 17 ++- ngrams/tf_comments.py | 47 +++++--- ngrams/top_comment_phrases.py | 91 ++++++++------- similarities/Makefile | 122 +++++++++++---------- similarities/job_script.sh | 4 +- similarities/lsi_similarities.py | 29 ++--- similarities/similarities_helper.py | 1 + similarities/tfidf.py | 2 +- similarities/top_subreddits_by_comments.py | 4 +- 19 files changed, 260 insertions(+), 247 deletions(-) delete mode 100644 datasets/checkpoint_parallelsql.sbatch mode change 100644 => 100755 datasets/submissions_2_parquet.sh mode change 100644 => 100755 ngrams/top_comment_phrases.py diff --git a/datasets/checkpoint_parallelsql.sbatch b/datasets/checkpoint_parallelsql.sbatch deleted file mode 100644 index dd61e65..0000000 --- a/datasets/checkpoint_parallelsql.sbatch +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -## parallel_sql_job.sh -#SBATCH --job-name=tf_subreddit_comments -## Allocation Definition -#SBATCH --account=comdata-ckpt -#SBATCH --partition=ckpt -## Resources -## Nodes. This should always be 1 for parallel-sql. -#SBATCH --nodes=1 -## Walltime (12 hours) -#SBATCH --time=12:00:00 -## Memory per node -#SBATCH --mem=32G -#SBATCH --cpus-per-task=4 -#SBATCH --ntasks=1 -#SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit -source ./bin/activate -module load parallel_sql -echo $(which perl) -conda list pyarrow -which python3 -#Put here commands to load other modules (e.g. matlab etc.) -#Below command means that parallel_sql will get tasks from the database -#and run them on the node (in parallel). So a 16 core node will have -#16 tasks running at one time. 
-parallel-sql --sql -a parallel --exit-on-term --jobs 4 diff --git a/datasets/comments_2_parquet.sh b/datasets/comments_2_parquet.sh index 56ecc4d..d61eb65 100755 --- a/datasets/comments_2_parquet.sh +++ b/datasets/comments_2_parquet.sh @@ -1,10 +1,10 @@ +#!/usr/bin/env bash ## needs to be run by hand since i don't have a nice way of waiting on a parallel-sql job to complete -#!/usr/bin/env bash echo "#!/usr/bin/bash" > job_script.sh #echo "source $(pwd)/../bin/activate" >> job_script.sh echo "python3 $(pwd)/comments_2_parquet_part1.py" >> job_script.sh -srun -p comdata -A comdata --nodes=1 --mem=120G --time=48:00:00 --pty job_script.sh +srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 --pty job_script.sh start_spark_and_run.sh 1 $(pwd)/comments_2_parquet_part2.py diff --git a/datasets/comments_2_parquet_part1.py b/datasets/comments_2_parquet_part1.py index d3c7b7c..6960986 100755 --- a/datasets/comments_2_parquet_part1.py +++ b/datasets/comments_2_parquet_part1.py @@ -1,12 +1,15 @@ #!/usr/bin/env python3 +import os import json from datetime import datetime from multiprocessing import Pool from itertools import islice -from helper import find_dumps, open_fileset +from helper import open_input_file, find_dumps import pandas as pd import pyarrow as pa import pyarrow.parquet as pq +from pathlib import Path +import fire def parse_comment(comment, names= None): if names is None: @@ -46,70 +49,63 @@ def parse_comment(comment, names= None): # conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')]) -dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments/" +def parse_dump(partition): -files = list(find_dumps(dumpdir, base_pattern="RC_20*")) + dumpdir = f"/gscratch/comdata/raw_data/reddit_dumps/comments/{partition}" -pool = Pool(28) + stream = open_input_file(dumpdir) + rows = map(parse_comment, stream) -stream = open_fileset(files) + schema = pa.schema([ + pa.field('id', pa.string(), nullable=True), + pa.field('subreddit', pa.string(), nullable=True), + pa.field('link_id', pa.string(), nullable=True), + pa.field('parent_id', pa.string(), nullable=True), + pa.field('created_utc', pa.timestamp('ms'), nullable=True), + pa.field('author', pa.string(), nullable=True), + pa.field('ups', pa.int64(), nullable=True), + pa.field('downs', pa.int64(), nullable=True), + pa.field('score', pa.int64(), nullable=True), + pa.field('edited', pa.bool_(), nullable=True), + pa.field('time_edited', pa.timestamp('ms'), nullable=True), + pa.field('subreddit_type', pa.string(), nullable=True), + pa.field('subreddit_id', pa.string(), nullable=True), + pa.field('stickied', pa.bool_(), nullable=True), + pa.field('is_submitter', pa.bool_(), nullable=True), + pa.field('body', pa.string(), nullable=True), + pa.field('error', pa.string(), nullable=True), + ]) -N = int(1e4) + p = Path("/gscratch/comdata/output/temp/reddit_comments.parquet") + p.mkdir(exist_ok=True,parents=True) -rows = pool.imap_unordered(parse_comment, stream, chunksize=int(N/28)) + N=10000 + with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet", + schema=schema, + compression='snappy', + flavor='spark') as writer: -schema = pa.schema([ - pa.field('id', pa.string(), nullable=True), - pa.field('subreddit', pa.string(), nullable=True), - pa.field('link_id', 
pa.string(), nullable=True), - pa.field('parent_id', pa.string(), nullable=True), - pa.field('created_utc', pa.timestamp('ms'), nullable=True), - pa.field('author', pa.string(), nullable=True), - pa.field('ups', pa.int64(), nullable=True), - pa.field('downs', pa.int64(), nullable=True), - pa.field('score', pa.int64(), nullable=True), - pa.field('edited', pa.bool_(), nullable=True), - pa.field('time_edited', pa.timestamp('ms'), nullable=True), - pa.field('subreddit_type', pa.string(), nullable=True), - pa.field('subreddit_id', pa.string(), nullable=True), - pa.field('stickied', pa.bool_(), nullable=True), - pa.field('is_submitter', pa.bool_(), nullable=True), - pa.field('body', pa.string(), nullable=True), - pa.field('error', pa.string(), nullable=True), -]) + while True: + chunk = islice(rows,N) + pddf = pd.DataFrame(chunk, columns=schema.names) + table = pa.Table.from_pandas(pddf,schema=schema) + if table.shape[0] == 0: + break + writer.write_table(table) -from pathlib import Path -p = Path("/gscratch/comdata/output/reddit_comments.parquet_temp2") - -if not p.is_dir(): - if p.exists(): - p.unlink() - p.mkdir() - -else: - list(map(Path.unlink,p.glob('*'))) - -part_size = int(1e7) -part = 1 -n_output = 0 -writer = pq.ParquetWriter(f"/gscratch/comdata/output/reddit_comments.parquet_temp2/part_{part}.parquet",schema=schema,compression='snappy',flavor='spark') - -while True: - if n_output > part_size: - if part > 1: - writer.close() - - part = part + 1 - n_output = 0 - - writer = pq.ParquetWriter(f"/gscratch/comdata/output/reddit_comments.parquet_temp2/part_{part}.parquet",schema=schema,compression='snappy',flavor='spark') - - n_output += N - chunk = islice(rows,N) - pddf = pd.DataFrame(chunk, columns=schema.names) - table = pa.Table.from_pandas(pddf,schema=schema) - if table.shape[0] == 0: - break - writer.write_table(table) + writer.close() +def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/comments", overwrite=True): + files = list(find_dumps(dumpdir,base_pattern="RC_20*.*")) + with open("comments_task_list.sh",'w') as of: + for fpath in files: + partition = os.path.split(fpath)[1] + if (not Path(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True): + of.write(f'python3 comments_2_parquet_part1.py parse_dump {partition}\n') + + +if __name__ == '__main__': + fire.Fire({'parse_dump':parse_dump, + 'gen_task_list':gen_task_list}) + diff --git a/datasets/comments_2_parquet_part2.py b/datasets/comments_2_parquet_part2.py index 0d5cc9e..1031c68 100755 --- a/datasets/comments_2_parquet_part2.py +++ b/datasets/comments_2_parquet_part2.py @@ -2,12 +2,19 @@ # spark script to make sorted, and partitioned parquet files +import pyspark from pyspark.sql import functions as f from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() -df = spark.read.parquet("/gscratch/comdata/output/reddit_comments.parquet_temp2",compression='snappy') +conf = pyspark.SparkConf().setAppName("Reddit submissions to parquet") +conf = conf.set("spark.sql.shuffle.partitions",2000) +conf = conf.set('spark.sql.crossJoin.enabled',"true") +conf = conf.set('spark.debug.maxToStringFields',200) +sc = spark.sparkContext + +df = spark.read.parquet("/gscratch/comdata/output/temp/reddit_comments.parquet",compression='snappy') df = df.withColumn("subreddit_2", f.lower(f.col('subreddit'))) df = df.drop('subreddit') @@ -21,9 +28,9 @@ df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt"))) df = df.repartition('subreddit') df2 = 
df.sort(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True) df2 = df2.sortWithinPartitions(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True) -df2.write.parquet("/gscratch/comdata/users/nathante/reddit_comments_by_subreddit.parquet_new", mode='overwrite', compression='snappy') +df2.write.parquet("/gscratch/scrubbed/comdata/output/reddit_comments_by_subreddit.parquet", mode='overwrite', compression='snappy') df = df.repartition('author') df3 = df.sort(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True) df3 = df3.sortWithinPartitions(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True) -df3.write.parquet("/gscratch/comdata/users/nathante/reddit_comments_by_author.parquet_new", mode='overwrite',compression='snappy') +df3.write.parquet("/gscratch/scrubbed/comdata/output/reddit_comments_by_author.parquet", mode='overwrite',compression='snappy') diff --git a/datasets/helper.py b/datasets/helper.py index 8f1dfe2..db28628 100644 --- a/datasets/helper.py +++ b/datasets/helper.py @@ -24,8 +24,7 @@ def open_fileset(files): for fh in files: print(fh) lines = open_input_file(fh) - for line in lines: - yield line + yield from lines def open_input_file(input_filename): if re.match(r'.*\.7z$', input_filename): @@ -39,7 +38,7 @@ def open_input_file(input_filename): elif re.match(r'.*\.xz', input_filename): cmd = ["xzcat",'-dk', '-T 20',input_filename] elif re.match(r'.*\.zst',input_filename): - cmd = ['zstd','-dck', input_filename] + cmd = ['/kloneusr/bin/zstd','-dck', input_filename, '--memory=2048MB --stdout'] elif re.match(r'.*\.gz',input_filename): cmd = ['gzip','-dc', input_filename] try: diff --git a/datasets/job_script.sh b/datasets/job_script.sh index d90b618..5b5a7d3 100755 --- a/datasets/job_script.sh +++ b/datasets/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -spark-submit --master spark://$(hostname):18899 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/users/nathante/subreddit_term_similarity_weekly_5000.parquet --topN=5000 -stop-all.sh +singularity exec /gscratch/comdata/users/nathante/containers/nathante.sif spark-submit --master spark://$(hostname):7077 comments_2_parquet_part2.py +singularity exec /gscratch/comdata/users/nathante/containers/nathante.sif stop-all.sh diff --git a/datasets/submissions_2_parquet.sh b/datasets/submissions_2_parquet.sh old mode 100644 new mode 100755 index f133069..81a5753 --- a/datasets/submissions_2_parquet.sh +++ b/datasets/submissions_2_parquet.sh @@ -1,8 +1,8 @@ +#!/usr/bin/env bash ## this should be run manually since we don't have a nice way to wait on parallel_sql jobs -#!/usr/bin/env bash -./parse_submissions.sh +srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 python3 $(pwd)/submissions_2_parquet_part1.py gen_task_list start_spark_and_run.sh 1 $(pwd)/submissions_2_parquet_part2.py diff --git a/datasets/submissions_2_parquet_part1.py b/datasets/submissions_2_parquet_part1.py index 16d1988..77ae09f 100755 --- a/datasets/submissions_2_parquet_part1.py +++ b/datasets/submissions_2_parquet_part1.py @@ -3,26 +3,23 @@ # two stages: # 1. from gz to arrow parquet (this script) # 2. 
from arrow parquet to spark parquet (submissions_2_parquet_part2.py) - from datetime import datetime -from multiprocessing import Pool +from pathlib import Path from itertools import islice from helper import find_dumps, open_fileset import pandas as pd import pyarrow as pa import pyarrow.parquet as pq -import simdjson import fire import os - -parser = simdjson.Parser() +import json def parse_submission(post, names = None): if names is None: names = ['id','author','subreddit','title','created_utc','permalink','url','domain','score','ups','downs','over_18','has_media','selftext','retrieved_on','num_comments','gilded','edited','time_edited','subreddit_type','subreddit_id','subreddit_subscribers','name','is_self','stickied','quarantine','error'] try: - post = parser.parse(post) + post = json.loads(post) except (ValueError) as e: # print(e) # print(post) @@ -92,8 +89,7 @@ def parse_dump(partition): pa.field('quarantine',pa.bool_(),nullable=True), pa.field('error',pa.string(),nullable=True)]) - if not os.path.exists("/gscratch/comdata/output/temp/reddit_submissions.parquet/"): - os.mkdir("/gscratch/comdata/output/temp/reddit_submissions.parquet/") + Path("/gscratch/comdata/output/temp/reddit_submissions.parquet/").mkdir(exist_ok=True,parents=True) with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_submissions.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer: while True: @@ -108,7 +104,7 @@ def parse_dump(partition): def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/submissions"): files = list(find_dumps(dumpdir,base_pattern="RS_20*.*")) - with open("parse_submissions_task_list",'w') as of: + with open("submissions_task_list.sh",'w') as of: for fpath in files: partition = os.path.split(fpath)[1] of.write(f'python3 submissions_2_parquet_part1.py parse_dump {partition}\n') diff --git a/dumps/check_comments_shas.py b/dumps/check_comments_shas.py index dd428be..e59a7b8 100755 --- a/dumps/check_comments_shas.py +++ b/dumps/check_comments_shas.py @@ -8,7 +8,7 @@ import hashlib shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text #shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text -shasums = shasums1 + shasums2 +shasums = shasums1 dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments" for l in shasums.strip().split('\n'): diff --git a/ngrams/run_tf_jobs.sh b/ngrams/run_tf_jobs.sh index 0e7d5dd..9ff590f 100755 --- a/ngrams/run_tf_jobs.sh +++ b/ngrams/run_tf_jobs.sh @@ -1,8 +1,6 @@ #!/usr/bin/env bash -module load parallel_sql + source ./bin/activate python3 tf_comments.py gen_task_list -psu --del --Y -cat tf_task_list | psu --load for job in $(seq 1 50); do sbatch checkpoint_parallelsql.sbatch; done; diff --git a/ngrams/sort_tf_comments.py b/ngrams/sort_tf_comments.py index abb097e..d9c3e2c 100644 --- a/ngrams/sort_tf_comments.py +++ b/ngrams/sort_tf_comments.py @@ -2,12 +2,17 @@ from pyspark.sql import functions as f from pyspark.sql import SparkSession +import fire -spark = SparkSession.builder.getOrCreate() -df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/") +def main(inparquet, outparquet, colname): + spark = SparkSession.builder.getOrCreate() + df = spark.read.parquet(inparquet) -df = df.repartition(2000,'term') -df = df.sort(['term','week','subreddit']) -df = df.sortWithinPartitions(['term','week','subreddit']) + df = df.repartition(2000,colname) + df = df.sort([colname,'week','subreddit']) + df = 
df.sortWithinPartitions([colname,'week','subreddit']) -df.write.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_sorted_tf.parquet_temp",mode='overwrite',compression='snappy') + df.write.parquet(outparquet,mode='overwrite',compression='snappy') + +if __name__ == '__main__': + fire.Fire(main) diff --git a/ngrams/tf_comments.py b/ngrams/tf_comments.py index a40e5d9..f472eeb 100755 --- a/ngrams/tf_comments.py +++ b/ngrams/tf_comments.py @@ -14,21 +14,29 @@ from nltk.util import ngrams import string from random import random from redditcleaner import clean +from pathlib import Path # compute term frequencies for comments in each subreddit by week -def weekly_tf(partition, mwe_pass = 'first'): - dataset = ds.dataset(f'/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/{partition}', format='parquet') - if not os.path.exists("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/"): - os.mkdir("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/") +def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', input_dir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", mwe_pass = 'first', excluded_users=None): - if not os.path.exists("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/"): - os.mkdir("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/") + dataset = ds.dataset(Path(input_dir)/partition, format='parquet') + outputdir = Path(outputdir) + samppath = outputdir / "reddit_comment_ngrams_10p_sample" + + if not samppath.exists(): + samppath.mkdir(parents=True, exist_ok=True) ngram_output = partition.replace("parquet","txt") + if excluded_users is not None: + excluded_users = set(map(str.strip,open(excluded_users))) + df = df.filter(~ (f.col("author").isin(excluded_users))) + + + ngram_path = samppath / ngram_output if mwe_pass == 'first': - if os.path.exists(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}"): - os.remove(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}") + if ngram_path.exists(): + ngram_path.unlink() batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author']) @@ -62,8 +70,10 @@ def weekly_tf(partition, mwe_pass = 'first'): subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week)) + mwe_path = outputdir / "multiword_expressions.feather" + if mwe_pass != 'first': - mwe_dataset = pd.read_feather(f'/gscratch/comdata/output/reddit_ngrams/multiword_expressions.feather') + mwe_dataset = pd.read_feather(mwe_path) mwe_dataset = mwe_dataset.sort_values(['phrasePWMI'],ascending=False) mwe_phrases = list(mwe_dataset.phrase) mwe_phrases = [tuple(s.split(' ')) for s in mwe_phrases] @@ -115,7 +125,7 @@ def weekly_tf(partition, mwe_pass = 'first'): for sentence in sentences: if random() <= 0.1: grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4)))) - with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file: + with open(ngram_path,'a') as gram_file: for ng in grams: gram_file.write(' '.join(ng) + '\n') for token in sentence: @@ -149,8 +159,15 @@ def weekly_tf(partition, mwe_pass = 'first'): outrows = tf_comments(subreddit_weeks) outchunksize = 10000 - - with pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, 
pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer: + + termtf_outputdir = (outputdir / "comment_terms") + termtf_outputdir.mkdir(parents=True, exist_ok=True) + authortf_outputdir = (outputdir / "comment_authors") + authortf_outputdir.mkdir(parents=True, exist_ok=True) + termtf_path = termtf_outputdir / partition + authortf_path = authortf_outputdir / partition + with pq.ParquetWriter(termtf_path, schema=schema, compression='snappy', flavor='spark') as writer, \ + pq.ParquetWriter(authortf_path, schema=author_schema, compression='snappy', flavor='spark') as author_writer: while True: @@ -179,12 +196,12 @@ def weekly_tf(partition, mwe_pass = 'first'): author_writer.close() -def gen_task_list(mwe_pass='first'): +def gen_task_list(mwe_pass='first', outputdir='/gscratch/comdata/output/reddit_ngrams/', tf_task_list='tf_task_list', excluded_users_file=None): files = os.listdir("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/") - with open("tf_task_list",'w') as outfile: + with open(tf_task_list,'w') as outfile: for f in files: if f.endswith(".parquet"): - outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} {f}\n") + outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} --outputdir {outputdir} --excluded_users {excluded_users_file} {f}\n") if __name__ == "__main__": fire.Fire({"gen_task_list":gen_task_list, diff --git a/ngrams/top_comment_phrases.py b/ngrams/top_comment_phrases.py old mode 100644 new mode 100755 index 031cba5..ff1c4f0 --- a/ngrams/top_comment_phrases.py +++ b/ngrams/top_comment_phrases.py @@ -1,58 +1,69 @@ +#!/usr/bin/env python3 from pyspark.sql import functions as f from pyspark.sql import Window from pyspark.sql import SparkSession import numpy as np - -spark = SparkSession.builder.getOrCreate() -df = spark.read.text("/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/") - -df = df.withColumnRenamed("value","phrase") - -# count phrase occurrances -phrases = df.groupby('phrase').count() -phrases = phrases.withColumnRenamed('count','phraseCount') -phrases = phrases.filter(phrases.phraseCount > 10) +import fire +from pathlib import Path -# count overall -N = phrases.select(f.sum(phrases.phraseCount).alias("phraseCount")).collect()[0].phraseCount +def main(ngram_dir="/gscratch/comdata/output/reddit_ngrams"): + spark = SparkSession.builder.getOrCreate() + ngram_dir = Path(ngram_dir) + ngram_sample = ngram_dir / "reddit_comment_ngrams_10p_sample" + df = spark.read.text(str(ngram_sample)) -print(f'analyzing PMI on a sample of {N} phrases') -logN = np.log(N) -phrases = phrases.withColumn("phraseLogProb", f.log(f.col("phraseCount")) - logN) + df = df.withColumnRenamed("value","phrase") -# count term occurrances -phrases = phrases.withColumn('terms',f.split(f.col('phrase'),' ')) -terms = phrases.select(['phrase','phraseCount','phraseLogProb',f.explode(phrases.terms).alias('term')]) + # count phrase occurrances + phrases = df.groupby('phrase').count() + phrases = phrases.withColumnRenamed('count','phraseCount') + phrases = phrases.filter(phrases.phraseCount > 10) -win = Window.partitionBy('term') -terms = terms.withColumn('termCount',f.sum('phraseCount').over(win)) -terms = terms.withColumnRenamed('count','termCount') -terms = terms.withColumn('termLogProb',f.log(f.col('termCount')) - logN) + # count overall + N = phrases.select(f.sum(phrases.phraseCount).alias("phraseCount")).collect()[0].phraseCount -terms = 
terms.groupBy(terms.phrase, terms.phraseLogProb, terms.phraseCount).sum('termLogProb') -terms = terms.withColumnRenamed('sum(termLogProb)','termsLogProb') -terms = terms.withColumn("phrasePWMI", f.col('phraseLogProb') - f.col('termsLogProb')) + print(f'analyzing PMI on a sample of {N} phrases') + logN = np.log(N) + phrases = phrases.withColumn("phraseLogProb", f.log(f.col("phraseCount")) - logN) -# join phrases to term counts + # count term occurrances + phrases = phrases.withColumn('terms',f.split(f.col('phrase'),' ')) + terms = phrases.select(['phrase','phraseCount','phraseLogProb',f.explode(phrases.terms).alias('term')]) + + win = Window.partitionBy('term') + terms = terms.withColumn('termCount',f.sum('phraseCount').over(win)) + terms = terms.withColumnRenamed('count','termCount') + terms = terms.withColumn('termLogProb',f.log(f.col('termCount')) - logN) + + terms = terms.groupBy(terms.phrase, terms.phraseLogProb, terms.phraseCount).sum('termLogProb') + terms = terms.withColumnRenamed('sum(termLogProb)','termsLogProb') + terms = terms.withColumn("phrasePWMI", f.col('phraseLogProb') - f.col('termsLogProb')) + + # join phrases to term counts -df = terms.select(['phrase','phraseCount','phraseLogProb','phrasePWMI']) + df = terms.select(['phrase','phraseCount','phraseLogProb','phrasePWMI']) -df = df.sort(['phrasePWMI'],descending=True) -df = df.sortWithinPartitions(['phrasePWMI'],descending=True) -df.write.parquet("/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.parquet/",mode='overwrite',compression='snappy') + df = df.sort(['phrasePWMI'],descending=True) + df = df.sortWithinPartitions(['phrasePWMI'],descending=True) -df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.parquet/") + pwmi_dir = ngram_dir / "reddit_comment_ngrams_pwmi.parquet/" + df.write.parquet(str(pwmi_dir), mode='overwrite', compression='snappy') -df.write.csv("/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.csv/",mode='overwrite',compression='none') + df = spark.read.parquet(str(pwmi_dir)) -df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_comment_ngrams_pwmi.parquet") -df = df.select('phrase','phraseCount','phraseLogProb','phrasePWMI') + df.write.csv(str(ngram_dir / "reddit_comment_ngrams_pwmi.csv/"),mode='overwrite',compression='none') -# choosing phrases occurring at least 3500 times in the 10% sample (35000 times) and then with a PWMI of at least 3 yeids about 65000 expressions. -# -df = df.filter(f.col('phraseCount') > 3500).filter(f.col("phrasePWMI")>3) -df = df.toPandas() -df.to_feather("/gscratch/comdata/users/nathante/reddit_multiword_expressions.feather") -df.to_csv("/gscratch/comdata/users/nathante/reddit_multiword_expressions.csv") + df = spark.read.parquet(str(pwmi_dir)) + df = df.select('phrase','phraseCount','phraseLogProb','phrasePWMI') + + # choosing phrases occurring at least 3500 times in the 10% sample (35000 times) and then with a PWMI of at least 3 yeids about 65000 expressions. 
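The phrasePWMI column computed above is the log probability of the phrase minus the summed log probabilities of its constituent terms, with every probability estimated against the same total count N. A tiny standalone check of that arithmetic with made-up counts:

import numpy as np

N = 1_000_000                 # total phrase occurrences in the sample
phrase_count = 5_000          # occurrences of the bigram "machine learning"
term_counts = {"machine": 20_000, "learning": 30_000}

phrase_logprob = np.log(phrase_count) - np.log(N)
terms_logprob = sum(np.log(c) - np.log(N) for c in term_counts.values())
pwmi = phrase_logprob - terms_logprob
print(round(pwmi, 2))         # 2.12: the words co-occur far more often than chance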
+ # + df = df.filter(f.col('phraseCount') > 3500).filter(f.col("phrasePWMI")>3) + df = df.toPandas() + df.to_feather(ngram_dir / "multiword_expressions.feather") + df.to_csv(ngram_dir / "multiword_expressions.csv") + +if __name__ == '__main__': + fire.Fire(main) diff --git a/similarities/Makefile b/similarities/Makefile index f578fd5..963192d 100644 --- a/similarities/Makefile +++ b/similarities/Makefile @@ -1,8 +1,10 @@ + #all: /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_130k.parquet -srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh -srun_singularity_huge=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity_huge.sh -base_data=/gscratch/comdata/output -similarity_data=${base_data}/reddit_similarity +# srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh +# srun_singularity_huge=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity_huge.sh +srun=srun -p compute-bigmem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40 +srun_huge=srun -p compute-hugemem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40 +similarity_data=/gscratch/scrubbed/comdata/reddit_similarity tfidf_data=${similarity_data}/tfidf tfidf_weekly_data=${similarity_data}/tfidf_weekly similarity_weekly_data=${similarity_data}/weekly @@ -10,7 +12,10 @@ lsi_components=[10,50,100,200,300,400,500,600,700,850,1000,1500] lsi_similarities: ${similarity_data}/subreddit_comment_terms_10k_LSI ${similarity_data}/subreddit_comment_authors-tf_10k_LSI ${similarity_data}/subreddit_comment_authors_10k_LSI ${similarity_data}/subreddit_comment_terms_30k_LSI ${similarity_data}/subreddit_comment_authors-tf_30k_LSI ${similarity_data}/subreddit_comment_authors_30k_LSI -all: ${tfidf_data}/comment_terms_100k.parquet ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_100k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather ${similarity_data}/subreddit_comment_terms_100k.feather ${similarity_data}/subreddit_comment_authors_100k.feather ${similarity_data}/subreddit_comment_authors-tf_100k.feather ${similarity_weekly_data}/comment_terms.parquet + +all: ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather + +#all: ${tfidf_data}/comment_terms_100k.parquet ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet 
${tfidf_data}/comment_authors_100k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather ${similarity_data}/subreddit_comment_terms_100k.feather ${similarity_data}/subreddit_comment_authors_100k.feather ${similarity_data}/subreddit_comment_authors-tf_100k.feather ${similarity_weekly_data}/comment_terms.parquet #${tfidf_weekly_data}/comment_terms_100k.parquet ${tfidf_weekly_data}/comment_authors_100k.parquet ${tfidf_weekly_data}/comment_terms_30k.parquet ${tfidf_weekly_data}/comment_authors_30k.parquet ${similarity_weekly_data}/comment_terms_100k.parquet ${similarity_weekly_data}/comment_authors_100k.parquet ${similarity_weekly_data}/comment_terms_30k.parquet ${similarity_weekly_data}/comment_authors_30k.parquet @@ -18,103 +23,106 @@ all: ${tfidf_data}/comment_terms_100k.parquet ${tfidf_data}/comment_terms_30k.pa # all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet -${similarity_weekly_data}/comment_terms.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_terms.parquet - ${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=10000 --outfile=${similarity_weekly_data}/comment_terms.parquet +${similarity_weekly_data}/comment_terms.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_terms.parquet + ${srun} python3 weekly_cosine_similarities.py terms --topN=10000 --outfile=${similarity_weekly_data}/comment_terms.parquet ${similarity_data}/subreddit_comment_terms_10k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k.feather --topN=10000 + ${srun} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k.feather --topN=10000 ${similarity_data}/subreddit_comment_terms_10k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py - ${srun_singularity} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=200 + ${srun_huge} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=200 ${similarity_data}/subreddit_comment_terms_30k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py - ${srun_singularity} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=200 + ${srun_huge} python3 lsi_similarities.py term 
--outfile=${similarity_data}/subreddit_comment_terms_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=200 --inpath=$< ${similarity_data}/subreddit_comment_terms_30k.feather: ${tfidf_data}/comment_terms_30k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k.feather --topN=30000 + ${srun_huge} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k.feather --topN=30000 --inpath=$< ${similarity_data}/subreddit_comment_authors_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k.feather --topN=30000 + ${srun_huge} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k.feather --topN=30000 --inpath=$< ${similarity_data}/subreddit_comment_authors_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k.feather --topN=10000 + ${srun_huge} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k.feather --topN=10000 --inpath=$< ${similarity_data}/subreddit_comment_authors_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_singularity} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=2 + ${srun_huge} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$< ${similarity_data}/subreddit_comment_authors_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_singularity} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=2 + ${srun_huge} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=10 --inpath=$< -${similarity_data}/subreddit_comment_authors-tf_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k.feather --topN=30000 +${similarity_data}/subreddit_comment_authors-tf_30k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py + ${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k.feather --topN=30000 --inpath=$< -${similarity_data}/subreddit_comment_authors-tf_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k.feather --topN=10000 +${similarity_data}/subreddit_comment_authors-tf_10k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py + ${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k.feather --topN=10000 ${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_singularity} python3 
lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=2 + ${srun_huge} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$< ${similarity_data}/subreddit_comment_authors-tf_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_singularity} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=2 + ${srun_huge} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=10 --inpath=$< ${similarity_data}/subreddit_comment_terms_100k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_100k.feather --topN=100000 + ${srun} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_100k.feather --topN=100000 ${similarity_data}/subreddit_comment_authors_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_100k.feather --topN=100000 + ${srun} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_100k.feather --topN=100000 ${similarity_data}/subreddit_comment_authors-tf_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_100k.feather --topN=100000 + ${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_100k.feather --topN=100000 -${tfidf_data}/comment_terms_100k.feather/: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv - mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 4 tfidf.py terms --topN=100000 --outpath=${tfidf_data}/comment_terms_100k.feather +${similarity_data}/subreddits_by_num_comments_nonsfw.csv: + start_spark_and_run.sh 3 top_subreddits_by_comments.py -${tfidf_data}/comment_terms_30k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv - mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 4 tfidf.py terms --topN=30000 --outpath=${tfidf_data}/comment_terms_30k.feather +${tfidf_data}/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv +# mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 3 tfidf.py terms --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_terms_100k.parquet -${tfidf_data}/comment_terms_10k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv - mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 4 tfidf.py terms --topN=10000 --outpath=${tfidf_data}/comment_terms_10k.feather +${tfidf_data}/comment_terms_30k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv +# mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 3 tfidf.py terms --topN=30000 
--inpath=$< --outpath=${tfidf_data}/comment_terms_30k.feather -${tfidf_data}/comment_authors_100k.feather: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv - mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 4 tfidf.py authors --topN=100000 --outpath=${tfidf_data}/comment_authors_100k.feather +${tfidf_data}/comment_terms_10k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv +# mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 3 tfidf.py terms --topN=10000 --inpath=$< --outpath=${tfidf_data}/comment_terms_10k.feather -${tfidf_data}/comment_authors_10k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv - mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 4 tfidf.py authors --topN=10000 --outpath=${tfidf_data}/comment_authors_10k.parquet +${tfidf_data}/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv +# mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 3 tfidf.py authors --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_authors_100k.parquet -${tfidf_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv - mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 4 tfidf.py authors --topN=30000 --outpath=${tfidf_data}/comment_authors_30k.parquet +${tfidf_data}/comment_authors_10k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv +# mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 3 tfidf.py authors --topN=10000 --inpath=$< --outpath=${tfidf_data}/comment_authors_10k.parquet -${tfidf_data}/tfidf_weekly/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv - start_spark_and_run.sh 4 tfidf.py terms_weekly --topN=100000 --outpath=${similarity_data}/tfidf_weekly/comment_authors_100k.parquet +${tfidf_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv +# mkdir -p ${tfidf_data}/ + start_spark_and_run.sh 3 tfidf.py authors --topN=30000 --inpath=$< --outpath=${tfidf_data}/comment_authors_30k.parquet + +${tfidf_data}/tfidf_weekly/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv + start_spark_and_run.sh 3 tfidf.py terms_weekly --topN=100000 --outpath=${similarity_data}/tfidf_weekly/comment_authors_100k.parquet ${tfidf_data}/tfidf_weekly/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_ppnum_comments.csv - start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=100000 --outpath=${tfidf_weekly_data}/comment_authors_100k.parquet + start_spark_and_run.sh 3 tfidf.py authors_weekly --topN=100000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_100k.parquet -${tfidf_weekly_data}/comment_terms_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv - start_spark_and_run.sh 2 tfidf.py terms_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet 
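Each of these rules hands its first prerequisite to the script through Make's $< variable; on the Python side the fire library turns the subcommand and flags into a function call, so an invocation like tfidf.py terms --topN=30000 --inpath=... --outpath=... ends up as terms(inpath=..., outpath=..., topN=30000). A minimal sketch of that dispatch with an illustrative signature (the real tfidf.py takes more parameters and builds the tf-idf dataset in Spark):

#!/usr/bin/env python3
import fire

def terms(inpath, outpath, topN=25000, included_subreddits=None):
    # placeholder body: the real script computes term tf-idf with Spark here
    print(f"term tf-idf for top {topN} subreddits: {inpath} -> {outpath}")

def authors(inpath, outpath, topN=25000, included_subreddits=None):
    print(f"author tf-idf for top {topN} subreddits: {inpath} -> {outpath}")

if __name__ == "__main__":
    fire.Fire({'terms': terms, 'authors': authors})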
+${tfidf_weekly_data}/comment_terms_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv + start_spark_and_run.sh 2 tfidf.py terms_weekly --topN=30000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet -${tfidf_weekly_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv - start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet +${tfidf_weekly_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv + start_spark_and_run.sh 3 tfidf.py authors_weekly --topN=30000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet ${similarity_weekly_data}/comment_terms_100k.parquet: weekly_cosine_similarities.py similarities_helper.py ${tfidf_weekly_data}/comment_terms_100k.parquet - ${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet + ${srun} python3 weekly_cosine_similarities.py terms --topN=100000 --outfile=${similarity_weekly_data}/comment_terms_100k.parquet -${similarity_weekly_data}/comment_authors_100k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_authors_100k.parquet - ${srun_singularity} python3 weekly_cosine_similarities.py authors --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet +${similarity_weekly_data}/comment_authors_100k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_authors_100k.parquet + ${srun} python3 weekly_cosine_similarities.py authors --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet -${similarity_weekly_data}/comment_terms_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_terms_30k.parquet - ${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet +${similarity_weekly_data}/comment_terms_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_terms_30k.parquet + ${srun} python3 weekly_cosine_similarities.py terms --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet -${similarity_weekly_data}/comment_authors_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_authors_30k.parquet - ${srun_singularity} python3 weekly_cosine_similarities.py authors --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet +,${similarity_weekly_data}/comment_authors_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet 
${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_authors_30k.parquet + ${srun} python3 weekly_cosine_similarities.py authors --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet -# ${tfidf_weekly_data}/comment_authors_130k.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv +# ${tfidf_weekly_data}/comment_authors_130k.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv # start_spark_and_run.sh 1 tfidf.py authors_weekly --topN=130000 # /gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet diff --git a/similarities/job_script.sh b/similarities/job_script.sh index 0c37103..1158ff0 100755 --- a/similarities/job_script.sh +++ b/similarities/job_script.sh @@ -1,4 +1,4 @@ #!/usr/bin/bash start_spark_cluster.sh -singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py -singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh +singularity exec /gscratch/comdata/users/nathante/containers/nathante.sif spark-submit --master spark://$(hostname):7077 tfidf.py authors --topN=100000 --inpath=/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet --outpath=/gscratch/scrubbed/comdata/reddit_similarity/tfidf/comment_authors_100k.parquet +singularity exec /gscratch/comdata/users/nathante/containers/nathante.sif stop-all.sh diff --git a/similarities/lsi_similarities.py b/similarities/lsi_similarities.py index eb89f55..493755f 100644 --- a/similarities/lsi_similarities.py +++ b/similarities/lsi_similarities.py @@ -5,19 +5,20 @@ from similarities_helper import * #from similarities_helper import similarities, lsi_column_similarities from functools import partial -inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_terms_compex.parquet/" -term_colname='term' -outfile='/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI' -n_components=[10,50,100] -included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" -n_iter=5 -random_state=1968 -algorithm='arpack' -topN = None -from_date=None -to_date=None -min_df=None -max_df=None +# inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_terms_compex.parquet/" +# term_colname='term' +# outfile='/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI' +# n_components=[10,50,100] +# included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" +# n_iter=5 +# random_state=1968 +# algorithm='arpack' +# topN = None +# from_date=None +# to_date=None +# min_df=None +# max_df=None + def lsi_similarities(inpath, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack',lsi_model=None): print(n_components,flush=True) @@ -62,7 +63,7 @@ def 
author_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/t n_components=n_components ) -def author_tf_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968): +def author_tf_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,algorithm='arpack',n_components=300,n_iter=5,random_state=1968): return lsi_similarities(inpath, 'author', outfile, diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index d97e519..03c10b2 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -262,6 +262,7 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196 lsimat = mod.transform(tfidfmat.T) if lsi_model_save is not None: + Path(lsi_model_save).parent.mkdir(exist_ok=True, parents=True) pickle.dump(mod, open(lsi_model_save,'wb')) sims_list = [] diff --git a/similarities/tfidf.py b/similarities/tfidf.py index 01b0b20..bbae528 100644 --- a/similarities/tfidf.py +++ b/similarities/tfidf.py @@ -4,7 +4,7 @@ from pyspark.sql import functions as f from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits): - spark = SparkSession.builder.getOrCreate()y + spark = SparkSession.builder.getOrCreate() df = spark.read.parquet(inpath) diff --git a/similarities/top_subreddits_by_comments.py b/similarities/top_subreddits_by_comments.py index ff9293c..9a4d7d3 100644 --- a/similarities/top_subreddits_by_comments.py +++ b/similarities/top_subreddits_by_comments.py @@ -17,7 +17,7 @@ df = df.filter(~df.subreddit.like("u_%")) df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments")) df = df.join(prop_nsfw,on='subreddit') -#df = df.filter(df.prop_nsfw < 0.5) +df = df.filter(df.prop_nsfw < 0.5) win = Window.orderBy(f.col('n_comments').desc()) df = df.withColumn('comments_rank', f.rank().over(win)) @@ -26,4 +26,4 @@ df = df.toPandas() df = df.sort_values("n_comments") -df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False) +df.to_csv('/gscratch/scrubbed/comdata/reddit_similarity/subreddits_by_num_comments_nonsfw.csv', index=False) From 5a40465a629a1d7d95dbec9730d3950842bcb4f5 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 8 Jun 2022 17:01:27 -0700 Subject: [PATCH 18/22] add support for umap->hdbscan clustering method --- clustering/Makefile | 21 +- clustering/clustering_base.py | 41 ++++ clustering/grid_sweep.py | 16 ++ clustering/lsi_base.py | 17 +- clustering/umap_hdbscan_clustering.py | 221 ++++++++++++++++++++++ clustering/umap_hdbscan_clustering_lsi.py | 114 +++++++++++ 6 files changed, 428 insertions(+), 2 deletions(-) create mode 100644 clustering/umap_hdbscan_clustering.py create mode 100644 clustering/umap_hdbscan_clustering_lsi.py diff --git a/clustering/Makefile b/clustering/Makefile index 9643f52..2ba9c0c 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -3,6 +3,9 @@ srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activat similarity_data=/gscratch/comdata/output/reddit_similarity 
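The umap_hdbscan files added later in this patch chain two steps: embed a precomputed subreddit distance matrix with UMAP, then cluster the low-dimensional embedding with HDBSCAN, sweeping both parameter grids defined below. A hedged, self-contained sketch of that two-step recipe on toy data (the euclidean metric for the HDBSCAN step and the toy parameter values are assumptions, not taken from the patch):

import numpy as np
import umap
import hdbscan

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
# toy distance matrix standing in for the subreddit-by-subreddit distances
dists = np.sqrt(((X[:, None, :] - X[None, :, :]) ** 2).sum(-1))

umap_model = umap.UMAP(metric='precomputed', n_neighbors=15, min_dist=0.1).fit(dists)
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=2, metric='euclidean')
labels = clusterer.fit_predict(umap_model.embedding_)
print(len(set(labels)) - (1 if -1 in labels else 0), "clusters (label -1 is noise)")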
clustering_data=/gscratch/comdata/output/reddit_clustering kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000] + +umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] + hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15] @@ -91,12 +94,28 @@ ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_inpu ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) +${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv:umap_hdbscan_clustering_lsi.py + $(srun_singularity) python3 umap_hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/umap_hdbscan --savefile=${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv $(umap_hdbscan_selection_grid) + + ${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 ${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 +${authors_tf_10k_output_lsi}/best_umap_hdbscan_2.feather:${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv pick_best_clustering.py + $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 + +best_umap_hdbscan.feather:${authors_tf_10k_output_lsi}/best_umap_hdbscan_2.feather + +# {'lsi_dimensions': 700, 'outpath': '/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/umap_hdbscan', 'silhouette_score': 0.27616957, 'name': 'mcs-2_ms-5_cse-0.05_csm-leaf_nn-15_lr-1.0_md-0.1_lc-1_lsi-700', 'n_clusters': 547, 'n_isolates': 2093, 'silhouette_samples': '/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/umap_hdbscan/silhouette_samples-mcs-2_ms-5_cse-0.05_csm-leaf_nn-15_lr-1.0_md-0.1_lc-1_lsi-700.feather', 'min_cluster_size': 2, 'min_samples': 5, 'cluster_selection_epsilon': 0.05, 'cluster_selection_method': 'leaf', 'n_neighbors': 15, 'learning_rate': 1.0, 'min_dist': 0.1, 'local_connectivity': 1, 'n_isolates_str': '2093', 'n_isolates_0': False} + +best_umap_grid=--min_cluster_sizes=[2] --min_samples=[5] --cluster_selection_epsilons=[0.05] --cluster_selection_methods=[leaf] --n_neighbors=[15] --learning_rate=[1] --min_dist=[0.1] --local_connectivity=[1] --save_step1=True + +umap_hdbscan_coords: + python3 umap_hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/umap_hdbscan 
--savefile=/dev/null ${best_umap_grid} + clean_affinity: rm -f ${authors_10k_output}/affinity/selection_data.csv rm -f ${authors_tf_10k_output}/affinity/selection_data.csv @@ -159,7 +178,7 @@ clean_lsi_terms: clean: clean_affinity clean_kmeans clean_hdbscan -PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k +PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k best_umap_hdbscan.feather umap_hdbscan_coords # $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py # $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS diff --git a/clustering/clustering_base.py b/clustering/clustering_base.py index 3778fc3..ced627d 100644 --- a/clustering/clustering_base.py +++ b/clustering/clustering_base.py @@ -1,3 +1,4 @@ +import pickle from pathlib import Path import numpy as np import pandas as pd @@ -24,6 +25,13 @@ class clustering_job: self.outpath.mkdir(parents=True, exist_ok=True) self.cluster_data.to_feather(self.outpath/(self.name + ".feather")) self.hasrun = True + self.cleanup() + + def cleanup(self): + self.cluster_data = None + self.mat = None + self.clustering=None + self.subreddits=None def get_info(self): if not self.hasrun: @@ -57,6 +65,7 @@ class clustering_job: return score def read_distance_mat(self, similarities, use_threads=True): + print(similarities) df = pd.read_feather(similarities, use_threads=use_threads) mat = np.array(df.drop('_subreddit',1)) n = mat.shape[0] @@ -95,6 +104,38 @@ class clustering_job: return cluster_data +class twoway_clustering_job(clustering_job): + def __init__(self, infile, outpath, name, call1, call2, args1, args2): + self.outpath = Path(outpath) + self.call1 = call1 + self.args1 = args1 + self.call2 = call2 + self.args2 = args2 + self.infile = Path(infile) + self.name = name + self.hasrun = False + self.args = args1|args2 + + def run(self): + self.subreddits, self.mat = self.read_distance_mat(self.infile) + self.step1 = self.call1(self.mat, **self.args1) + self.clustering = self.call2(self.mat, self.step1, **self.args2) + self.cluster_data = self.process_clustering(self.clustering, self.subreddits) + self.hasrun = True + self.after_run() + self.cleanup() + + def after_run(): + self.score = self.silhouette() + self.outpath.mkdir(parents=True, exist_ok=True) + print(self.outpath/(self.name+".feather")) + self.cluster_data.to_feather(self.outpath/(self.name + ".feather")) + + + def cleanup(self): + super().cleanup() + self.step1 = None + @dataclass class clustering_result: outpath:Path diff --git a/clustering/grid_sweep.py b/clustering/grid_sweep.py index c0365d0..f021515 100644 --- a/clustering/grid_sweep.py +++ b/clustering/grid_sweep.py @@ -31,3 +31,19 @@ class grid_sweep: outcsv = Path(outcsv) outcsv.parent.mkdir(parents=True, exist_ok=True) self.infos.to_csv(outcsv) + + +class twoway_grid_sweep(grid_sweep): + def __init__(self, jobtype, inpath, outpath, namer, args1, args2, *args, **kwargs): + self.jobtype = jobtype + self.namer = namer + prod1 = product(* args1.values()) + prod2 = product(* args2.values()) + grid1 = [dict(zip(args1.keys(), pargs)) for pargs in prod1] + grid2 = [dict(zip(args2.keys(), pargs)) 
for pargs in prod2] + grid = product(grid1, grid2) + inpath = Path(inpath) + outpath = Path(outpath) + self.hasrun = False + self.grid = [(inpath,outpath,namer(**(g[0] | g[1])), g[0], g[1], *args) for g in grid] + self.jobs = [jobtype(*g) for g in self.grid] diff --git a/clustering/lsi_base.py b/clustering/lsi_base.py index 80b7101..14bbfc5 100644 --- a/clustering/lsi_base.py +++ b/clustering/lsi_base.py @@ -1,5 +1,5 @@ from clustering_base import clustering_job, clustering_result -from grid_sweep import grid_sweep +from grid_sweep import grid_sweep, twoway_grid_sweep from dataclasses import dataclass from itertools import chain from pathlib import Path @@ -27,3 +27,18 @@ class lsi_grid_sweep(grid_sweep): self.hasrun = False self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) + +class twoway_lsi_grid_sweep(twoway_grid_sweep): + def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2, save_step1): + self.jobtype = jobtype + self.subsweep = subsweep + inpath = Path(inpath) + if lsi_dimensions == 'all': + lsi_paths = list(inpath.glob("*.feather")) + else: + lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions] + + lsi_nums = [int(p.stem) for p in lsi_paths] + self.hasrun = False + self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2, save_step1) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] + self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) diff --git a/clustering/umap_hdbscan_clustering.py b/clustering/umap_hdbscan_clustering.py new file mode 100644 index 0000000..6a4d2a1 --- /dev/null +++ b/clustering/umap_hdbscan_clustering.py @@ -0,0 +1,221 @@ +from clustering_base import clustering_result, clustering_job, twoway_clustering_job +from hdbscan_clustering import hdbscan_clustering_result +import umap +from grid_sweep import twoway_grid_sweep +from dataclasses import dataclass +import hdbscan +from sklearn.neighbors import NearestNeighbors +import plotnine as pn +import numpy as np +from itertools import product, starmap, chain +import pandas as pd +from multiprocessing import cpu_count +import fire + +def test_select_hdbscan_clustering(): + # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", + # "test_hdbscan_author30k", + # min_cluster_sizes=[2], + # min_samples=[1,2], + # cluster_selection_epsilons=[0,0.05,0.1,0.15], + # cluster_selection_methods=['eom','leaf'], + # lsi_dimensions='all') + inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI" + outpath = "test_umap_hdbscan_lsi" + min_cluster_sizes=[2,3,4] + min_samples=[1,2,3] + cluster_selection_epsilons=[0,0.1,0.3,0.5] + cluster_selection_methods=[1] + lsi_dimensions='all' + n_neighbors = [5,10,15,25,35,70,100] + learning_rate = [0.1,0.5,1,2] + min_dist = [0.5,1,1.5,2] + local_connectivity = [1,2,3,4,5] + + hdbscan_params = {"min_cluster_sizes":min_cluster_sizes, "min_samples":min_samples, "cluster_selection_epsilons":cluster_selection_epsilons, "cluster_selection_methods":cluster_selection_methods} + umap_params = {"n_neighbors":n_neighbors, "learning_rate":learning_rate, "min_dist":min_dist, "local_connectivity":local_connectivity} + gs = umap_hdbscan_grid_sweep(inpath, "all", outpath, hdbscan_params,umap_params) + + # gs.run(20) + # gs.save("test_hdbscan/lsi_sweep.csv") + + + # job1 = hdbscan_lsi_job(infile=inpath, 
outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom') + # job1.run() + # print(job1.get_info()) + + # df = pd.read_csv("test_hdbscan/selection_data.csv") + # test_select_hdbscan_clustering() + # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") + # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") + # c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) +class umap_hdbscan_grid_sweep(twoway_grid_sweep): + def __init__(self, + inpath, + outpath, + umap_params, + hdbscan_params): + + super().__init__(umap_hdbscan_job, inpath, outpath, self.namer, umap_params, hdbscan_params) + + def namer(self, + min_cluster_size, + min_samples, + cluster_selection_epsilon, + cluster_selection_method, + n_neighbors, + learning_rate, + min_dist, + local_connectivity + ): + return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}" + +@dataclass +class umap_hdbscan_clustering_result(hdbscan_clustering_result): + n_neighbors:int + learning_rate:float + min_dist:float + local_connectivity:int + +class umap_hdbscan_job(twoway_clustering_job): + def __init__(self, infile, outpath, name, + umap_args = {"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1}, + hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'}, + save_step1 = False, + *args, + **kwargs): + super().__init__(infile, + outpath, + name, + call1=umap_hdbscan_job._umap_embedding, + call2=umap_hdbscan_job._hdbscan_clustering, + args1=umap_args, + args2=hdbscan_args, + save_step1=save_step1, + *args, + **kwargs + ) + + self.n_neighbors = umap_args['n_neighbors'] + self.learning_rate = umap_args['learning_rate'] + self.min_dist = umap_args['min_dist'] + self.local_connectivity = umap_args['local_connectivity'] + self.min_cluster_size = hdbscan_args['min_cluster_size'] + self.min_samples = hdbscan_args['min_samples'] + self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon'] + self.cluster_selection_method = hdbscan_args['cluster_selection_method'] + + def after_run(self): + coords = self.step1.emedding_ + self.cluster_data['x'] = coords[:,0] + self.cluster_data['y'] = coords[:,1] + super().after_run() + + + def _umap_embedding(mat, **umap_args): + print(f"running umap embedding. umap_args:{umap_args}") + umapmodel = umap.UMAP(metric='precomputed', **umap_args) + umapmodel = umapmodel.fit(mat) + return umapmodel + + def _hdbscan_clustering(mat, umapmodel, **hdbscan_args): + print(f"running hdbascan clustering. 
hdbscan_args:{hdbscan_args}") + + umap_coords = umapmodel.transform(mat) + + clusterer = hdbscan.HDBSCAN(metric='euclidean', + core_dist_n_jobs=cpu_count(), + **hdbscan_args + ) + + clustering = clusterer.fit(umap_coords) + + return(clustering) + + def get_info(self): + result = super().get_info() + self.result = umap_hdbscan_clustering_result(**result.__dict__, + min_cluster_size=self.min_cluster_size, + min_samples=self.min_samples, + cluster_selection_epsilon=self.cluster_selection_epsilon, + cluster_selection_method=self.cluster_selection_method, + n_neighbors = self.n_neighbors, + learning_rate = self.learning_rate, + min_dist = self.min_dist, + local_connectivity=self.local_connectivity + ) + return self.result + +def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1], + min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']): + """Run umap + hdbscan clustering once or more with different parameters. + + Usage: + umap_hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_neighbors= --learning_rate= --min_dist= --local_connectivity= --min_cluster_sizes= --min_samples= --cluster_selection_epsilons= --cluster_selection_methods= + + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to feather data containing a labeled matrix of subreddit similarities. + outpath: path to output fit kmeans clusterings. + n_neighbors: umap parameter takes integers greater than 1 + learning_rate: umap parameter takes positive real values + min_dist: umap parameter takes positive real values + local_connectivity: umap parameter takes positive integers + min_cluster_sizes: one or more integers indicating the minumum cluster size + min_samples: one ore more integers indicating the minimum number of samples used in the algorithm + cluster_selection_epsilon: one or more similarity thresholds for transition from dbscan to hdbscan + cluster_selection_method: "eom" or "leaf" eom gives larger clusters. 
+ """ + + umap_args = {'n_neighbors':list(map(int, n_neighbors)), + 'learning_rate':list(map(float,learning_rate)), + 'min_dist':list(map(float,min_dist)), + 'local_connectivity':list(map(int,local_connectivity)), + } + + hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), + 'min_samples':list(map(int,min_samples)), + 'cluster_selection_epsilon':list(map(float,cluster_selection_epsilons)), + 'cluster_selection_method':cluster_selection_methods} + + obj = umap_hdbscan_grid_sweep(inpath, + outpath, + umap_args, + hdbscan_args) + obj.run(cores=10) + obj.save(savefile) + + +def KNN_distances_plot(mat,outname,k=2): + nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) + distances, indices = nbrs.kneighbors(mat) + d2 = distances[:,-1] + df = pd.DataFrame({'dist':d2}) + df = df.sort_values("dist",ascending=False) + df['idx'] = np.arange(0,d2.shape[0]) + 1 + p = pn.qplot(x='idx',y='dist',data=df,geom='line') + pn.scales.scale_y_continuous(minor_breaks = np.arange(0,50)/50, + breaks = np.arange(0,10)/10) + p.save(outname,width=16,height=10) + +def make_KNN_plots(): + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + + KNN_distances_plot(mat,k=2,outname='terms_knn_dist2.png') + + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + KNN_distances_plot(mat,k=2,outname='authors_knn_dist2.png') + + similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather" + subreddits, mat = read_similarity_mat(similarities) + mat = sim_to_dist(mat) + KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') + +if __name__ == "__main__": + fire.Fire(run_umap_hdbscan_grid_sweep) + +# test_select_hdbscan_clustering() + #fire.Fire(select_hdbscan_clustering) diff --git a/clustering/umap_hdbscan_clustering_lsi.py b/clustering/umap_hdbscan_clustering_lsi.py new file mode 100644 index 0000000..09b3630 --- /dev/null +++ b/clustering/umap_hdbscan_clustering_lsi.py @@ -0,0 +1,114 @@ +from umap_hdbscan_clustering import umap_hdbscan_job, umap_hdbscan_grid_sweep, umap_hdbscan_clustering_result +from lsi_base import twoway_lsi_grid_sweep, lsi_mixin, lsi_result_mixin +from grid_sweep import twoway_grid_sweep +import fire +from dataclasses import dataclass + +@dataclass +class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_result_mixin): + pass + +class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin): + def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims, save_step1=False): + super().__init__( + infile, + outpath, + name, + umap_args, + hdbscan_args, + save_step1 + ) + super().set_lsi_dims(lsi_dims) + + def get_info(self): + partial_result = super().get_info() + self.result = umap_hdbscan_clustering_result_lsi(**partial_result.__dict__, + lsi_dimensions=self.lsi_dims) + return self.result + +class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep): + def __init__(self, + inpath, + lsi_dims, + outpath, + umap_args, + hdbscan_args, + save_step1 + ): + + super().__init__(umap_hdbscan_lsi_job, + _umap_hdbscan_lsi_grid_sweep, + inpath, + lsi_dims, + outpath, + umap_args, + hdbscan_args, + save_step1 + ) + + + +class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep): + def __init__(self, + inpath, + outpath, + lsi_dim, + umap_args, + hdbscan_args, 
+ save_step1): + + self.lsi_dim = lsi_dim + self.jobtype = umap_hdbscan_lsi_job + super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, save_step1, lsi_dim) + + + def namer(self, *args, **kwargs): + s = umap_hdbscan_grid_sweep.namer(self, *args, **kwargs) + s += f"_lsi-{self.lsi_dim}" + return s + +def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1], + min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all', save_step1 = False): + """Run hdbscan clustering once or more with different parameters. + + Usage: + hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes= --min_samples= --cluster_selection_epsilons= --cluster_selection_methods=[eom]> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. + + Keword arguments: + savefile: path to save the metadata and diagnostics + inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities. + outpath: path to output fit clusterings. + min_cluster_sizes: one or more integers indicating the minumum cluster size + min_samples: one ore more integers indicating the minimum number of samples used in the algorithm + cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan + cluster_selection_methods: one or more of "eom" or "leaf" eom gives larger clusters. + lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. + """ + + + umap_args = {'n_neighbors':list(map(int, n_neighbors)), + 'learning_rate':list(map(float,learning_rate)), + 'min_dist':list(map(float,min_dist)), + 'local_connectivity':list(map(int,local_connectivity)), + } + + hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), + 'min_samples':list(map(int,min_samples)), + 'cluster_selection_epsilon':list(map(float,cluster_selection_epsilons)), + 'cluster_selection_method':cluster_selection_methods} + + obj = umap_hdbscan_lsi_grid_sweep(inpath, + lsi_dimensions, + outpath, + umap_args, + hdbscan_args, + save_step1 + ) + + + obj.run(10) + obj.save(savefile) + + +if __name__ == "__main__": + fire.Fire(run_umap_hdbscan_lsi_grid_sweep) From c19079136428953fbfea7f35ad9ffdfc4c574deb Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 8 Jun 2022 17:27:37 -0700 Subject: [PATCH 19/22] add 2 more umap parameters --- clustering/Makefile | 4 ++-- clustering/lsi_base.py | 4 ++-- clustering/umap_hdbscan_clustering.py | 23 ++++++++++++++------- clustering/umap_hdbscan_clustering_lsi.py | 25 +++++++++++------------ 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/clustering/Makefile b/clustering/Makefile index 2ba9c0c..559a85c 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -1,10 +1,10 @@ #srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28' -srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh +srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40 similarity_data=/gscratch/comdata/output/reddit_similarity clustering_data=/gscratch/comdata/output/reddit_clustering kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000] -umap_hdbscan_selection_grid=--min_cluster_sizes=[2] 
--min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] +umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10] hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15] diff --git a/clustering/lsi_base.py b/clustering/lsi_base.py index 14bbfc5..84dfa7b 100644 --- a/clustering/lsi_base.py +++ b/clustering/lsi_base.py @@ -29,7 +29,7 @@ class lsi_grid_sweep(grid_sweep): self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) class twoway_lsi_grid_sweep(twoway_grid_sweep): - def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2, save_step1): + def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2): self.jobtype = jobtype self.subsweep = subsweep inpath = Path(inpath) @@ -40,5 +40,5 @@ class twoway_lsi_grid_sweep(twoway_grid_sweep): lsi_nums = [int(p.stem) for p in lsi_paths] self.hasrun = False - self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2, save_step1) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] + self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)] self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids))) diff --git a/clustering/umap_hdbscan_clustering.py b/clustering/umap_hdbscan_clustering.py index 6a4d2a1..5633d77 100644 --- a/clustering/umap_hdbscan_clustering.py +++ b/clustering/umap_hdbscan_clustering.py @@ -63,25 +63,28 @@ class umap_hdbscan_grid_sweep(twoway_grid_sweep): min_samples, cluster_selection_epsilon, cluster_selection_method, + n_components, n_neighbors, learning_rate, min_dist, - local_connectivity + local_connectivity, + densmap ): - return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}" + return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nc-{n_components}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}_dm-{densmap}" @dataclass class umap_hdbscan_clustering_result(hdbscan_clustering_result): + n_components:int n_neighbors:int learning_rate:float min_dist:float local_connectivity:int + densmap:bool class umap_hdbscan_job(twoway_clustering_job): def __init__(self, infile, outpath, name, - umap_args = {"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1}, + umap_args = {"n_components":2,"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1,'densmap':False}, hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'}, - save_step1 = False, *args, **kwargs): super().__init__(infile, @@ -91,15 +94,16 @@ class umap_hdbscan_job(twoway_clustering_job): 
call2=umap_hdbscan_job._hdbscan_clustering, args1=umap_args, args2=hdbscan_args, - save_step1=save_step1, *args, **kwargs ) + self.n_components = umap_args['n_components'] self.n_neighbors = umap_args['n_neighbors'] self.learning_rate = umap_args['learning_rate'] self.min_dist = umap_args['min_dist'] self.local_connectivity = umap_args['local_connectivity'] + self.densmap = umap_args['densmap'] self.min_cluster_size = hdbscan_args['min_cluster_size'] self.min_samples = hdbscan_args['min_samples'] self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon'] @@ -139,14 +143,17 @@ class umap_hdbscan_job(twoway_clustering_job): min_samples=self.min_samples, cluster_selection_epsilon=self.cluster_selection_epsilon, cluster_selection_method=self.cluster_selection_method, + n_components = self.n_components, n_neighbors = self.n_neighbors, learning_rate = self.learning_rate, min_dist = self.min_dist, - local_connectivity=self.local_connectivity + local_connectivity=self.local_connectivity, + densmap=self.densmap ) return self.result -def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1], +def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1], + densmap=[False], min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']): """Run umap + hdbscan clustering once or more with different parameters. @@ -171,6 +178,8 @@ def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], l 'learning_rate':list(map(float,learning_rate)), 'min_dist':list(map(float,min_dist)), 'local_connectivity':list(map(int,local_connectivity)), + 'n_components':list(map(int, n_components)), + 'densmap':list(map(bool,densmap)) } hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), diff --git a/clustering/umap_hdbscan_clustering_lsi.py b/clustering/umap_hdbscan_clustering_lsi.py index 09b3630..3149939 100644 --- a/clustering/umap_hdbscan_clustering_lsi.py +++ b/clustering/umap_hdbscan_clustering_lsi.py @@ -9,14 +9,13 @@ class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_res pass class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin): - def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims, save_step1=False): + def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims): super().__init__( infile, outpath, name, umap_args, - hdbscan_args, - save_step1 + hdbscan_args ) super().set_lsi_dims(lsi_dims) @@ -32,8 +31,7 @@ class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep): lsi_dims, outpath, umap_args, - hdbscan_args, - save_step1 + hdbscan_args ): super().__init__(umap_hdbscan_lsi_job, @@ -42,8 +40,7 @@ class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep): lsi_dims, outpath, umap_args, - hdbscan_args, - save_step1 + hdbscan_args ) @@ -55,11 +52,11 @@ class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep): lsi_dim, umap_args, hdbscan_args, - save_step1): + ): self.lsi_dim = lsi_dim self.jobtype = umap_hdbscan_lsi_job - super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, save_step1, lsi_dim) + super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, lsi_dim) def namer(self, *args, **kwargs): @@ -67,8 +64,9 @@ class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep): s += f"_lsi-{self.lsi_dim}" return s -def 
run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], learning_rate=[1], min_dist=[1], local_connectivity=[1], - min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all', save_step1 = False): +def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1], + densmap=[False], + min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all'): """Run hdbscan clustering once or more with different parameters. Usage: @@ -90,6 +88,8 @@ def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15 'learning_rate':list(map(float,learning_rate)), 'min_dist':list(map(float,min_dist)), 'local_connectivity':list(map(int,local_connectivity)), + 'n_components':list(map(int, n_components)), + 'densmap':list(map(bool,densmap)) } hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), @@ -101,8 +101,7 @@ def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15 lsi_dimensions, outpath, umap_args, - hdbscan_args, - save_step1 + hdbscan_args ) From 811a0d87c4d394c2c7849a613f6aec2d81e49138 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Thu, 18 May 2023 10:29:08 -0700 Subject: [PATCH 20/22] changes from dirty branch. --- clustering/Makefile | 2 +- clustering/clustering_base.py | 11 ++++++++--- clustering/umap_hdbscan_clustering.py | 2 +- datasets/comments_2_parquet_part2.py | 19 ++++++++++--------- datasets/job_script.sh | 6 ++++-- datasets/submissions_2_parquet_part1.py | 4 ++-- datasets/submissions_2_parquet_part2.py | 8 ++++---- 7 files changed, 30 insertions(+), 22 deletions(-) diff --git a/clustering/Makefile b/clustering/Makefile index 559a85c..6f25a7d 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -4,7 +4,7 @@ similarity_data=/gscratch/comdata/output/reddit_similarity clustering_data=/gscratch/comdata/output/reddit_clustering kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000] -umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10] +umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10,15,25] hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15] diff --git a/clustering/clustering_base.py b/clustering/clustering_base.py index ced627d..98a260e 100644 --- a/clustering/clustering_base.py +++ b/clustering/clustering_base.py @@ -21,9 +21,9 @@ class clustering_job: self.subreddits, self.mat = self.read_distance_mat(self.infile) self.clustering = self.call(self.mat, *self.args, **self.kwargs) self.cluster_data = 
self.process_clustering(self.clustering, self.subreddits) - self.score = self.silhouette() self.outpath.mkdir(parents=True, exist_ok=True) self.cluster_data.to_feather(self.outpath/(self.name + ".feather")) + self.hasrun = True self.cleanup() @@ -62,6 +62,7 @@ class clustering_job: else: score = None self.silsampout = None + return score def read_distance_mat(self, similarities, use_threads=True): @@ -81,9 +82,13 @@ class clustering_job: self.n_clusters = len(set(clusters)) print(f"found {self.n_clusters} clusters") - cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_}) + + self.score = self.silhouette() + print(f"silhouette_score:{self.score}") + + cluster_sizes = cluster_data.groupby("cluster").count().reset_index() print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members") @@ -125,7 +130,7 @@ class twoway_clustering_job(clustering_job): self.after_run() self.cleanup() - def after_run(): + def after_run(self): self.score = self.silhouette() self.outpath.mkdir(parents=True, exist_ok=True) print(self.outpath/(self.name+".feather")) diff --git a/clustering/umap_hdbscan_clustering.py b/clustering/umap_hdbscan_clustering.py index 5633d77..cf4acbb 100644 --- a/clustering/umap_hdbscan_clustering.py +++ b/clustering/umap_hdbscan_clustering.py @@ -110,7 +110,7 @@ class umap_hdbscan_job(twoway_clustering_job): self.cluster_selection_method = hdbscan_args['cluster_selection_method'] def after_run(self): - coords = self.step1.emedding_ + coords = self.step1.embedding_ self.cluster_data['x'] = coords[:,0] self.cluster_data['y'] = coords[:,1] super().after_run() diff --git a/datasets/comments_2_parquet_part2.py b/datasets/comments_2_parquet_part2.py index 1031c68..5b9a131 100755 --- a/datasets/comments_2_parquet_part2.py +++ b/datasets/comments_2_parquet_part2.py @@ -9,7 +9,7 @@ from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() conf = pyspark.SparkConf().setAppName("Reddit submissions to parquet") -conf = conf.set("spark.sql.shuffle.partitions",2000) +conf = conf.set("spark.sql.shuffle.partitions",2400) conf = conf.set('spark.sql.crossJoin.enabled',"true") conf = conf.set('spark.debug.maxToStringFields',200) sc = spark.sparkContext @@ -25,12 +25,13 @@ df = df.withColumn("Month",f.month(f.col("CreatedAt"))) df = df.withColumn("Year",f.year(f.col("CreatedAt"))) df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt"))) -df = df.repartition('subreddit') -df2 = df.sort(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True) -df2 = df2.sortWithinPartitions(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True) -df2.write.parquet("/gscratch/scrubbed/comdata/output/reddit_comments_by_subreddit.parquet", mode='overwrite', compression='snappy') +# df = df.repartition(1200,'subreddit') +# df2 = df.sort(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True) +# df2 = df2.sortWithinPartitions(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True) +# df2.write.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_subreddit.parquet", mode='overwrite', compression='snappy') -df = df.repartition('author') -df3 = df.sort(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True) -df3 = df3.sortWithinPartitions(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True) 
-df3.write.parquet("/gscratch/scrubbed/comdata/output/reddit_comments_by_author.parquet", mode='overwrite',compression='snappy') +#df = spark.read.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_subreddit.parquet") +df = df.repartition(2400,'author','subreddit',"Year","Month","Day") +df3 = df.sort(["author","subreddit","Year","Month","Day","CreatedAt","link_id","parent_id"],ascending=True) +df3 = df3.sortWithinPartitions(["author","subreddit","Year","Month","Day","CreatedAt","link_id","parent_id"],ascending=True) +df3.write.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_author.parquet", mode='overwrite',compression='snappy') diff --git a/datasets/job_script.sh b/datasets/job_script.sh index 5b5a7d3..ca994d5 100755 --- a/datasets/job_script.sh +++ b/datasets/job_script.sh @@ -1,4 +1,6 @@ #!/usr/bin/bash +source ~/.bashrc +echo $(hostname) start_spark_cluster.sh -singularity exec /gscratch/comdata/users/nathante/containers/nathante.sif spark-submit --master spark://$(hostname):7077 comments_2_parquet_part2.py -singularity exec /gscratch/comdata/users/nathante/containers/nathante.sif stop-all.sh +spark-submit --verbose --master spark://$(hostname):43015 submissions_2_parquet_part2.py +stop-all.sh diff --git a/datasets/submissions_2_parquet_part1.py b/datasets/submissions_2_parquet_part1.py index 77ae09f..d1a8a3d 100755 --- a/datasets/submissions_2_parquet_part1.py +++ b/datasets/submissions_2_parquet_part1.py @@ -58,7 +58,7 @@ def parse_submission(post, names = None): def parse_dump(partition): N=10000 - stream = open_fileset([f"/gscratch/comdata/raw_data/reddit_dumps/submissions/{partition}"]) + stream = open_fileset([f"/gscratch/comdata/raw_data/submissions/{partition}"]) rows = map(parse_submission,stream) schema = pa.schema([ pa.field('id', pa.string(),nullable=True), @@ -102,7 +102,7 @@ def parse_dump(partition): writer.close() -def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/submissions"): +def gen_task_list(dumpdir="/gscratch/comdata/raw_data/submissions"): files = list(find_dumps(dumpdir,base_pattern="RS_20*.*")) with open("submissions_task_list.sh",'w') as of: for fpath in files: diff --git a/datasets/submissions_2_parquet_part2.py b/datasets/submissions_2_parquet_part2.py index 3a58617..7dc4f74 100644 --- a/datasets/submissions_2_parquet_part2.py +++ b/datasets/submissions_2_parquet_part2.py @@ -29,14 +29,14 @@ df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt"))) df = df.withColumn("subreddit_hash",f.sha2(f.col("subreddit"), 256)[0:3]) # next we gotta resort it all. -df = df.repartition("subreddit") -df2 = df.sort(["subreddit","CreatedAt","id"],ascending=True) +df = df.repartition(800,"subreddit","Year","Month") +df2 = df.sort(["subreddit","Year","Month","CreatedAt","id"],ascending=True) df2 = df.sortWithinPartitions(["subreddit","CreatedAt","id"],ascending=True) df2.write.parquet("/gscratch/comdata/output/temp/reddit_submissions_by_subreddit.parquet2", mode='overwrite',compression='snappy') # # we also want to have parquet files sorted by author then reddit. 
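The hunk just below swaps the single-column repartition("author") for a fixed partition count over author/subreddit/date columns before the within-partition sort. As a minimal, self-contained sketch of that repartition-then-sortWithinPartitions pattern, assuming a toy DataFrame (the column names follow this file; the toy rows, the partition count of 8, and the /tmp output path are illustrative only, the real script uses 800 partitions and the gscratch paths shown in the diff):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# toy stand-in for the submissions dataframe built earlier in this script
df = spark.createDataFrame(
    [("alice", "askreddit", "2020-01-05", 2020, 1, "t3_a"),
     ("bob",   "science",   "2020-01-06", 2020, 1, "t3_b")],
    ["author", "subreddit", "CreatedAt", "Year", "Month", "id"])

# hash-partition on the keys used downstream, then order rows inside each
# partition so the parquet row groups come out sorted
df = df.repartition(8, "author", "subreddit", "Year", "Month")
df = df.sortWithinPartitions(["author", "Year", "Month", "CreatedAt", "id"], ascending=True)

# illustrative output path, not the real one
df.write.parquet("/tmp/reddit_submissions_by_author.parquet", mode="overwrite", compression="snappy")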
-df = df.repartition("author") -df3 = df.sort(["author","CreatedAt","id"],ascending=True) +df = df.repartition(800,"author","subreddit","Year","Month") +df3 = df.sort(["author","Year","Month","CreatedAt","id"],ascending=True) df3 = df.sortWithinPartitions(["author","CreatedAt","id"],ascending=True) df3.write.parquet("/gscratch/comdata/output/temp/reddit_submissions_by_author.parquet2", mode='overwrite',compression='snappy') From 07b0dff9bc0dae2ab6f7fb7334007a5269a512ad Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Tue, 23 May 2023 17:18:19 -0700 Subject: [PATCH 21/22] changes for archiving. --- bots/good_bad_bot.py | 74 ------ clustering/Makefile | 206 +--------------- .../subreddit_comment_authors_10000_a.feather | Bin 146762 -> 0 bytes clustering/fit_tsne.py | 34 --- clustering/umap_hdbscan_clustering.py | 230 ------------------ clustering/umap_hdbscan_clustering_lsi.py | 113 --------- clustering/validation.py | 4 + datasets/Makefile | 28 +++ datasets/comments_2_parquet_part1.py | 12 +- datasets/job_script.sh | 6 - datasets/run_comments_jobs.sbatch | 24 ++ datasets/run_submissions_jobs.sbatch | 23 ++ density/Makefile | 19 +- density/job_script.sh | 6 +- dumps/remove_duplicate_comments.py | 34 +++ dumps/remove_duplicate_submissions.py | 34 +++ examples/pyarrow_reading.py | 17 -- examples/pyarrow_streaming.py | 38 --- ngrams/#ngrams_helper.py# | 0 ngrams/Makefile | 25 ++ ngrams/run_array.sbatch | 19 ++ ngrams/run_job.sbatch | 18 ++ ngrams/tf_comments.py | 19 +- ngrams/top_comment_phrases.py | 69 ------ run_array.sbatch | 22 ++ similarities/Makefile | 146 ++--------- .../similarities_helper.cpython-37.pyc | Bin 10402 -> 0 bytes similarities/job_script.sh | 6 +- similarities/similarities_helper.py | 65 +++-- similarities/top_subreddits_by_comments.py | 14 +- similarities/wang_similarity.py | 18 -- similarities/weekly_cosine_similarities.py | 149 ------------ start_spark_and_run.sh | 21 ++ start_spark_cluster.sh | 26 ++ start_spark_worker.sh | 18 ++ timeseries/__init__.py | 2 - timeseries/choose_clusters.py | 96 -------- timeseries/cluster_timeseries.py | 37 --- tsne_subreddit_fit.feather | 1 - visualization/Makefile | 11 - .../data/term_affinityprop_10000.feather | 1 - .../data/term_affinityprop_3000.feather | 1 - visualization/data/term_tsne_10000.feather | 1 - visualization/data/term_tsne_3000.feather | 1 - ...ubreddit_author_tf_similarities_10000.html | 35 --- ...author_tf_similarities_10000_viewport.html | 35 --- visualization/tsne_vis.py | 187 -------------- 47 files changed, 396 insertions(+), 1549 deletions(-) delete mode 100644 bots/good_bad_bot.py delete mode 100644 clustering/affinity/subreddit_comment_authors_10000_a.feather delete mode 100644 clustering/fit_tsne.py delete mode 100644 clustering/umap_hdbscan_clustering.py delete mode 100644 clustering/umap_hdbscan_clustering_lsi.py create mode 100644 clustering/validation.py create mode 100644 datasets/Makefile delete mode 100755 datasets/job_script.sh create mode 100644 datasets/run_comments_jobs.sbatch create mode 100644 datasets/run_submissions_jobs.sbatch create mode 100644 dumps/remove_duplicate_comments.py create mode 100644 dumps/remove_duplicate_submissions.py delete mode 100644 examples/pyarrow_reading.py delete mode 100644 examples/pyarrow_streaming.py delete mode 100644 ngrams/#ngrams_helper.py# create mode 100644 ngrams/Makefile create mode 100755 ngrams/run_array.sbatch create mode 100644 ngrams/run_job.sbatch delete mode 100755 ngrams/top_comment_phrases.py create mode 100644 run_array.sbatch delete mode 
100644 similarities/__pycache__/similarities_helper.cpython-37.pyc delete mode 100644 similarities/wang_similarity.py delete mode 100755 similarities/weekly_cosine_similarities.py create mode 100755 start_spark_and_run.sh create mode 100755 start_spark_cluster.sh create mode 100755 start_spark_worker.sh delete mode 100644 timeseries/__init__.py delete mode 100644 timeseries/choose_clusters.py delete mode 100644 timeseries/cluster_timeseries.py delete mode 100644 tsne_subreddit_fit.feather delete mode 100644 visualization/Makefile delete mode 120000 visualization/data/term_affinityprop_10000.feather delete mode 120000 visualization/data/term_affinityprop_3000.feather delete mode 120000 visualization/data/term_tsne_10000.feather delete mode 120000 visualization/data/term_tsne_3000.feather delete mode 100644 visualization/subreddit_author_tf_similarities_10000.html delete mode 100644 visualization/subreddit_author_tf_similarities_10000_viewport.html delete mode 100644 visualization/tsne_vis.py diff --git a/bots/good_bad_bot.py b/bots/good_bad_bot.py deleted file mode 100644 index eb57ff1..0000000 --- a/bots/good_bad_bot.py +++ /dev/null @@ -1,74 +0,0 @@ -from pyspark.sql import functions as f -from pyspark.sql import SparkSession -from pyspark.sql import Window -from pyspark.sql.types import FloatType -import zlib - -def zlib_entropy_rate(s): - sb = s.encode() - if len(sb) == 0: - return None - else: - return len(zlib.compress(s.encode(),level=6))/len(s.encode()) - -zlib_entropy_rate_udf = f.udf(zlib_entropy_rate,FloatType()) - -spark = SparkSession.builder.getOrCreate() - -df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_author.parquet",compression='snappy') - -df = df.withColumn("saidbot",f.lower(f.col("body")).like("%bot%")) - -# df = df.filter(df.subreddit=='seattle') -# df = df.cache() -botreplies = df.filter(f.lower(df.body).rlike(".*[good|bad] bot.*")) -botreplies = botreplies.select([f.col("parent_id").substr(4,100).alias("bot_comment_id"),f.lower(f.col("body")).alias("good_bad_bot"),f.col("link_id").alias("gbbb_link_id")]) -botreplies = botreplies.groupby(['bot_comment_id']).agg(f.count('good_bad_bot').alias("N_goodbad_votes"), - f.sum((f.lower(f.col('good_bad_bot')).like('%good bot%').astype("double"))).alias("n_good_votes"), - f.sum((f.lower(f.col('good_bad_bot')).like('%bad bot%').astype("double"))).alias("n_bad_votes")) - -comments_by_author = df.select(['author','id','saidbot']).groupBy('author').agg(f.count('id').alias("N_comments"), - f.mean(f.col('saidbot').astype("double")).alias("prop_saidbot"), - f.sum(f.col('saidbot').astype("double")).alias("n_saidbot")) - -# pd_comments_by_author = comments_by_author.toPandas() -# pd_comments_by_author['frac'] = 500 / pd_comments_by_author['N_comments'] -# pd_comments_by_author.loc[pd_comments_by_author.frac > 1, 'frac'] = 1 -# fractions = pd_comments_by_author.loc[:,['author','frac']] -# fractions = fractions.set_index('author').to_dict()['frac'] - -# sampled_author_comments = df.sampleBy("author",fractions).groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments')) -df = df.withColumn("randn",f.randn(seed=1968)) - -win = Window.partitionBy("author").orderBy("randn") - -df = df.withColumn("randRank",f.rank().over(win)) -sampled_author_comments = df.filter(f.col("randRank") <= 1000) -sampled_author_comments = sampled_author_comments.groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments')) - -author_entropy_rates = 
sampled_author_comments.select(['author',zlib_entropy_rate_udf(f.col('comments')).alias("entropy_rate")]) - -parents = df.join(botreplies, on=df.id==botreplies.bot_comment_id,how='right_outer') - -win1 = Window.partitionBy("author") -parents = parents.withColumn("first_bot_reply",f.min(f.col("CreatedAt")).over(win1)) - -first_bot_reply = parents.filter(f.col("first_bot_reply")==f.col("CreatedAt")) -first_bot_reply = first_bot_reply.withColumnRenamed("CreatedAt","FB_CreatedAt") -first_bot_reply = first_bot_reply.withColumnRenamed("id","FB_id") - -comments_since_first_bot_reply = df.join(first_bot_reply,on = 'author',how='right_outer').filter(f.col("CreatedAt")>=f.col("first_bot_reply")) -comments_since_first_bot_reply = comments_since_first_bot_reply.groupBy("author").agg(f.count("id").alias("N_comments_since_firstbot")) - -bots = parents.groupby(['author']).agg(f.sum('N_goodbad_votes').alias("N_goodbad_votes"), - f.sum(f.col('n_good_votes')).alias("n_good_votes"), - f.sum(f.col('n_bad_votes')).alias("n_bad_votes"), - f.count(f.col('author')).alias("N_bot_posts")) - -bots = bots.join(comments_by_author,on="author",how='left_outer') -bots = bots.join(comments_since_first_bot_reply,on="author",how='left_outer') -bots = bots.join(author_entropy_rates,on='author',how='left_outer') - -bots = bots.orderBy("N_goodbad_votes",ascending=False) -bots = bots.repartition(1) -bots.write.parquet("/gscratch/comdata/output/reddit_good_bad_bot.parquet",mode='overwrite') diff --git a/clustering/Makefile b/clustering/Makefile index 6f25a7d..7ecefcd 100644 --- a/clustering/Makefile +++ b/clustering/Makefile @@ -1,218 +1,36 @@ -#srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28' -srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40 -similarity_data=/gscratch/comdata/output/reddit_similarity -clustering_data=/gscratch/comdata/output/reddit_clustering +srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40 /bin/bash -c +similarity_data=../../data/reddit_similarity +clustering_data=../../data/reddit_clustering kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000] - -umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10,15,25] - hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15] -authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather -authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI -authors_10k_output=$(clustering_data)/subreddit_comment_authors_10k -authors_10k_output_lsi=$(clustering_data)/subreddit_comment_authors_10k_LSI - -authors_tf_10k_input=$(similarity_data)/subreddit_comment_authors-tf_10k.feather authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI -authors_tf_10k_output=$(clustering_data)/subreddit_comment_authors-tf_10k authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI 
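For a sense of scale, the selection-grid variables kept above are passed straight to the clustering scripts, where python-fire evaluates each bracketed flag as a Python list and the sweep classes added earlier in this series cross them with itertools.product into one job per combination. A small sketch of what the retained hdbscan grid expands to (the values are copied from hdbscan_selection_grid above; the hdbscan_grid variable and singular per-job key names are just for illustration):

from itertools import product

# the hdbscan selection grid from this Makefile, one list per parameter
hdbscan_grid = {
    "min_cluster_size": [2, 3, 4, 5],
    "min_samples": [2, 3, 4, 5],
    "cluster_selection_epsilon": [0, 0.01, 0.05, 0.1, 0.15, 0.2],
    "cluster_selection_method": ["eom", "leaf"],
}

# the same expansion the sweep classes perform: one clustering job per combination,
# and the LSI variants repeat this once per included LSI dimension
configs = [dict(zip(hdbscan_grid.keys(), values))
           for values in product(*hdbscan_grid.values())]
print(len(configs))  # 4 * 4 * 6 * 2 = 192 configurations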
-terms_10k_input=$(similarity_data)/subreddit_comment_terms_10k.feather -terms_10k_input_lsi=$(similarity_data)/subreddit_comment_terms_10k_LSI -terms_10k_output=$(clustering_data)/subreddit_comment_terms_10k -terms_10k_output_lsi=$(clustering_data)/subreddit_comment_terms_10k_LSI - -all:terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi - -terms_10k:${terms_10k_output}/kmeans/selection_data.csv ${terms_10k_output}/affinity/selection_data.csv ${terms_10k_output}/hdbscan/selection_data.csv - -authors_10k:${authors_10k_output}/kmeans/selection_data.csv ${authors_10k_output}/hdbscan/selection_data.csv ${authors_10k_output}/affinity/selection_data.csv - -authors_tf_10k:${authors_tf_10k_output}/kmeans/selection_data.csv ${authors_tf_10k_output}/hdbscan/selection_data.csv ${authors_tf_10k_output}/affinity/selection_data.csv - -terms_10k_lsi:${terms_10k_output_lsi}/kmeans/selection_data.csv ${terms_10k_output_lsi}/affinity/selection_data.csv ${terms_10k_output_lsi}/hdbscan/selection_data.csv - -authors_10k_lsi:${authors_10k_output_lsi}/kmeans/selection_data.csv ${authors_10k_output_lsi}/hdbscan/selection_data.csv ${authors_10k_output_lsi}/affinity/selection_data.csv +all:authors_tf_10k_lsi authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv -${authors_10k_output}/kmeans/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py kmeans_clustering.py - $(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/kmeans --savefile=${authors_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) - -${terms_10k_output}/kmeans/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py kmeans_clustering.py - $(srun_singularity) python3 kmeans_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/kmeans --savefile=${terms_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) - -${authors_tf_10k_output}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py kmeans_clustering.py - $(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/kmeans --savefile=${authors_tf_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) - -${authors_10k_output}/affinity/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py affinity_clustering.py - $(srun_singularity) python3 affinity_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/affinity --savefile=${authors_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) - -${terms_10k_output}/affinity/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py affinity_clustering.py - $(srun_singularity) python3 affinity_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/affinity --savefile=${terms_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) - -${authors_tf_10k_output}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py affinity_clustering.py - $(srun_singularity) python3 affinity_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/affinity --savefile=${authors_tf_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) - -${authors_10k_output}/hdbscan/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py 
hdbscan_clustering.py - $(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/hdbscan --savefile=${authors_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) - -${terms_10k_output}/hdbscan/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py hdbscan_clustering.py - $(srun_singularity) python3 hdbscan_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/hdbscan --savefile=${terms_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) - -${authors_tf_10k_output}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py hdbscan_clustering.py - $(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/hdbscan --savefile=${authors_tf_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) - - ## LSI Models -${authors_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py kmeans_clustering.py - $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/kmeans --savefile=${authors_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) - -${terms_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py kmeans_clustering.py - $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/kmeans --savefile=${terms_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) - ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py - $(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid) - -${authors_10k_output_lsi}/affinity/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py affinity_clustering.py - $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/affinity --savefile=${authors_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) - -${terms_10k_output_lsi}/affinity/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py affinity_clustering.py - $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/affinity --savefile=${terms_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) + $(srun_singularity) -c "source ~/.bashrc; python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)" ${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py - $(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid) - -${authors_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py hdbscan_clustering.py - $(srun_singularity) python3 hdbscan_clustering_lsi.py 
--inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/hdbscan --savefile=${authors_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) - -${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py hdbscan_clustering.py - $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/hdbscan --savefile=${terms_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) + $(srun_singularity) -c "source ~/.bashrc; python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)" ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py - $(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid) - -${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv:umap_hdbscan_clustering_lsi.py - $(srun_singularity) python3 umap_hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/umap_hdbscan --savefile=${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv $(umap_hdbscan_selection_grid) - - -${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py - $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 + $(srun_singularity) -c "source ~/.bashrc; python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)" ${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py - $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 + $(srun_singularity) -c "source ~/.bashrc; python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2" -${authors_tf_10k_output_lsi}/best_umap_hdbscan_2.feather:${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv pick_best_clustering.py - $(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2 +${authors_tf_10k_input_lsi}: + $(MAKE) -C ../similarities -best_umap_hdbscan.feather:${authors_tf_10k_output_lsi}/best_umap_hdbscan_2.feather - -# {'lsi_dimensions': 700, 'outpath': '/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/umap_hdbscan', 'silhouette_score': 0.27616957, 'name': 'mcs-2_ms-5_cse-0.05_csm-leaf_nn-15_lr-1.0_md-0.1_lc-1_lsi-700', 'n_clusters': 547, 'n_isolates': 2093, 'silhouette_samples': '/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/umap_hdbscan/silhouette_samples-mcs-2_ms-5_cse-0.05_csm-leaf_nn-15_lr-1.0_md-0.1_lc-1_lsi-700.feather', 'min_cluster_size': 2, 'min_samples': 5, 'cluster_selection_epsilon': 0.05, 'cluster_selection_method': 'leaf', 'n_neighbors': 15, 'learning_rate': 1.0, 'min_dist': 0.1, 'local_connectivity': 1, 'n_isolates_str': '2093', 'n_isolates_0': False} - 
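The comment removed above records the winning umap+hdbscan run (silhouette 0.276, 547 clusters, 2093 isolates). pick_best_clustering.py itself does not appear in this series, but given the columns in that record and the --min_clusters/--max_isolates/--min_cluster_size flags the Makefile passes to it, a plausible sketch of the selection step looks like this; the column names are taken from that record, while the pick_best function and its filtering logic are an assumption rather than the script's actual code:

import pandas as pd

def pick_best(selection_csv, min_clusters=50, max_isolates=5000, min_cluster_size=2):
    # selection_data.csv holds one row per grid configuration; assumed columns
    # include name, silhouette_score, n_clusters, n_isolates and min_cluster_size
    df = pd.read_csv(selection_csv)
    admissible = df[(df.n_clusters >= min_clusters)
                    & (df.n_isolates <= max_isolates)
                    & (df.min_cluster_size >= min_cluster_size)]
    # keep the admissible configuration with the highest silhouette score; the
    # named .feather of cluster assignments can then be carried forward as
    # best_hdbscan.feather by the Makefile target
    return admissible.sort_values("silhouette_score", ascending=False).iloc[0]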
-best_umap_grid=--min_cluster_sizes=[2] --min_samples=[5] --cluster_selection_epsilons=[0.05] --cluster_selection_methods=[leaf] --n_neighbors=[15] --learning_rate=[1] --min_dist=[0.1] --local_connectivity=[1] --save_step1=True - -umap_hdbscan_coords: - python3 umap_hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/umap_hdbscan --savefile=/dev/null ${best_umap_grid} - -clean_affinity: - rm -f ${authors_10k_output}/affinity/selection_data.csv - rm -f ${authors_tf_10k_output}/affinity/selection_data.csv - rm -f ${terms_10k_output}/affinity/selection_data.csv - -clean_kmeans: - rm -f ${authors_10k_output}/kmeans/selection_data.csv - rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv - rm -f ${terms_10k_output}/kmeans/selection_data.csv - -clean_hdbscan: - rm -f ${authors_10k_output}/hdbscan/selection_data.csv - rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv - rm -f ${terms_10k_output}/hdbscan/selection_data.csv - -clean_authors: - rm -f ${authors_10k_output}/affinity/selection_data.csv - rm -f ${authors_10k_output}/kmeans/selection_data.csv - rm -f ${authors_10k_output}/hdbscan/selection_data.csv - -clean_authors_tf: - rm -f ${authors_tf_10k_output}/affinity/selection_data.csv - rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv - rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv - -clean_terms: - rm -f ${terms_10k_output}/affinity/selection_data.csv - rm -f ${terms_10k_output}/kmeans/selection_data.csv - rm -f ${terms_10k_output}/hdbscan/selection_data.csv - -clean_lsi_affinity: - rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv - rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv - rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv - -clean_lsi_kmeans: - rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv - rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv - rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv - -clean_lsi_hdbscan: - rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv - rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv - rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv - -clean_lsi_authors: - rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv - rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv - rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv - -clean_lsi_authors_tf: +clean: rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv -clean_lsi_terms: - rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv - rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv - rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv - -clean: clean_affinity clean_kmeans clean_hdbscan - -PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k best_umap_hdbscan.feather umap_hdbscan_coords - -# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py -# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS - -# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS:selection.py 
$(similarity_data)/subreddit_comment_terms_30k.feather clustering.py -# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS - -# $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather -# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8 && touch $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS - - -# $(clustering_data)/subreddit_comment_authors_100k.feather:clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather -# $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather $(clustering_data)/subreddit_comment_authors_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85 - -# $(clustering_data)/comment_terms_100k.feather:clustering.py $(similarity_data)/subreddit_comment_terms_100k.feather -# $(srun_singularity) python3 clustering.py $(similarity_data)/comment_terms_10000.feather $(clustering_data)/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5 - -# $(clustering_data)/subreddit_comment_author-tf_100k.feather:clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.feather -# $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.parquet $(clustering_data)/subreddit_comment_author-tf_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.5 --damping=0.85 - - -# it's pretty difficult to get a result that isn't one huge megacluster. 
A sign that it's bullcrap
-# /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
-# ./clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.9 --damping=0.85
-
-# /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
-
-# start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet --output=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather
-
-
-# /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
-
-# python3 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather --output=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather
-
-# /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
-# # $srun_cdsc python3
-# start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --output=/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
+PHONY: clean
diff --git a/clustering/affinity/subreddit_comment_authors_10000_a.feather b/clustering/affinity/subreddit_comment_authors_10000_a.feather
deleted file mode 100644
index 21e15e473bf8bf64c4dcee0a7b02e0626566d47a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 146762
[146762 bytes of base85-encoded binary patch data for the deleted feather file omitted]
z#8^g)_Qh3)#E0f>h}y8mj1KZ1*Bjv+d1K}hS=Is3F3SlIZfFE~?OM7v8%NlXJ{`YA zs)mjeIB6H0yje$s=H>Av6Z3CV=`m^utTL6en5R*=CP&;!bmKPLAbb?kz?jcWGxiBR zbD5q&e&WUsN7t+qrO?i_-TboXe>hpO4~ioQyI6E%Oc(9?*sMKq9+Y7LL41Q(Lmuh@ z%Uo!`YzoJnwt>+_w8LL7Gl6rvy6(MZ=4dRB2ai0*i|p|tk8LdQCU01Hm3MiUF^j-}CP(er$B%vaKBAOBe2}KD>5NRO_X;OuTqNHh}y!NHjCMA=+^yPie z9opBIK0bJ6^k43B?z!il<#&$S^q*5;Qn^(}0*%!+I!Mou+-zx0k=h(x%@o%#qF+W> zN6}&#=W@wGjx8XoYq@OijbmF<-u5E)R^u=Vop~!ZhK8`(gEsEH z_GWsnl+KgLF$rC(-(lf5^V*m}RJVxH;U>0EJ|)G7UKwV&iAKbb*B!)Y0B%c-=m+80 zfIgUmp@LG5=_MJXx)vfBEGb63NwZ-96G;6CL8uTnx0$}-%-|+gM3x8u1N67)wFx@y zm)ZjJGglAl&Nv-T)Wx(?k4t=$6qiVCE*zmRGZbQqR{?AJVxF}=q+{8((YWrSbDWyk zM1+hkc3i3z-uJsTy^X&D!zHd#-K*=;Xx8XQUTKa}Zxo#%ESaEQ#(2ynQ|dPHo$*U> zt0d4H7whFw-bU_K5u^CQ1G<~Z1YFyxbq2D@28~8`(%$T|Sdr9Vc(27B(+;;D#0^sf z{E&1JtiGg=hGu~0q%jt#%c%WE2BA7nve)STICuN>AS3smCxc^br3zyt>($#zY;^?M z0=++v+<_8&4oYUN!P-sL*^P!^-tA~iGCNI8&2QExQf-o*uB%cVlJC*(#85n_+^1iF z9~Lr%$_%|G?C{i*W5s9NZjE>7m>6#4Jll@;lIwvbNx^Gz`N5PlS8I5X{pyzxAh{m> zN|AAl!=K-#yO_#5yiuHI=f$AcTw=9D68)&n!ZSip1HvQC`%}T^j8taREtAw_=idmJ zy%qF3gw#=UiO|;U+ECmACK;K;is{kmO_B6&eUpuKBGiUv`zY$nVrgBa-wt{7dobU8 z_QuJur}rmIIR9^ejI@IVA=T;nUWi}UflD>MfH0v=#L}g;Tgv!bj@ook>F{utLwXbJ za9_P$Z|2FwD2qM=yTsui$9e>s&}R%#PUDC(^^{P4rc-ZCLljTPsy1(~%}gho@}+vV z0(KUXM6R-dgk!varJi9RM7JUvC4H`hy?QBzSi~y>tlbK!;Ro46FS_An*l`4X>!TUh zT)s=vK~w?`nK7tZdKc_KF9{cKc1c&)R5(G=!7i~VclHuMAJ|<%{Wvt~8Sv=G6Yhl= zCJ}+8cDU>E^vn^|qcMa}FEE+u`Qf=)N3R5yOKO?;FLQ9dQwFk2 zq7zP#)+#B?vj}{|xogp$Sd}k?-Dlw9&j#FIdF-4icJk4Ry)_Mpm9)kiHlQ*={}fSnJzX`P;CrbBJ}Tom(RTAWSXx%y=r&mfB6?k!0TE2VwKho)q7q;9?97sZKM;{s1S)EE!lN&8W#>p8uC5JUa(xGsHY41uQ38 znruKy?n)BZ{1Sa1K~g`Uq+l2fFLF6ZgE zt8JR+9hmg}C@rtlvr{b|*!NzTc2y5GP=4kU18}A>zElTQ`%#cZ<~%$rM;NU2+PcDr%wb&8lbc^L2?r)p4Ab`f8a_$6kqHGWF8tBmeQHWeZYe$@dH$yy|#ow>bLirD9iGJ=Wl3auh zZuonX?{u5R(Sjw4YVUx z4pUg(Z*I%M#rj(wN1~<)&yO{-D4`u1p!0IKdPBA=#IxT<+eu?M8)EDA=Vj0-Hvl?WQadzQ3mJn`^}# z${IUKcB7Q$n?$6e4_Yk1*6tE#l2>P@IV;1Gyw_X<`FkaCgAKX3OV5vDNFa~=s7jAW zd5+6+%=Lq4k>qwD4pO*9XC+)WDUwXim|uK03_$Rl;1V0Zobhx#(e7=n(zy}F5S0WS zOp`(U#=RO2!~Z=v(e+!YHXYBLqhs{343+}V)!x&2OraW0)Q}{VVL+CIVgvBC z=7Fa`z>NMPFvOr&Ylti?{0f3C+@%*XI>=Yqq=NxEo~QS)Pu`0`jFA$%MwbrRV6St5 z+srl>aHvuy1OYNIuQ8>ek9|vw`}f7&Cio54K#WBKsQ6p-(`QuZ;JRg>VZLQYObRlzn8#c%;2{0Z(WlZUfj`bP%MM_Hu%}MlAL=ZfuByw zB$y@;QuB?&knQ!gmS1oHKHJRthjv=dh2f~$nskL`)o%+$M=`H%g*vqIJ6;6;vW=1V z7TeUEV|NBwIIlyahwv1M#vw%H77{%rh3bXNCp}RaUDrZS zOnSYQez;vZWqweJpQmH%P}V?e0Im!fEs6OO7_C3cG7v&j5Zf!MZ4zb2ED+B{BsN)K zxJph&ieEi0OEDHU-$mgmy}ae<65TD0eKsRfvT#L~Qc|5@#JM`bJhGUm!=%7&65@+e7HO6s)o``__Pl2opqI!WPzFs2`9;us$%g#Xk zR?@~#^Eo!9gJ6Z@dlntL*msUg5KIHf>asrS=ZmKYyG?J&a}unO47SubNsl7~Q&OAEYmu2n zCXTuJc)CdhXMz?Hps|pPUuADI4i0jml`vx@==9y90F6@0Vw(H^c4s*05$4w*M<@+> zMFvFg47t=PYu1C(Pk;bMhNb(YuuKN#>qYG1FV>+{hhECNzE7DiI@{(AcsiJDX1v5# zYcl5a*(>6-jT_NIMx^x9;8CDN`jwQ|B`X(aY9fiQL@TZN#?9|V@p_=cuNH!Y_tS6T zs6N0rYQ6d(Ej$%No2+3|tuq(d6hL9|C3q(-%Z$l>Hx7+`#(Gj-pud6I(H&Br&Dyl- zS=JL;;6DHmm1I!MrM^otDJ^8&a_&oDy~k{kJ$T5*Nocu*&+6V1@nU%MQwjIbj~PjJ zg|4k4rNGmlv`$CG_(1Z^&AQL;T&<+pR(kZ)^d6ofrP1IjbR&I`j?0#j;p+G0imsW+me@h@->;kK_~`uAgD*v*_=~be`?j&+)5B*y|?A zot5A%65fSWHwJ0?^=aCmi{f}4l_qEs@c&?OAizg!0sASfenqL6i>7-BzUmWMq zRTfkZc{G*mz*ujW7PgJ!c~M0|yLfa zwrK~XD_FL^L2x4MrUxVk6wSGIxL74mv|X}iZ6*W!u~}cOAV_EFSCP}P9;pT-^CZ#w z(JylD(&R4O*9cG)p2CGv1OE$E7TYqu@8<}+Q8L5_o$uZUO!cIxTHrQc$-CA6GSu+h zq(A3$`bWCNcrN)hd)C0*ay!zuwVubcl6Q7lq(U*`X3@py}Af zz0?puc17)++Ma}ou%rOj4hIc@F{`gLamMv^TLOqTO=_1~7HJG73kBHWZTd!*e0PXF zHqNF4w>U}a$0aw57>Fe$n=&51O(HJl@&SWIRWySbbN9`dWgUni{bTrt*6GI?+)oGF z^dD+W>n79p>G>}c@XZWF=xaTo`YMoJhhgaTl7j-E^ zPdY4@Z>n=bH^wt614w`;{zwUM!$@wxSh!+9}_=Nx8Zs-T*Z_5 
zk%pc7QO?oSwlUY5=YT%dzh~@U2;+7NO5n3NoFx@B_+f*>Cl$DpFiN{V>cN|FzP%HG z%`TK;Qh!QQh^*9TBTF3nq)mSYJsNn}Nv58!t)S~8%J%htGNbqf1>{w<1dc# zTP>9x`+xAP-aY8S(+0^OmaL&^kqBPB1ga!P8w?d@8c2ZGW?z=vr+=k`bcg zjTrb^>F zJ21dlkZsUTR{$BHc_AdN<@(zY?0Sh6y`XfH6x>GQSc=^*p3?@tqz_U^h~%SObta)V zi6htHSa!2ElaCj3+i|PuS*R13Z(iaq2*-Q5Ddv(uK^Q^yq-i?mFw`Bik=dg&s<3ZZ zXP41SXvEX0i>x7^)Nv`yay15MAv(tLQ$h)PAqyNAGm?^kJu3e=9R>X_jA5Sf)Z1JZLV-q`lPERHkzKlf)z)51SgRWL~kYB%@HZ#^9>J_f7&LIx{XnE8=a(2 zYhtx_UeCaYc*h7o>@b+&YjnrEbR2-ZzJMf1ir;LzULtrBx=DeM#umd7dlnlnHi*>3 z;C5VT;QuC>*u(+NaQbKnFEHVN{8as=kG(cRa)+(7#{NK;B#ufH0R9T+}KWaQ*29t-nr^U24*xmUexmDhbNID&37$9izL>gAyWH5q(%@RmS?L|GxrLveWD0!8Zjj6^<>y%Ngwq4ic;F8C6 zEz>EY(~XksXAGG2{vZL7-R6RhOw}`WXt7b)lIx*Ly-$Mc%qcfW=xQv$WL^V~$mAOw z_eV*4f;Lu3i5~hsL%YFijKLe185%D-S_b;9>*7L*v*|9)+9Q zXqq0)F=h^HFrHatF=rC-oM)|{5He<|UK!`jJtlFvRJ|NXj8rZ#lKxGoUH9;cXN)ml zX#w|jFYn}7-cb@bW{AaGc@01+I!*w~Fh)x}f4d%Fti>)}6oq`u3@2R$BVuZ&RQ9e0?S8rl@vl|k!jS1ZDwK zXp1w5%z$nwKSo{#F)iY%4yMnG#EA{YGFmFTO~Z(skp_ycU*dmaL%t8G!b9K%{SdxS z2C#_7Yn}!DB^uXfU<5kyj6*Nt)N9j+pjkM=vi9lQcw2VHSQ?_<#Nlp`=2Mu72HUZ8 zxg7pDWKGa24%%T78ttkUMu+S47@izIE*k@|bIke3%wX$cjI*xeq=4fa=Q{vupI$M@ z$S&3=F`2&t2YnHJsCK*l5w!Xp`vDn{KVz73I~V`K2u+0dTq%r|x(gm=7)f}W6jMb1 z4SqE$%ZhB2)_TyQ61>6$V~S+rs*%@mdD<_Ysy@j~3te364)8}u#~W>4Jb*cpyTV4P zT3nxEO0{mOUrkbGn{JF@M6%_kgR{i83!$?C4n_!bRFE(LPIa6sTnTVQOhaK&Eetk7Y8v+oKoxf08 zN2Rc4$OLrKlk=~+#3CD6mwf?9qRR7yhpmZXyIz)Kn5RHqVDyZU^mJn>?wH<}VHHrsTBZzVA0=*17O`gL-IGE`8W3p|AB%S6({f%vhQZTc1sUdkp+p)xTdOyt01 zeUFvM@f7smZ7~t!MsC(ARWwEK&bcWh=aT&b2wfxLcG4Ue7Tw{VDv>Y6L4}UkE71!a zvXIvkS-M7#z)MEii~DUbseT#wzNGd^sUH(U<3R?2x;a@Nt`KZyH=*33xp@fMjMnd$ z-2|e~n>O<%lpzAn;gSMp(W#d+FD$$KTq^TG7W_qAXl45>$_5X4h@$?vv?%a%gFeWn z^e@nNV@UVA^chxzn8H`XbatL?vA}4P&}qZMOEyG;nm&f=e=tQLz5#pLVW}Yq7Y`Bt zpfnIOYckp&@e&Cj;0P-#^(R3#2G8VyX;NCq5(K5XM-pg=??`sCiCSp>cHVv&KqH2; ztyfpq`MlP`$_&jv&yef-vkEM>+S1J&e3{N(su4Ayz$~E<-Vc98Gb1!z< zss~z8DbkasGI`uEKj-O!d1CcD)+2HtYgmwC)`f-hEA*e@to0m;Ezov&D)FY(x3c9a ztkB&>{vzG*v{XrwjkwE9?h2Z06(x9{WPu0(p6k^0DeRo}`pX2{>hos4qTt|d<(vZ< zRk}kOv!t?I|EYu|nkk+Mrm>cW8Hg%~^r0ZipAnMNPY2rcnHGmn&_H6vG8s#fxskLn z3{d?x8xvI#ed@O?8X@BxKD(U~d_q#IY`BekH3i$BVWOC_X2kkg0;0W0?&11pO*UJ= zkTg7+#4VGA7I?@_p6M*sh?#Qe|33fW90^=lV&%W{nS`A)>{DKvnQ|pq``WF$Kb7Xy zdVmFL?gZtisAc+S5HWJTgg<8I0M8UUTuyjsqPEk#kJ1keu#2`xfKA*5EWJcsspXSET5O2*&i$>4K*t3yC|VOayA4ju;AWXC~W*WvrF!ku9Je zoPxpGI1+#W-UO+AO3LIUj5DMK2;E6(a<#b3r2TOgF{!=0%!MY?o!4QNtJ3TR=*nfB zq9mdIB!e3X1FlEMFq+sVEAi?|X0PE!V82-<=_mDF(_~3AVvglMOP)*6)U9rr&8Xh zrDBH$ldS&?y=*5QZMRm9^^)H!K5*@#fAS8{pNKM+0zwh$(Awq57a?jTSWs_0CS1Xfy5g1~k! 
zl-5dXEFmXh`Np%Z)Yh7V5Mli2@4h9*c$Q=mups2xi_v8Eo`{N&0nm zSbmJ23o!!}m>6MwSK?CMA%O>^^*u6f;n&3mn9FQ%4K6j^UN5W~FL8uYg`v)*sXd8F z4Lz@u_&205$tA>yTDq*GpyqMBb948ow9U2Y0io~Y8)b|<+WOWb@r(hNTvoGPB^GpHApOk^E z=8$QP(8I6=n{IEzopuFKHwB4ooxirz)5d9azOM7bHXwXJGt8L!g<lJt0~9n_a1^b*kPxeSWJkBKu+H!>dl z*aKmW1?YUV)sXBT+3^FXkpsHJoT{JGJ87n8nr90uw3$`QkL-Jrw?ag%LL5L`7-mHS zBzvXQwg_0BlH1pt;-WivhoX9n zM+U}GMno^+uOjKN;?i+y2PJTg4BP{#qIbn{k2fC$qY4Nnb#CtgsDM3M>9<&5YLWzl zfcq>zU0;K@VAP8l8VmJ28CH+gAPD0PQG*^G`C==ykzi}G7{sxA^^yt;wMkFrcx;Bw zV@&lR`ImYpO%!+PrDadwWj-E3CSD=&nL3g7_*~y)xJMQrEwjP6ao1fjz%ye(B1-lo zE(llB{)z(EW*8gf!e$;fr|V5k=DXMmDfz(*T_T~fCLJo%D@lSd0w`LvUL=l!GX2Q_ z)_9@KV5bI&sT{!Qv3fI+k-)eZ$lbcH;8-}k#T&f4Bi^P57!1k`1WOn_`L{i6C^3=o54V>eGX-i+qW#$hbMKn|ZG2O3yE2 zo{@zfN(Jo!n$!u94hEj10*JrFgY=XPKy4PE96(o#}&W zbFIo@e4$0JYZG1l*;}|=Ge5A6aSJ5e$y>guwv`6Sn5uN>ambWnWLHRXv3@jd4~lP< ziQaWVX39!!q5L9_p$^T481XgYndt(&B@)&ACd(U>fOtuIr;CP{R!Zp&eH61k)38&~ zcHE0hp$d}AvXT&ZdKdjALE7)eog|HSr94qGWIL1I2AtIoY6Lc4qJNw8^uOEjGNd`_ z!kuJ!GBlbBoGaB=wpVJ8_ZNoIY7hs{Ov%2YyI4NdON)Lt(YJm20F%j3 zD+n{@5zE2Vhj58m=rfFmF9rxqFDOE)Whf4GqmRTKA;LXUyQ=rS_dVVy@%v@e)vzFg z`B7*tNm4y_E_TOKDRfKXw3Vi*by>2PiS_&_{S;5d7hC=Ne~&n);~-&Az`_wnh9{T{ zpMINy*%w&xnFuP%ttJsH?UeykhwBThTx+=G^A=l7SY*no)UB@J-XaUaoxSyq(%7s| zrA}$Ujye%M-1EAs34$Xr6N7nxtQb zV0!`)1l^WOX_dAkb`tkXd6Is9khg#z6UCw8KTBF!U_j*b3G)VP%GYU?c-eYZI+WAZ z0aw@;-1{wSTK%S#;Y)W}_@-2IN3VJO@vg#WE`YYH)vi$=>lLycI-v!-@;WYIf^|5i zYNIUo1FL9$68W{&63hQQ51G!lMZm-Ih98}3jeaT2LcVXhabT3b%u;`~#=$T{l9yTV zKb$h@>AzL#)+Kg;J{~HZ7u3iAc1Iu*5Z%}U22?uHjA3}sHrI}8V`cZ!&@{_ z;#XLjd8_~m+1<|9HQTEGa>&C zlKQOK%Xp?&f$leVz z3#ZHxzbv=8z*1|jCOvXw(bo^cyI5^dL!bQ12i zAa<7GHgRK_wHfx;{gOCrbBc|(1;h6rd!3>WZckkv#S%Ad**bvR=(EYvnx$G&X?&6@d+oZ0^~^0b zPYmeZV5!|;%^h!q(f+Ia7tn)gv zE+A?mFyeModMlXU6ZGRn)L)Q+lPuM9M-n6t)COQ2epo*UqGFNe93;EiQar@JRgy1A zYl&sbU~;!8GPKkjQ7t@{h|IB8cp0anh@odvB>1CH?zMZkEBgjIrO@fB z<=YVMA^}#{GF143y+HmSWi$S+0ZBI)EXp4c&t&bXJHS+k^3=4RJx#GWb2WV*9ORW5 zX9T7C^fV_af`|$sskPEPsIS*)gf7vO#_AhJnIIVpI-=-Aq}j+!W=jEPBmHNN6u+t4 zVKKO!ge=%$z7!_A>c~GIV7?#*iOgI__J&QP(PJT^`g#q;*>3^8ujxclF`>mOcV~mL z*HKYcn};}%*&U$K_mRwGTr)r6z9)we3T&h6@~C##iVsh3*pWVxGOA@5IXff+alE9^mvU{;5O(otZMHFV>TcDUm5~nPV6u|wnrIxS)i}ehi)E$x_UU|$4 zwpM0I>VT`DgmXVkFDC{t=O+<n2w2rwIVC|3zlgtq5{n9^s zybN!QbtM@HaCLI_hW{^s8y?3l>cbWV2YtJW@K{;6--D4~5NW2O8H1e&SV!N%&g z3>R9B<|MSwDuu+LGd4U`Z}91S#3}(^^@3;0e0=Mspb!WfP)RSKev-sLg5z2jz-tSc zgOAf-Y=u*AsM_#0x?}klGugs8Jrrf#(6m@0P}}f+@@hat28?29mn7(N7RI&>F_uIH zlae{)@Yv~vF;CxeTEE@nAgoJ6j)Bwgy_M>1(;M09k;8T%g`Ke$9Aa?JhCa)Y!S&I? 
z`3*d1qGY!dI+glj{TJpP9Ai~=r7_=%kp^9L+eyQO)f^Pt8O2a}d>nS--BoXb|7wPIaR0V8uz2tjy#Ay@)ZcH(O#Yvc%=Q4zhd2 zg_|>^Ydvc2Xn-dk2pu8OaJs~QOE|`Wn|{#8uB&>dMw-YrbnnyZ1*~$Io`(n)it66H zr|)tuuty>%ET8I!G1v_|VzORAqinR|Fis%7GUOTq;1S0aJH`%k$|2%V8W^uv$5?Hm z7$bC#J6H8LAsi*^+&ri0a{9`3=s!V|YndCj*jWThyWsxshJoMGdzmzf1$vNn$au;B z$ch*>snQQv)@d`bg+vGS3~!TXQo2=&SQKCvkQgI$dND3;{b?YguzW8X+*E-*+y9tim^-zlN5 zeoU{Aa7r%&9Lt!gEVoutACuI};(37FPrZ@xSLSPmSLv_ul(jP{iWP={zSzPNz7Y#*+KW@k%AQ>B2RF0r0w65Y&^0>ndvp*cQFE>jfeQ)RAS;ln-Bg}UG#}^FY zU5kMsr@J&(=r~Y1oi@MvGO1o;YE_188URJAFBMtL{g&!aU;wKw=cZZ>N(eB>3jJIS zb&A%nNW$Xd4DV@qq~$}dIMV{UEZnX4GPwOwpFR^unh{xkz`#3@48^=M(lo1OC?pMR z`d%ro3%M!!*)k%*YZ_8^g2`sciT9I_iAnl!9yMu> zevlSVyukeXE6@)lZ>Ra09>)I*or0u-qvejMVS) zS%G6+9U)OJrn8w}_Ie3W)$k7eG!OYXBPcjFPtv3WTxaP?571Sd&5vv&-{-Iu|4Zfg z4uLeIGTfMra!@3aF>5cm$G&^UF2!bY&uT2%uMR&R_z~2M(Gf8Qj-?5=E*hw^=s|$25=o7{t@Mv4yjTvaph!#nbNKs3t}pp2 z`|cIplA%c)$W*bq#IQ!5Ya}0&fdhQegmtX96c{eT5;0q#YIc@Srx=?NrbH0It8*e8 zFi22{Ri<7>mEn1%c0vZA5I#;$2!MVZ-l(DgEaFmnUP4}r2pTtzmQ13z>$SFNo>%>N zk=97-VQEfOe;s~aZP0r?<{7Rl_(*1C^+HP^1EFa0y`C+z#}`X6>b@xBoggbTo=hAB$R7OxgEjakW(JOMFbVsu+UmJb zfZa&C2|kW{BSbl8xl&!soI8z$y0e7jM5m*;LfvLhwTL4~lON_F*;Ms4(amp_=I@P+ z=9CVSgF>|b0?9pSCJ6AS(U(>|9SO_>g!5GQr3_JAhuylN!B|)Fy<1>B^fW{Dt+RTy zf%D8q9vPl}yVLXk{`Y(Svnzi7@x92NP7lf;pSs;+R8{16h=0PrHvXmB`N4lp_tWD^ zFZuAoho3#3F840_34eJ!pXH_hc-Pvr*6p6p{XNvLjVZnN*nz#*dpyT^{^tI}e5-Y4 z&;H{_Z@Bi@_5DkOyRVuZ>>v8L$K5;l2Up$4oBNOP9q>Hf&(|lPIC$W|;gkKmcj)lJ zqX+ivxEP)SH9r=dcGI?zxVlHZu~)yhmW1SV9|f??>{g9VRw!WCl2mCdHe(;xViu6 z;e99eo;YoDPM_X;f^U-e=-07 diff --git a/clustering/fit_tsne.py b/clustering/fit_tsne.py deleted file mode 100644 index 55d7239..0000000 --- a/clustering/fit_tsne.py +++ /dev/null @@ -1,34 +0,0 @@ -import fire -import pyarrow -import pandas as pd -from numpy import random -import numpy as np -from sklearn.manifold import TSNE - -similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" - -def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20): - ''' - similarities: feather file with a dataframe of similarity scores - learning_rate: parameter controlling how fast the model converges. Too low and you get outliers. Too high and you get a ball. - perplexity: number of neighbors to use. the default of 50 is often good. 
- - ''' - df = pd.read_feather(similarities) - - n = df.shape[0] - mat = np.array(df.drop('_subreddit',1),dtype=np.float64) - mat[range(n),range(n)] = 1 - mat[mat > 1] = 1 - dist = 2*np.arccos(mat)/np.pi - tsne_model = TSNE(2,learning_rate=750,perplexity=50,n_iter=10000,metric='precomputed',early_exaggeration=20,n_jobs=-1) - tsne_fit_model = tsne_model.fit(dist) - - tsne_fit_whole = tsne_fit_model.fit_transform(dist) - - plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], '_subreddit':df['_subreddit']}) - - plot_data.to_feather(output) - -if __name__ == "__main__": - fire.Fire(fit_tsne) diff --git a/clustering/umap_hdbscan_clustering.py b/clustering/umap_hdbscan_clustering.py deleted file mode 100644 index cf4acbb..0000000 --- a/clustering/umap_hdbscan_clustering.py +++ /dev/null @@ -1,230 +0,0 @@ -from clustering_base import clustering_result, clustering_job, twoway_clustering_job -from hdbscan_clustering import hdbscan_clustering_result -import umap -from grid_sweep import twoway_grid_sweep -from dataclasses import dataclass -import hdbscan -from sklearn.neighbors import NearestNeighbors -import plotnine as pn -import numpy as np -from itertools import product, starmap, chain -import pandas as pd -from multiprocessing import cpu_count -import fire - -def test_select_hdbscan_clustering(): - # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI", - # "test_hdbscan_author30k", - # min_cluster_sizes=[2], - # min_samples=[1,2], - # cluster_selection_epsilons=[0,0.05,0.1,0.15], - # cluster_selection_methods=['eom','leaf'], - # lsi_dimensions='all') - inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI" - outpath = "test_umap_hdbscan_lsi" - min_cluster_sizes=[2,3,4] - min_samples=[1,2,3] - cluster_selection_epsilons=[0,0.1,0.3,0.5] - cluster_selection_methods=[1] - lsi_dimensions='all' - n_neighbors = [5,10,15,25,35,70,100] - learning_rate = [0.1,0.5,1,2] - min_dist = [0.5,1,1.5,2] - local_connectivity = [1,2,3,4,5] - - hdbscan_params = {"min_cluster_sizes":min_cluster_sizes, "min_samples":min_samples, "cluster_selection_epsilons":cluster_selection_epsilons, "cluster_selection_methods":cluster_selection_methods} - umap_params = {"n_neighbors":n_neighbors, "learning_rate":learning_rate, "min_dist":min_dist, "local_connectivity":local_connectivity} - gs = umap_hdbscan_grid_sweep(inpath, "all", outpath, hdbscan_params,umap_params) - - # gs.run(20) - # gs.save("test_hdbscan/lsi_sweep.csv") - - - # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom') - # job1.run() - # print(job1.get_info()) - - # df = pd.read_csv("test_hdbscan/selection_data.csv") - # test_select_hdbscan_clustering() - # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather") - # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather") - # c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering) -class umap_hdbscan_grid_sweep(twoway_grid_sweep): - def __init__(self, - inpath, - outpath, - umap_params, - hdbscan_params): - - super().__init__(umap_hdbscan_job, inpath, outpath, self.namer, umap_params, hdbscan_params) - - def namer(self, - min_cluster_size, - min_samples, - cluster_selection_epsilon, - cluster_selection_method, - n_components, - n_neighbors, - learning_rate, - min_dist, - local_connectivity, - densmap - ): 
- return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nc-{n_components}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}_dm-{densmap}" - -@dataclass -class umap_hdbscan_clustering_result(hdbscan_clustering_result): - n_components:int - n_neighbors:int - learning_rate:float - min_dist:float - local_connectivity:int - densmap:bool - -class umap_hdbscan_job(twoway_clustering_job): - def __init__(self, infile, outpath, name, - umap_args = {"n_components":2,"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1,'densmap':False}, - hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'}, - *args, - **kwargs): - super().__init__(infile, - outpath, - name, - call1=umap_hdbscan_job._umap_embedding, - call2=umap_hdbscan_job._hdbscan_clustering, - args1=umap_args, - args2=hdbscan_args, - *args, - **kwargs - ) - - self.n_components = umap_args['n_components'] - self.n_neighbors = umap_args['n_neighbors'] - self.learning_rate = umap_args['learning_rate'] - self.min_dist = umap_args['min_dist'] - self.local_connectivity = umap_args['local_connectivity'] - self.densmap = umap_args['densmap'] - self.min_cluster_size = hdbscan_args['min_cluster_size'] - self.min_samples = hdbscan_args['min_samples'] - self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon'] - self.cluster_selection_method = hdbscan_args['cluster_selection_method'] - - def after_run(self): - coords = self.step1.embedding_ - self.cluster_data['x'] = coords[:,0] - self.cluster_data['y'] = coords[:,1] - super().after_run() - - - def _umap_embedding(mat, **umap_args): - print(f"running umap embedding. umap_args:{umap_args}") - umapmodel = umap.UMAP(metric='precomputed', **umap_args) - umapmodel = umapmodel.fit(mat) - return umapmodel - - def _hdbscan_clustering(mat, umapmodel, **hdbscan_args): - print(f"running hdbascan clustering. hdbscan_args:{hdbscan_args}") - - umap_coords = umapmodel.transform(mat) - - clusterer = hdbscan.HDBSCAN(metric='euclidean', - core_dist_n_jobs=cpu_count(), - **hdbscan_args - ) - - clustering = clusterer.fit(umap_coords) - - return(clustering) - - def get_info(self): - result = super().get_info() - self.result = umap_hdbscan_clustering_result(**result.__dict__, - min_cluster_size=self.min_cluster_size, - min_samples=self.min_samples, - cluster_selection_epsilon=self.cluster_selection_epsilon, - cluster_selection_method=self.cluster_selection_method, - n_components = self.n_components, - n_neighbors = self.n_neighbors, - learning_rate = self.learning_rate, - min_dist = self.min_dist, - local_connectivity=self.local_connectivity, - densmap=self.densmap - ) - return self.result - -def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1], - densmap=[False], - min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']): - """Run umap + hdbscan clustering once or more with different parameters. 
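The umap_hdbscan_job defined above is a two-step pipeline: _umap_embedding fits UMAP on a precomputed distance matrix, and _hdbscan_clustering then clusters the resulting UMAP coordinates with HDBSCAN. A minimal self-contained sketch of that pipeline, assuming a small random distance matrix and arbitrary parameter values:

import hdbscan
import numpy as np
import umap

rng = np.random.default_rng(0)
points = rng.random((50, 4))
# toy symmetric distance matrix standing in for the subreddit distance matrix
dist = np.sqrt(((points[:, None, :] - points[None, :, :]) ** 2).sum(axis=2))

# step 1: embed the precomputed distances with UMAP
embedding = umap.UMAP(metric='precomputed', n_components=2, n_neighbors=15).fit_transform(dist)

# step 2: cluster the embedded coordinates with HDBSCAN
labels = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1,
                         cluster_selection_method='eom').fit_predict(embedding)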
- - Usage: - umap_hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_neighbors= --learning_rate= --min_dist= --local_connectivity= --min_cluster_sizes= --min_samples= --cluster_selection_epsilons= --cluster_selection_methods= - - Keword arguments: - savefile: path to save the metadata and diagnostics - inpath: path to feather data containing a labeled matrix of subreddit similarities. - outpath: path to output fit kmeans clusterings. - n_neighbors: umap parameter takes integers greater than 1 - learning_rate: umap parameter takes positive real values - min_dist: umap parameter takes positive real values - local_connectivity: umap parameter takes positive integers - min_cluster_sizes: one or more integers indicating the minumum cluster size - min_samples: one ore more integers indicating the minimum number of samples used in the algorithm - cluster_selection_epsilon: one or more similarity thresholds for transition from dbscan to hdbscan - cluster_selection_method: "eom" or "leaf" eom gives larger clusters. - """ - - umap_args = {'n_neighbors':list(map(int, n_neighbors)), - 'learning_rate':list(map(float,learning_rate)), - 'min_dist':list(map(float,min_dist)), - 'local_connectivity':list(map(int,local_connectivity)), - 'n_components':list(map(int, n_components)), - 'densmap':list(map(bool,densmap)) - } - - hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), - 'min_samples':list(map(int,min_samples)), - 'cluster_selection_epsilon':list(map(float,cluster_selection_epsilons)), - 'cluster_selection_method':cluster_selection_methods} - - obj = umap_hdbscan_grid_sweep(inpath, - outpath, - umap_args, - hdbscan_args) - obj.run(cores=10) - obj.save(savefile) - - -def KNN_distances_plot(mat,outname,k=2): - nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat) - distances, indices = nbrs.kneighbors(mat) - d2 = distances[:,-1] - df = pd.DataFrame({'dist':d2}) - df = df.sort_values("dist",ascending=False) - df['idx'] = np.arange(0,d2.shape[0]) + 1 - p = pn.qplot(x='idx',y='dist',data=df,geom='line') + pn.scales.scale_y_continuous(minor_breaks = np.arange(0,50)/50, - breaks = np.arange(0,10)/10) - p.save(outname,width=16,height=10) - -def make_KNN_plots(): - similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather" - subreddits, mat = read_similarity_mat(similarities) - mat = sim_to_dist(mat) - - KNN_distances_plot(mat,k=2,outname='terms_knn_dist2.png') - - similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather" - subreddits, mat = read_similarity_mat(similarities) - mat = sim_to_dist(mat) - KNN_distances_plot(mat,k=2,outname='authors_knn_dist2.png') - - similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather" - subreddits, mat = read_similarity_mat(similarities) - mat = sim_to_dist(mat) - KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png') - -if __name__ == "__main__": - fire.Fire(run_umap_hdbscan_grid_sweep) - -# test_select_hdbscan_clustering() - #fire.Fire(select_hdbscan_clustering) diff --git a/clustering/umap_hdbscan_clustering_lsi.py b/clustering/umap_hdbscan_clustering_lsi.py deleted file mode 100644 index 3149939..0000000 --- a/clustering/umap_hdbscan_clustering_lsi.py +++ /dev/null @@ -1,113 +0,0 @@ -from umap_hdbscan_clustering import umap_hdbscan_job, umap_hdbscan_grid_sweep, umap_hdbscan_clustering_result -from lsi_base import twoway_lsi_grid_sweep, lsi_mixin, lsi_result_mixin -from 
grid_sweep import twoway_grid_sweep -import fire -from dataclasses import dataclass - -@dataclass -class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_result_mixin): - pass - -class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin): - def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims): - super().__init__( - infile, - outpath, - name, - umap_args, - hdbscan_args - ) - super().set_lsi_dims(lsi_dims) - - def get_info(self): - partial_result = super().get_info() - self.result = umap_hdbscan_clustering_result_lsi(**partial_result.__dict__, - lsi_dimensions=self.lsi_dims) - return self.result - -class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep): - def __init__(self, - inpath, - lsi_dims, - outpath, - umap_args, - hdbscan_args - ): - - super().__init__(umap_hdbscan_lsi_job, - _umap_hdbscan_lsi_grid_sweep, - inpath, - lsi_dims, - outpath, - umap_args, - hdbscan_args - ) - - - -class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep): - def __init__(self, - inpath, - outpath, - lsi_dim, - umap_args, - hdbscan_args, - ): - - self.lsi_dim = lsi_dim - self.jobtype = umap_hdbscan_lsi_job - super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, lsi_dim) - - - def namer(self, *args, **kwargs): - s = umap_hdbscan_grid_sweep.namer(self, *args, **kwargs) - s += f"_lsi-{self.lsi_dim}" - return s - -def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1], - densmap=[False], - min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all'): - """Run hdbscan clustering once or more with different parameters. - - Usage: - hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes= --min_samples= --cluster_selection_epsilons= --cluster_selection_methods=[eom]> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. - - Keword arguments: - savefile: path to save the metadata and diagnostics - inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities. - outpath: path to output fit clusterings. - min_cluster_sizes: one or more integers indicating the minumum cluster size - min_samples: one ore more integers indicating the minimum number of samples used in the algorithm - cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan - cluster_selection_methods: one or more of "eom" or "leaf" eom gives larger clusters. - lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH. 
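For concreteness, a sketch of driving this entry point directly from Python rather than through the fire command line; every path and parameter value here is a placeholder chosen for illustration:

from umap_hdbscan_clustering_lsi import run_umap_hdbscan_lsi_grid_sweep

run_umap_hdbscan_lsi_grid_sweep(
    savefile='umap_hdbscan_lsi_sweep.csv',    # placeholder metadata/diagnostics file
    inpath='lsi_similarities/',               # placeholder folder of LSI similarity feathers
    outpath='umap_hdbscan_lsi_clusterings/',  # placeholder output folder
    n_neighbors=[15, 25],
    min_dist=[0.1, 0.5],
    min_cluster_sizes=[2, 3],
    cluster_selection_methods=['eom'],
    lsi_dimensions='all',
)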
- """ - - - umap_args = {'n_neighbors':list(map(int, n_neighbors)), - 'learning_rate':list(map(float,learning_rate)), - 'min_dist':list(map(float,min_dist)), - 'local_connectivity':list(map(int,local_connectivity)), - 'n_components':list(map(int, n_components)), - 'densmap':list(map(bool,densmap)) - } - - hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)), - 'min_samples':list(map(int,min_samples)), - 'cluster_selection_epsilon':list(map(float,cluster_selection_epsilons)), - 'cluster_selection_method':cluster_selection_methods} - - obj = umap_hdbscan_lsi_grid_sweep(inpath, - lsi_dimensions, - outpath, - umap_args, - hdbscan_args - ) - - - obj.run(10) - obj.save(savefile) - - -if __name__ == "__main__": - fire.Fire(run_umap_hdbscan_lsi_grid_sweep) diff --git a/clustering/validation.py b/clustering/validation.py new file mode 100644 index 0000000..c56b7b2 --- /dev/null +++ b/clustering/validation.py @@ -0,0 +1,4 @@ +from sklearn import metrics +from sklearn.cluster import AffinityPropagation +from functools import partial +# sillouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying. diff --git a/datasets/Makefile b/datasets/Makefile new file mode 100644 index 0000000..c64c56b --- /dev/null +++ b/datasets/Makefile @@ -0,0 +1,28 @@ +all: ../../data/reddit_comments_by_subreddit.parquet ../../data/reddit_submissions_by_subreddit.parquet + +../../data/reddit_comments_by_subreddit.parquet:../../data/temp/reddit_comments.parquet + ../start_spark_and_run.sh 4 comments_2_parquet_part2.py + +../../data/temp/reddit_comments.parquet: comments_task_list.sh run_comments_jobs.sbatch + mkdir -p comments_jobs + mkdir -p ../../data/temp/ + sbatch --wait --array=1-$(shell cat comments_task_list.sh | wc -l) run_comments_jobs.sbatch 0 + +temp_reddit_comments.parquet: ../../data/temp/reddit_comments.parquet + +comments_task_list.sh: comments_2_parquet_part1.py + srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 bash -c "source ~/.bashrc && python3 comments_2_parquet_part1.py gen_task_list --overwrite=False" + +submissions_task_list.sh: submissions_2_parquet_part1.py + srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 python3 submissions_2_parquet_part1.py gen_task_list + +../../data/reddit_submissions_by_subreddit.parquet:../../data/temp/reddit_submissions.parquet + ../start_spark_and_run.sh 4 submissions_2_parquet_part2.py + +../../data/temp/reddit_submissions.parquet: submissions_task_list.sh run_submissions_jobs.sbatch + mkdir -p submissions_jobs + rm -rf ../../data/temp/reddit_submissions.parquet + mkdir -p ../../data/temp/ + sbatch --wait --array=1-$(shell cat submissions_task_list.sh | wc -l) run_submissions_jobs.sbatch 0 + +temp_reddit_submissions.parquet: ../../data/temp/reddit_submissions.parquet diff --git a/datasets/comments_2_parquet_part1.py b/datasets/comments_2_parquet_part1.py index 6960986..7e06833 100755 --- a/datasets/comments_2_parquet_part1.py +++ b/datasets/comments_2_parquet_part1.py @@ -47,11 +47,11 @@ def parse_comment(comment, names= None): return tuple(row) -# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')]) +# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), 
('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','../../data/spark_tmp')]) def parse_dump(partition): - dumpdir = f"/gscratch/comdata/raw_data/reddit_dumps/comments/{partition}" + dumpdir = f"../../data/reddit_dumps/comments/{partition}" stream = open_input_file(dumpdir) rows = map(parse_comment, stream) @@ -76,11 +76,11 @@ def parse_dump(partition): pa.field('error', pa.string(), nullable=True), ]) - p = Path("/gscratch/comdata/output/temp/reddit_comments.parquet") + p = Path("../../data/temp/reddit_comments.parquet") p.mkdir(exist_ok=True,parents=True) N=10000 - with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet", + with pq.ParquetWriter(f"../../data/temp/reddit_comments.parquet/{partition}.parquet", schema=schema, compression='snappy', flavor='spark') as writer: @@ -96,12 +96,12 @@ def parse_dump(partition): writer.close() -def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/comments", overwrite=True): +def gen_task_list(dumpdir="../../data/raw_data/reddit_dumps/comments", overwrite=True): files = list(find_dumps(dumpdir,base_pattern="RC_20*.*")) with open("comments_task_list.sh",'w') as of: for fpath in files: partition = os.path.split(fpath)[1] - if (not Path(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True): + if (not Path(f"../../data/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True): of.write(f'python3 comments_2_parquet_part1.py parse_dump {partition}\n') diff --git a/datasets/job_script.sh b/datasets/job_script.sh deleted file mode 100755 index ca994d5..0000000 --- a/datasets/job_script.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/bash -source ~/.bashrc -echo $(hostname) -start_spark_cluster.sh -spark-submit --verbose --master spark://$(hostname):43015 submissions_2_parquet_part2.py -stop-all.sh diff --git a/datasets/run_comments_jobs.sbatch b/datasets/run_comments_jobs.sbatch new file mode 100644 index 0000000..ce5f3e4 --- /dev/null +++ b/datasets/run_comments_jobs.sbatch @@ -0,0 +1,24 @@ +#!/bin/bash +## tf reddit comments +#SBATCH --job-name="cdsc_reddit; parse comment dumps" +## Allocation Definition +#SBATCH --account=comdata +#SBATCH --partition=compute-bigmem +## Resources +## Nodes. This should always be 1 for parallel-sql. +#SBATCH --nodes=1 +## Walltime (12 hours) +#SBATCH --time=24:00:00 +## Memory per node +#SBATCH --mem=8G +#SBATCH --cpus-per-task=1 +#SBATCH --ntasks=1 +#SBATCH +#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/datasets +#SBATCH --output=comments_jobs/%A_%a.out +#SBATCH --error=comments_jobs/%A_%a.out +. /opt/ohpc/admin/lmod/lmod/init/profile +source ~/.bashrc +TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1)) +TASK_CALL=$(sed -n ${TASK_NUM}p ./comments_task_list.sh) +${TASK_CALL} diff --git a/datasets/run_submissions_jobs.sbatch b/datasets/run_submissions_jobs.sbatch new file mode 100644 index 0000000..9f63e83 --- /dev/null +++ b/datasets/run_submissions_jobs.sbatch @@ -0,0 +1,23 @@ +#!/bin/bash +## tf reddit comments +#SBATCH --job-name="cdsc_reddit; parse submission dumps" +## Allocation Definition +#SBATCH --account=comdata-ckpt +#SBATCH --partition=ckpt +## Resources +## Nodes. This should always be 1 for parallel-sql. 
+#SBATCH --nodes=1 +## Walltime (12 hours) +#SBATCH --time=24:00:00 +## Memory per node +#SBATCH --mem=8G +#SBATCH --cpus-per-task=1 +#SBATCH --ntasks=1 +#SBATCH +#SBATCH --chdir /gscratch/comdata/users/nathante/cdsc_reddit/datasets +#SBATCH --output=submissions_jobs/%A_%a.out +#SBATCH --error=submissions_jobs/%A_%a.out + +TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1)) +TASK_CALL=$(sed -n ${TASK_NUM}p ./submissions_task_list.sh) +${TASK_CALL} diff --git a/density/Makefile b/density/Makefile index 90eba82..2d06de0 100644 --- a/density/Makefile +++ b/density/Makefile @@ -1,16 +1,7 @@ -all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather +all: ../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather -/gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather - start_spark_and_run.sh 1 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum +../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py ../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather + ../start_spark_and_run.sh 1 overlap_density.py authors --inpath="../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum -/gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather - start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum - -/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet - start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum - -/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather - start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum - -/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather - start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" 
--outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum +../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather: + $(MAKE) -C ../similarities diff --git a/density/job_script.sh b/density/job_script.sh index e411ba7..71cd969 100755 --- a/density/job_script.sh +++ b/density/job_script.sh @@ -1,4 +1,6 @@ #!/usr/bin/bash +source ~/.bashrc +echo $(hostname) start_spark_cluster.sh -singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum -singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh +spark-submit --verbose --master spark://$(hostname):43015 overlap_density.py authors --inpath=../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum +stop-all.sh diff --git a/dumps/remove_duplicate_comments.py b/dumps/remove_duplicate_comments.py new file mode 100644 index 0000000..e639586 --- /dev/null +++ b/dumps/remove_duplicate_comments.py @@ -0,0 +1,34 @@ +from pathlib import Path +from itertools import chain, groupby + +dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/comments") + +zst_files = dumpdir.glob("*.zst") +bz2_files = dumpdir.glob("*.bz2") +xz_files = dumpdir.glob("*.xz") +all_files = sorted(list(chain(zst_files, bz2_files, xz_files))) +groups = groupby(all_files, key = lambda p: p.stem) + +kept_paths = [] +removed_paths = [] + +priority = ['.zst','.xz','.bz2'] + +for stem, files in groups: + keep_file = None + remove_files = [] + for f in files: + if keep_file is None: + keep_file = f + elif priority.index(keep_file.suffix) > priority.index(f.suffix): + remove_files.append(keep_file) + keep_file = f + else: + remove_files.append(f) + kept_paths.append(keep_file) + removed_paths.extend(remove_files) + +(dumpdir / "to_remove").mkdir() + +for f in removed_paths: + f.rename(f.parent / "to_remove" / f.name) diff --git a/dumps/remove_duplicate_submissions.py b/dumps/remove_duplicate_submissions.py new file mode 100644 index 0000000..8e89fe9 --- /dev/null +++ b/dumps/remove_duplicate_submissions.py @@ -0,0 +1,34 @@ +from pathlib import Path +from itertools import chain, groupby + +dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/submissions") + +zst_files = dumpdir.glob("*.zst") +bz2_files = dumpdir.glob("*.bz2") +xz_files = dumpdir.glob("*.xz") +all_files = sorted(list(chain(zst_files, bz2_files, xz_files))) +groups = groupby(all_files, key = lambda p: p.stem) + +kept_paths = [] +removed_paths = [] + +priority = ['.zst','.xz','.bz2'] + +for stem, files in groups: + keep_file = None + remove_files = [] + for f in files: + if keep_file is None: + keep_file = f + elif priority.index(keep_file.suffix) > priority.index(f.suffix): + remove_files.append(keep_file) + keep_file = f + else: + remove_files.append(f) + kept_paths.append(keep_file) + removed_paths.extend(remove_files) + +(dumpdir / "to_remove").mkdir() + +for f in removed_paths: + f.rename(f.parent / "to_remove" / f.name) diff --git a/examples/pyarrow_reading.py b/examples/pyarrow_reading.py deleted file mode 100644 index 59f9fd9..0000000 --- 
a/examples/pyarrow_reading.py +++ /dev/null @@ -1,17 +0,0 @@ -import pyarrow.dataset as ds - -# A pyarrow dataset abstracts reading, writing, or filtering a parquet file. It does not read dataa into memory. -#dataset = ds.dataset(pathlib.Path('/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet/'), format='parquet', partitioning='hive') -dataset = ds.dataset('/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/', format='parquet') - -# let's get all the comments to two subreddits: -subreddits_to_pull = ['seattle','seattlewa'] - -# a table is a low-level structured data format. This line pulls data into memory. Setting metadata_n_threads > 1 gives a little speed boost. -table = dataset.to_table(filter = ds.field('subreddit').isin(subreddits_to_pull), columns=['id','subreddit','CreatedAt','author','ups','downs','score','subreddit_id','stickied','title','url','is_self','selftext']) - -# Since data from just these 2 subreddits fits in memory we can just turn our table into a pandas dataframe. -df = table.to_pandas() - -# We should save this smaller dataset so we don't have to wait 15 min to pull from parquet next time. -df.to_csv("mydataset.csv") diff --git a/examples/pyarrow_streaming.py b/examples/pyarrow_streaming.py deleted file mode 100644 index ebe2219..0000000 --- a/examples/pyarrow_streaming.py +++ /dev/null @@ -1,38 +0,0 @@ -import pyarrow.dataset as ds -from itertools import groupby - -# A pyarrow dataset abstracts reading, writing, or filtering a parquet file. It does not read dataa into memory. - -dataset = ds.dataset('/gscratch/comdata/output/reddit_submissions_by_author.parquet', format='parquet') - -# let's get all the comments to two subreddits: -subreddits_to_pull = ['seattlewa','seattle'] - -# instead of loading the data into a pandas dataframe all at once we can stream it. -scan_tasks = dataset.scan(filter = ds.field('subreddit').isin(subreddits_to_pull), columns=['id','subreddit','CreatedAt','author','ups','downs','score','subreddit_id','stickied','title','url','is_self','selftext']) - -# simple function to execute scantasks and generate rows -def iterate_rows(scan_tasks): - for st in scan_tasks: - for rb in st.execute(): - df = rb.to_pandas() - for t in df.itertuples(): - yield t - -row_iter = iterate_rows(scan_tasks) - -# now we can use python's groupby function to read one author at a time -# note that the same author can appear more than once since the record batches may not be in the correct order. 
-author_submissions = groupby(row_iter, lambda row: row.author) - -count_dict = {} - -for auth, posts in author_submissions: - if auth in count_dict: - count_dict[auth] = count_dict[auth] + 1 - else: - count_dict[auth] = 1 - -# since it's partitioned and sorted by author, we get one group for each author -any([ v != 1 for k,v in count_dict.items()]) - diff --git a/ngrams/#ngrams_helper.py# b/ngrams/#ngrams_helper.py# deleted file mode 100644 index e69de29..0000000 diff --git a/ngrams/Makefile b/ngrams/Makefile new file mode 100644 index 0000000..e9a2770 --- /dev/null +++ b/ngrams/Makefile @@ -0,0 +1,25 @@ +outputdir=../../data/reddit_ngrams/ +inputdir=../../data/reddit_comments_by_subreddit.parquet +authors_tfdir=${outputdir}/comment_authors.parquet +srun=sbatch --wait --verbose run_job.sbatch + +all: ${outputdir}/comment_authors_sorted.parquet/_SUCCESS + +tf_task_list_1: tf_comments.py + ${srun} bash -c "python3 tf_comments.py gen_task_list --mwe_pass='first' --outputdir=${outputdir} --tf_task_list=$@ --inputdir=${inputdir}" + +${outputdir}/comment_terms.parquet:tf_task_list_1 + mkdir -p sbatch_log + sbatch --wait --verbose --array=1-$(shell cat $< | wc -l) run_array.sbatch 0 $< + +${outputdir}/comment_authors.parquet:${outputdir}/comment_terms.parquet + - + +${outputdir}/comment_authors_sorted.parquet:${outputdir}/comment_authors.parquet sort_tf_comments.py + ../start_spark_and_run.sh 3 sort_tf_comments.py --inparquet=$< --outparquet=$@ --colname=author + +${outputdir}/comment_authors_sorted.parquet/_SUCCESS:${outputdir}/comment_authors_sorted.parquet + + +${inputdir}: + $(MAKE) -C ../datasets diff --git a/ngrams/run_array.sbatch b/ngrams/run_array.sbatch new file mode 100755 index 0000000..12bce17 --- /dev/null +++ b/ngrams/run_array.sbatch @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name=reddit_comment_term_frequencies +#SBATCH --account=comdata +#SBATCH --partition=compute-bigmem +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=1 +#SBATCH --mem-per-cpu=9g +#SBATCH --ntasks=1 +#SBATCH --export=ALL +#SBATCH --time=48:00:00 +#SBATCH --chdir=/gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/ngrams +#SBATCH --error="sbatch_log/%A_%a.out" +#SBATCH --output="sbatch_log/%A_%a.out" + +TASK_NUM=$(($SLURM_ARRAY_TASK_ID + $1)) +TASK_CALL=$(sed -n ${TASK_NUM}p $2) +${TASK_CALL} + diff --git a/ngrams/run_job.sbatch b/ngrams/run_job.sbatch new file mode 100644 index 0000000..4f347e3 --- /dev/null +++ b/ngrams/run_job.sbatch @@ -0,0 +1,18 @@ +#!/bin/bash +#SBATCH --job-name="simulate measurement error models" +## Allocation Definition +#SBATCH --account=comdata +#SBATCH --partition=compute-bigmem +## Resources +#SBATCH --nodes=1 +## Walltime (4 hours) +#SBATCH --time=4:00:00 +## Memory per node +#SBATCH --mem=4G +#SBATCH --cpus-per-task=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/ngrams/ +#SBATCH --output=sbatch_log/%A_%a.out +#SBATCH --error=sbatch_log/%A_%a.err +echo "$@" +"$@" diff --git a/ngrams/tf_comments.py b/ngrams/tf_comments.py index f472eeb..604421c 100755 --- a/ngrams/tf_comments.py +++ b/ngrams/tf_comments.py @@ -3,6 +3,7 @@ import pandas as pd import pyarrow as pa import pyarrow.dataset as ds import pyarrow.parquet as pq +import pyarrow.compute as pc from itertools import groupby, islice, chain import fire from collections import Counter @@ -15,11 +16,12 @@ import string from random import random from redditcleaner import clean from pathlib import Path +from 
datetime import datetime # compute term frequencies for comments in each subreddit by week -def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', input_dir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", mwe_pass = 'first', excluded_users=None): +def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', inputdir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", mwe_pass = 'first', excluded_users=None): - dataset = ds.dataset(Path(input_dir)/partition, format='parquet') + dataset = ds.dataset(Path(inputdir)/partition, format='parquet') outputdir = Path(outputdir) samppath = outputdir / "reddit_comment_ngrams_10p_sample" @@ -37,7 +39,8 @@ def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', if mwe_pass == 'first': if ngram_path.exists(): ngram_path.unlink() - + + dataset = dataset.filter(pc.field("CreatedAt") <= pa.scalar(datetime(2020,4,13))) batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author']) @@ -160,9 +163,9 @@ def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', outchunksize = 10000 - termtf_outputdir = (outputdir / "comment_terms") + termtf_outputdir = (outputdir / "comment_terms.parquet") termtf_outputdir.mkdir(parents=True, exist_ok=True) - authortf_outputdir = (outputdir / "comment_authors") + authortf_outputdir = (outputdir / "comment_authors.parquet") authortf_outputdir.mkdir(parents=True, exist_ok=True) termtf_path = termtf_outputdir / partition authortf_path = authortf_outputdir / partition @@ -196,12 +199,12 @@ def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', author_writer.close() -def gen_task_list(mwe_pass='first', outputdir='/gscratch/comdata/output/reddit_ngrams/', tf_task_list='tf_task_list', excluded_users_file=None): - files = os.listdir("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/") +def gen_task_list(mwe_pass='first', inputdir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", outputdir='/gscratch/comdata/output/reddit_ngrams/', tf_task_list='tf_task_list', excluded_users_file=None): + files = os.listdir(inputdir) with open(tf_task_list,'w') as outfile: for f in files: if f.endswith(".parquet"): - outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} --outputdir {outputdir} --excluded_users {excluded_users_file} {f}\n") + outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} --inputdir {inputdir} --outputdir {outputdir} --excluded_users {excluded_users_file} {f}\n") if __name__ == "__main__": fire.Fire({"gen_task_list":gen_task_list, diff --git a/ngrams/top_comment_phrases.py b/ngrams/top_comment_phrases.py deleted file mode 100755 index ff1c4f0..0000000 --- a/ngrams/top_comment_phrases.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python3 -from pyspark.sql import functions as f -from pyspark.sql import Window -from pyspark.sql import SparkSession -import numpy as np -import fire -from pathlib import Path - - -def main(ngram_dir="/gscratch/comdata/output/reddit_ngrams"): - spark = SparkSession.builder.getOrCreate() - ngram_dir = Path(ngram_dir) - ngram_sample = ngram_dir / "reddit_comment_ngrams_10p_sample" - df = spark.read.text(str(ngram_sample)) - - df = df.withColumnRenamed("value","phrase") - - # count phrase occurrances - phrases = df.groupby('phrase').count() - phrases = phrases.withColumnRenamed('count','phraseCount') - phrases = phrases.filter(phrases.phraseCount > 10) - - # count overall - N = 
phrases.select(f.sum(phrases.phraseCount).alias("phraseCount")).collect()[0].phraseCount - - print(f'analyzing PMI on a sample of {N} phrases') - logN = np.log(N) - phrases = phrases.withColumn("phraseLogProb", f.log(f.col("phraseCount")) - logN) - - # count term occurrances - phrases = phrases.withColumn('terms',f.split(f.col('phrase'),' ')) - terms = phrases.select(['phrase','phraseCount','phraseLogProb',f.explode(phrases.terms).alias('term')]) - - win = Window.partitionBy('term') - terms = terms.withColumn('termCount',f.sum('phraseCount').over(win)) - terms = terms.withColumnRenamed('count','termCount') - terms = terms.withColumn('termLogProb',f.log(f.col('termCount')) - logN) - - terms = terms.groupBy(terms.phrase, terms.phraseLogProb, terms.phraseCount).sum('termLogProb') - terms = terms.withColumnRenamed('sum(termLogProb)','termsLogProb') - terms = terms.withColumn("phrasePWMI", f.col('phraseLogProb') - f.col('termsLogProb')) - - # join phrases to term counts - - - df = terms.select(['phrase','phraseCount','phraseLogProb','phrasePWMI']) - - df = df.sort(['phrasePWMI'],descending=True) - df = df.sortWithinPartitions(['phrasePWMI'],descending=True) - - pwmi_dir = ngram_dir / "reddit_comment_ngrams_pwmi.parquet/" - df.write.parquet(str(pwmi_dir), mode='overwrite', compression='snappy') - - df = spark.read.parquet(str(pwmi_dir)) - - df.write.csv(str(ngram_dir / "reddit_comment_ngrams_pwmi.csv/"),mode='overwrite',compression='none') - - df = spark.read.parquet(str(pwmi_dir)) - df = df.select('phrase','phraseCount','phraseLogProb','phrasePWMI') - - # choosing phrases occurring at least 3500 times in the 10% sample (35000 times) and then with a PWMI of at least 3 yeids about 65000 expressions. - # - df = df.filter(f.col('phraseCount') > 3500).filter(f.col("phrasePWMI")>3) - df = df.toPandas() - df.to_feather(ngram_dir / "multiword_expressions.feather") - df.to_csv(ngram_dir / "multiword_expressions.csv") - -if __name__ == '__main__': - fire.Fire(main) diff --git a/run_array.sbatch b/run_array.sbatch new file mode 100644 index 0000000..2228c75 --- /dev/null +++ b/run_array.sbatch @@ -0,0 +1,22 @@ +#!/bin/bash +## tf reddit comments +#SBATCH --job-name="wikia ecology; fit var models" +## Allocation Definition +#SBATCH --account=comdata-ckpt +#SBATCH --partition=ckpt +## Resources +## Nodes. This should always be 1 for parallel-sql. 
+#SBATCH --nodes=1 +## Walltime (12 hours) +#SBATCH --time=24:00:00 +## Memory per node +#SBATCH --mem=8G +#SBATCH --cpus-per-task=1 +#SBATCH --ntasks=1 +#SBATCH +#SBATCH --chdir /gscratch/comdata/users/nathante/wikia_ecology +#SBATCH --output=var_jobs/%A_%a.out +#SBATCH --error=var_jobs/%A_%a.out +TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1)) +TASK_CALL=$(sed -n ${TASK_NUM}p ./var_jobs.sh) +${TASK_CALL} diff --git a/similarities/Makefile b/similarities/Makefile index 963192d..3d508d9 100644 --- a/similarities/Makefile +++ b/similarities/Makefile @@ -1,138 +1,28 @@ +srun=srun -p compute-bigmem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40 +srun_huge=srun -p compute-hugemem -A comdata --mem=724g --time=200:00:00 -c 40 -#all: /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_130k.parquet -# srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh -# srun_singularity_huge=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity_huge.sh -srun=srun -p compute-bigmem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40 -srun_huge=srun -p compute-hugemem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40 -similarity_data=/gscratch/scrubbed/comdata/reddit_similarity +similarity_data=../../data/reddit_similarity tfidf_data=${similarity_data}/tfidf -tfidf_weekly_data=${similarity_data}/tfidf_weekly -similarity_weekly_data=${similarity_data}/weekly -lsi_components=[10,50,100,200,300,400,500,600,700,850,1000,1500] +lsi_components=[10,50,100,200,300,400,500,600,700,850] -lsi_similarities: ${similarity_data}/subreddit_comment_terms_10k_LSI ${similarity_data}/subreddit_comment_authors-tf_10k_LSI ${similarity_data}/subreddit_comment_authors_10k_LSI ${similarity_data}/subreddit_comment_terms_30k_LSI ${similarity_data}/subreddit_comment_authors-tf_30k_LSI ${similarity_data}/subreddit_comment_authors_30k_LSI +lsi_similarities: ${similarity_data}/subreddit_comment_authors-tf_10k_LSI +all: ${similarity_data}/subreddit_comment_authors-tf_10k.feather -all: ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather +${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py ${similarity_data}/subreddits_by_num_comments_nonsfw.csv + ${srun_huge} /bin/bash -c "source ~/.bashrc; python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$<" -#all: ${tfidf_data}/comment_terms_100k.parquet ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_100k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather 
${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather ${similarity_data}/subreddit_comment_terms_100k.feather ${similarity_data}/subreddit_comment_authors_100k.feather ${similarity_data}/subreddit_comment_authors-tf_100k.feather ${similarity_weekly_data}/comment_terms.parquet +${similarity_data}/subreddits_by_num_comments_nonsfw.csv: ../../data/reddit_submissions_by_subreddit.parquet ../../data/reddit_comments_by_subreddit.parquet + ../start_spark_and_run.sh 3 top_subreddits_by_comments.py -#${tfidf_weekly_data}/comment_terms_100k.parquet ${tfidf_weekly_data}/comment_authors_100k.parquet ${tfidf_weekly_data}/comment_terms_30k.parquet ${tfidf_weekly_data}/comment_authors_30k.parquet ${similarity_weekly_data}/comment_terms_100k.parquet ${similarity_weekly_data}/comment_authors_100k.parquet ${similarity_weekly_data}/comment_terms_30k.parquet ${similarity_weekly_data}/comment_authors_30k.parquet +${tfidf_data}/comment_authors_100k.parquet: ../../data/reddit_ngrams/comment_authors_sorted.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv + ../start_spark_and_run.sh 3 tfidf.py authors --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_authors_100k.parquet -# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_weekly_130k.parquet +../../data/reddit_ngrams/comment_authors_sorted.parquet: + $(MAKE) -C ../ngrams -# all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet +../../data/reddit_submissions_by_subreddit.parquet: + $(MAKE) -C ../datasets -${similarity_weekly_data}/comment_terms.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_terms.parquet - ${srun} python3 weekly_cosine_similarities.py terms --topN=10000 --outfile=${similarity_weekly_data}/comment_terms.parquet - -${similarity_data}/subreddit_comment_terms_10k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py - ${srun} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k.feather --topN=10000 - -${similarity_data}/subreddit_comment_terms_10k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py - ${srun_huge} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=200 - -${similarity_data}/subreddit_comment_terms_30k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py - ${srun_huge} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=200 --inpath=$< - 
-${similarity_data}/subreddit_comment_terms_30k.feather: ${tfidf_data}/comment_terms_30k.parquet similarities_helper.py - ${srun_huge} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k.feather --topN=30000 --inpath=$< - -${similarity_data}/subreddit_comment_authors_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py - ${srun_huge} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k.feather --topN=30000 --inpath=$< - -${similarity_data}/subreddit_comment_authors_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py - ${srun_huge} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k.feather --topN=10000 --inpath=$< - -${similarity_data}/subreddit_comment_authors_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_huge} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$< - -${similarity_data}/subreddit_comment_authors_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_huge} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=10 --inpath=$< - -${similarity_data}/subreddit_comment_authors-tf_30k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k.feather --topN=30000 --inpath=$< - -${similarity_data}/subreddit_comment_authors-tf_10k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k.feather --topN=10000 - -${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_huge} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$< - -${similarity_data}/subreddit_comment_authors-tf_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun_huge} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=10 --inpath=$< - -${similarity_data}/subreddit_comment_terms_100k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py - ${srun} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_100k.feather --topN=100000 - -${similarity_data}/subreddit_comment_authors_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_100k.feather --topN=100000 - -${similarity_data}/subreddit_comment_authors-tf_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py - ${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_100k.feather --topN=100000 - -${similarity_data}/subreddits_by_num_comments_nonsfw.csv: - start_spark_and_run.sh 3 top_subreddits_by_comments.py - -${tfidf_data}/comment_terms_100k.parquet: 
/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv -# mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 3 tfidf.py terms --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_terms_100k.parquet - -${tfidf_data}/comment_terms_30k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv -# mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 3 tfidf.py terms --topN=30000 --inpath=$< --outpath=${tfidf_data}/comment_terms_30k.feather - -${tfidf_data}/comment_terms_10k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv -# mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 3 tfidf.py terms --topN=10000 --inpath=$< --outpath=${tfidf_data}/comment_terms_10k.feather - -${tfidf_data}/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv -# mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 3 tfidf.py authors --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_authors_100k.parquet - -${tfidf_data}/comment_authors_10k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv -# mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 3 tfidf.py authors --topN=10000 --inpath=$< --outpath=${tfidf_data}/comment_authors_10k.parquet - -${tfidf_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv -# mkdir -p ${tfidf_data}/ - start_spark_and_run.sh 3 tfidf.py authors --topN=30000 --inpath=$< --outpath=${tfidf_data}/comment_authors_30k.parquet - -${tfidf_data}/tfidf_weekly/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv - start_spark_and_run.sh 3 tfidf.py terms_weekly --topN=100000 --outpath=${similarity_data}/tfidf_weekly/comment_authors_100k.parquet - -${tfidf_data}/tfidf_weekly/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_ppnum_comments.csv - start_spark_and_run.sh 3 tfidf.py authors_weekly --topN=100000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_100k.parquet - -${tfidf_weekly_data}/comment_terms_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv - start_spark_and_run.sh 2 tfidf.py terms_weekly --topN=30000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet - -${tfidf_weekly_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv - start_spark_and_run.sh 3 tfidf.py authors_weekly --topN=30000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet - -${similarity_weekly_data}/comment_terms_100k.parquet: weekly_cosine_similarities.py similarities_helper.py ${tfidf_weekly_data}/comment_terms_100k.parquet - ${srun} python3 weekly_cosine_similarities.py terms --topN=100000 --outfile=${similarity_weekly_data}/comment_terms_100k.parquet - -${similarity_weekly_data}/comment_authors_100k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet 
${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_authors_100k.parquet - ${srun} python3 weekly_cosine_similarities.py authors --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet - -${similarity_weekly_data}/comment_terms_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_terms_30k.parquet - ${srun} python3 weekly_cosine_similarities.py terms --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet - -,${similarity_weekly_data}/comment_authors_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_authors_30k.parquet - ${srun} python3 weekly_cosine_similarities.py authors --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet - -# ${tfidf_weekly_data}/comment_authors_130k.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv -# start_spark_and_run.sh 1 tfidf.py authors_weekly --topN=130000 - -# /gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet -# start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather - -# /gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet -# start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather - -# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py ${tfidf_weekly_data}/comment_authors.parquet -# start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet - -# /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet -# start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet +../../data/reddit_comments_by_subreddit.parquet: + $(MAKE) -C ../datasets diff --git a/similarities/__pycache__/similarities_helper.cpython-37.pyc b/similarities/__pycache__/similarities_helper.cpython-37.pyc deleted file mode 100644 index eb607f33b4ceca27baf1f8cfd31b66ae8550b102..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10402 zcmc&)$#W#xSGfZSyGj1 zo0q9kOHN^e9&EET7;L}`h7!wxF#B#eb0hcz-iZiK0}-4!`NZ${GF7Egt37idf>Pv{ zFW;VDe!p*fdaF>#8u-2X#s6v=Zy3gZ@Zsd=q3|hO!7bA;gdxnnF)$mZ$+CsA)v$Eg zM%ivSy6p5*gLEUU%c;IQ@EV>jr~8>fwvp9kx1Vd|Fo)aE4+@Q^yegK(3QA9k*Tkw=L+L58E-rpxG-~1r@g(j~i>E{l z_e*95>AvRHZ 
zAMNOa2c1IKNnxEN)j2|)P>7G~zst^xB(x4%fVG@y2A{&O{v-sk@@~Zc1oj5Lh!NZX w$;14d{4t^{>vJcT@>bb#tn5?Swe00=CY$xLUdD61E8er-E5%p67mK<71!Ju1 1: - yield (sims, n_dims) - else: - return sims + yield (sims, n_dims) + def column_similarities(mat): @@ -327,11 +316,11 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig else: # tf_fam = tf_weight.Norm05 df = df.withColumn("tf_idf", (0.5 + 0.5 * df.relative_tf) * df.idf) - df = df.repartition(400,'subreddit','week') + df = df.repartition('week') dfwriter = df.write.partitionBy("week") return dfwriter -def _calc_tfidf(df, term_colname, tf_family): +def _calc_tfidf(df, term_colname, tf_family, min_df=None, max_df=None): term = term_colname term_id = term + '_id' @@ -349,7 +338,13 @@ def _calc_tfidf(df, term_colname, tf_family): idf = idf.withColumn('idf',f.log(N_docs/(1+f.col('count')))+1) # collect the dictionary to make a pydict of terms to indexes - terms = idf.select(term).distinct() # terms are distinct + terms = idf + if min_df is not None: + terms = terms.filter(f.col('count')>=min_df) + if max_df is not None: + terms = terms.filter(f.col('count')<=max_df) + + terms = terms.select(term).distinct() # terms are distinct terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct # make subreddit ids @@ -359,12 +354,12 @@ def _calc_tfidf(df, term_colname, tf_family): df = df.join(subreddits,on='subreddit') # map terms to indexes in the tfs and the idfs - df = df.join(terms,on=term) # subreddit-term-id is unique + df = df.join(terms,on=term,how='inner') # subreddit-term-id is unique - idf = idf.join(terms,on=term) + idf = idf.join(terms,on=term,how='inner') # join on subreddit/term to create tf/dfs indexed by term - df = df.join(idf, on=[term_id, term]) + df = df.join(idf, on=[term_id, term],how='inner') # agg terms by subreddit to make sparse tf/df vectors if tf_family == tf_weight.MaxTF: @@ -375,19 +370,19 @@ def _calc_tfidf(df, term_colname, tf_family): return df -def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05): +def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05, min_df=None, max_df=None): term = term_colname term_id = term + '_id' # aggregate counts by week. 
now subreddit-term is distinct df = df.filter(df.subreddit.isin(include_subs)) df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf')) - df = _calc_tfidf(df, term_colname, tf_family) + df = _calc_tfidf(df, term_colname, tf_family, min_df, max_df) df = df.repartition('subreddit') dfwriter = df.write return dfwriter -def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"): +def select_topN_subreddits(topN, path="../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"): rankdf = pd.read_csv(path) included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values) return included_subreddits diff --git a/similarities/top_subreddits_by_comments.py b/similarities/top_subreddits_by_comments.py index 9a4d7d3..74ffb8d 100644 --- a/similarities/top_subreddits_by_comments.py +++ b/similarities/top_subreddits_by_comments.py @@ -1,16 +1,20 @@ from pyspark.sql import functions as f from pyspark.sql import SparkSession from pyspark.sql import Window +from datetime import datetime +from pathlib import Path spark = SparkSession.builder.getOrCreate() conf = spark.sparkContext.getConf() -submissions = spark.read.parquet("/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet") +submissions = spark.read.parquet("../../data/reddit_submissions_by_subreddit.parquet") + +submissions = submissions.filter(f.col("CreatedAt") <= datetime(2020,4,13)) prop_nsfw = submissions.select(['subreddit','over_18']).groupby('subreddit').agg(f.mean(f.col('over_18').astype('double')).alias('prop_nsfw')) -df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet") - +df = spark.read.parquet("../../data/reddit_comments_by_subreddit.parquet") +df = df.filter(f.col("CreatedAt") <= datetime(2020,4,13)) # remove /u/ pages df = df.filter(~df.subreddit.like("u_%")) @@ -26,4 +30,6 @@ df = df.toPandas() df = df.sort_values("n_comments") -df.to_csv('/gscratch/scrubbed/comdata/reddit_similarity/subreddits_by_num_comments_nonsfw.csv', index=False) +outpath = Path("../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv") +outpath.parent.mkdir(exist_ok=True, parents=True) +df.to_csv(str(outpath), index=False) diff --git a/similarities/wang_similarity.py b/similarities/wang_similarity.py deleted file mode 100644 index 452e07a..0000000 --- a/similarities/wang_similarity.py +++ /dev/null @@ -1,18 +0,0 @@ -from similarities_helper import similarities -import numpy as np -import fire - -def wang_similarity(mat): - non_zeros = (mat != 0).astype(np.float32) - intersection = non_zeros.T @ non_zeros - return intersection - - -infile="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet"; outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather"; min_df=1; included_subreddits=None; topN=10000; exclude_phrases=False; from_date=None; to_date=None - -def wang_overlaps(infile, outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather", min_df=1, max_df=None, included_subreddits=None, topN=10000, exclude_phrases=False, from_date=None, to_date=None): - - return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date) - -if __name__ == "__main__": - fire.Fire(wang_overlaps) diff --git a/similarities/weekly_cosine_similarities.py 
b/similarities/weekly_cosine_similarities.py deleted file mode 100755 index 45327c7..0000000 --- a/similarities/weekly_cosine_similarities.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python3 -from pyspark.sql import functions as f -from pyspark.sql import SparkSession -from pyspark.sql import Window -import numpy as np -import pyarrow -import pyarrow.dataset as ds -import pandas as pd -import fire -from itertools import islice, chain -from pathlib import Path -from similarities_helper import pull_tfidf, column_similarities, write_weekly_similarities, lsi_column_similarities -from scipy.sparse import csr_matrix -from multiprocessing import Pool, cpu_count -from functools import partial -import pickle - -# tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity_weekly/comment_authors_tfidf.parquet" -# #tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data//comment_authors_compex.parquet" -# min_df=2 -# included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt" -# max_df = None -# topN=100 -# term_colname='author' -# # outfile = '/gscratch/comdata/output/reddit_similarity/weekly/comment_authors_test.parquet' -# # included_subreddits=None -outfile="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity_weekly/comment_authors.parquet"; infile="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_authors_tfidf.parquet"; included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt"; lsi_model="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI/2000_authors_LSIMOD.pkl"; n_components=1500; algorithm="randomized"; term_colname='author'; tfidf_path=infile; random_state=1968; - -# static_tfidf = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet" -# dftest = spark.read.parquet(static_tfidf) - -def _week_similarities(week, simfunc, tfidf_path, term_colname, included_subreddits, outdir:Path, subreddit_names, nterms, topN=None, min_df=None, max_df=None): - term = term_colname - term_id = term + '_id' - term_id_new = term + '_id_new' - print(f"loading matrix: {week}") - - entries = pull_tfidf(infile = tfidf_path, - term_colname=term_colname, - included_subreddits=included_subreddits, - topN=topN, - week=week.isoformat(), - rescale_idf=False) - - tfidf_colname='tf_idf' - # if the max subreddit id we found is less than the number of subreddit names then we have to fill in 0s - mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)),shape=(nterms,subreddit_names.shape[0])) - print('computing similarities') - print(simfunc) - sims = simfunc(mat) - del mat - sims = next(sims)[0] - sims = pd.DataFrame(sims) - sims = sims.rename({i: sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1) - sims['_subreddit'] = subreddit_names.subreddit.values - outfile = str(Path(outdir) / str(week)) - write_weekly_similarities(outfile, sims, week, subreddit_names) - -def pull_weeks(batch): - return set(batch.to_pandas()['week']) - -# This requires a prefit LSI model, since we shouldn't fit different LSI models for every week. 
-def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kwargs): - print(args) - print(kwargs) - term_colname= kwargs.get('term_colname') - # lsi_model = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI/1000_author_LSIMOD.pkl" - - lsi_model = pickle.load(open(lsi_model,'rb')) - #simfunc = partial(lsi_column_similarities,n_components=n_components,random_state=random_state,algorithm='randomized',lsi_model=lsi_model) - simfunc = partial(lsi_column_similarities,n_components=n_components,random_state=kwargs.get('random_state'),lsi_model=lsi_model) - - return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs) - -#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet') -def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=None,max_df=None): - print(outfile) - # do this step in parallel if we have the memory for it. - # should be doable with pool.map - - spark = SparkSession.builder.getOrCreate() - df = spark.read.parquet(tfidf_path) - - # load subreddits + topN - - subreddit_names = df.select(['subreddit','subreddit_id']).distinct().toPandas() - subreddit_names = subreddit_names.sort_values("subreddit_id") - nterms = df.select(f.max(f.col(term_colname + "_id")).alias('max')).collect()[0].max - weeks = df.select(f.col("week")).distinct().toPandas().week.values - spark.stop() - - print(f"computing weekly similarities") - week_similarities_helper = partial(_week_similarities,simfunc=simfunc, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=None, subreddit_names=subreddit_names,nterms=nterms) - - for week in weeks: - week_similarities_helper(week) - # pool = Pool(cpu_count()) - - # list(pool.imap(week_similarities_helper, weeks)) - # pool.close() - # with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine? 
- - -def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500): - return cosine_similarities_weekly(infile, - outfile, - 'author', - max_df, - included_subreddits, - topN, - min_df=2 -) - -def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None): - return cosine_similarities_weekly(infile, - outfile, - 'term', - min_df, - max_df, - included_subreddits, - topN) - - -def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None): - return cosine_similarities_weekly_lsi(infile, - outfile, - 'author', - included_subreddits=included_subreddits, - n_components=n_components, - lsi_model=lsi_model - ) - - -def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None): - return cosine_similarities_weekly_lsi(infile, - outfile, - 'term', - included_subreddits=included_subreddits, - n_components=n_components, - lsi_model=lsi_model, - ) - -if __name__ == "__main__": - fire.Fire({'authors':author_cosine_similarities_weekly, - 'terms':term_cosine_similarities_weekly, - 'authors-lsi':author_cosine_similarities_weekly_lsi, - 'terms-lsi':term_cosine_similarities_weekly_lsi - }) - diff --git a/start_spark_and_run.sh b/start_spark_and_run.sh new file mode 100755 index 0000000..e1dcf6e --- /dev/null +++ b/start_spark_and_run.sh @@ -0,0 +1,21 @@ + +#!/usr/bin/env bash + +# Script to start a spark cluster and run a script on klone +source $SPARK_CONF_DIR/spark-env.sh +echo "#!/usr/bin/bash" > job_script.sh +echo "source ~/.bashrc" >> job_script.sh +echo "export PYSPARK_PYTHON=python3" >> job.script.sh +echo "export JAVA_HOME=/gscratch/comdata/local/open-jdk" >> job.script.sh +echo "export SPARK_CONF_DIR=/gscratch/comdata/local/spark_config" >> job.script.sh +echo "echo \$(hostname)" >> job_script.sh +echo "source $SPARK_CONF_DIR/spark-env.sh" >> job.script.sh +echo "start_spark_cluster.sh" >> job_script.sh +echo "spark-submit --verbose --master spark://\$(hostname):$SPARK_MASTER_PORT $2 ${@:3}" >> job_script.sh +echo "stop-all.sh" >> job_script.sh +#echo "singularity instance stop --all" >> job_script.sh +chmod +x job_script.sh + +let "cpus = $1 * 40" +salloc -p compute-bigmem -A comdata --nodes=$1 --time=48:00:00 -c 40 --mem=362G --exclusive srun -n1 job_script.sh + diff --git a/start_spark_cluster.sh b/start_spark_cluster.sh new file mode 100755 index 0000000..c6c0ea4 --- /dev/null +++ b/start_spark_cluster.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +nodes="$(scontrol show hostnames)" + +export SPARK_MASTER_HOST=$(hostname) +echo $SPARK_MASTER_HOST +# singularity instance stop spark-boss +# rm -r $HOME/.singularity/instances/sing/$(hostname)/nathante/spark-boss + +# for node in $nodes +# dol +# echo $node +# ssh $node "singularity instance stop --all -F" +# done + +# singularity instance start /gscratch/comdata/users/nathante/cdsc_base.sif spark-boss +#apptainer exec /gscratch/comdata/users/nathante/containers/nathante.sif +start-master.sh +for node in $nodes +do + # if [ "$node" != "$SPARK_BOSS" ] + # then + echo $node + ssh -t $node start_spark_worker.sh 
$SPARK_MASTER_HOST + # fi +done + diff --git a/start_spark_worker.sh b/start_spark_worker.sh new file mode 100755 index 0000000..a343a31 --- /dev/null +++ b/start_spark_worker.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# runs on worker node +# instance_name=spark-worker-$(hostname) +# echo $hostname +# instance_url="instance://$instance_name" +# singularity instance list +# singularity instance stop -F "$instance_name" +# singularity instance list +# sleep 5 +# ls $HOME/.singularity/instances/sing/$(hostname)/nathante/$instance_name +# rm -r $HOME/.singularity/instances/sing/$(hostname)/nathante/$instance_name +# singularity instance start /gscratch/comdata/users/nathante/cdsc_base.sif $instance_name +source /gscratch/comdata/env/cdsc_klone_bashrc +source $SPARK_CONF_DIR/spark-env.sh +echo $(which python3) +echo $PYSPARK_PYTHON +echo "start-worker.sh spark://$1:$SPARK_MASTER_PORT" +start-worker.sh spark://$1:$SPARK_MASTER_PORT diff --git a/timeseries/__init__.py b/timeseries/__init__.py deleted file mode 100644 index c023c66..0000000 --- a/timeseries/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .choose_clusters import load_clusters, load_densities -from .cluster_timeseries import build_cluster_timeseries diff --git a/timeseries/choose_clusters.py b/timeseries/choose_clusters.py deleted file mode 100644 index c801379..0000000 --- a/timeseries/choose_clusters.py +++ /dev/null @@ -1,96 +0,0 @@ -from pyarrow import dataset as ds -import numpy as np -import pandas as pd -import plotnine as pn -random = np.random.RandomState(1968) - -def load_densities(term_density_file="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather", - author_density_file="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather"): - - term_density = pd.read_feather(term_density_file) - author_density = pd.read_feather(author_density_file) - - term_density.rename({'overlap_density':'term_density','index':'subreddit'},axis='columns',inplace=True) - author_density.rename({'overlap_density':'author_density','index':'subreddit'},axis='columns',inplace=True) - - density = term_density.merge(author_density,on='subreddit',how='inner') - - return density - -def load_clusters(term_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather", - author_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather"): - term_clusters = pd.read_feather(term_clusters_file) - author_clusters = pd.read_feather(author_clusters_file) - - # rename, join and return - term_clusters.rename({'cluster':'term_cluster'},axis='columns',inplace=True) - author_clusters.rename({'cluster':'author_cluster'},axis='columns',inplace=True) - - clusters = term_clusters.merge(author_clusters,on='subreddit',how='inner') - - return clusters - -if __name__ == '__main__': - - df = load_densities() - cl = load_clusters() - - df['td_rank'] = df.term_density.rank() - df['ad_rank'] = df.author_density.rank() - - df['td_percentile'] = df.td_rank / df.shape[0] - df['ad_percentile'] = df.ad_rank / df.shape[0] - - df = df.merge(cl, on='subreddit',how='inner') - - term_cluster_density = df.groupby('term_cluster').agg({'td_rank':['mean','min','max'], - 'ad_rank':['mean','min','max'], - 'td_percentile':['mean','min','max'], - 'ad_percentile':['mean','min','max'], - 'subreddit':['count']}) - - - author_cluster_density = df.groupby('author_cluster').agg({'td_rank':['mean','min','max'], - 'ad_rank':['mean','min','max'], - 'td_percentile':['mean','min','max'], - 
'ad_percentile':['mean','min','max'], - 'subreddit':['count']}) - - # which clusters have the most term_density? - term_cluster_density.iloc[term_cluster_density.td_rank['mean'].sort_values().index] - - # which clusters have the most author_density? - term_cluster_density.iloc[term_cluster_density.ad_rank['mean'].sort_values(ascending=False).index].loc[term_cluster_density.subreddit['count'] >= 5][0:20] - - high_density_term_clusters = term_cluster_density.loc[(term_cluster_density.td_percentile['mean'] > 0.75) & (term_cluster_density.subreddit['count'] > 5)] - - # let's just use term density instead of author density for now. We can do a second batch with author density next. - chosen_clusters = high_density_term_clusters.sample(3,random_state=random) - - cluster_info = df.loc[df.term_cluster.isin(chosen_clusters.index.values)] - - chosen_subreddits = cluster_info.subreddit.values - - dataset = ds.dataset("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",format='parquet') - comments = dataset.to_table(filter=ds.field("subreddit").isin(chosen_subreddits),columns=['id','subreddit','author','CreatedAt']) - - comments = comments.to_pandas() - - comments['week'] = comments.CreatedAt.dt.date - pd.to_timedelta(comments['CreatedAt'].dt.dayofweek, unit='d') - - author_timeseries = comments.loc[:,['subreddit','author','week']].drop_duplicates().groupby(['subreddit','week']).count().reset_index() - - for clid in chosen_clusters.index.values: - - ts = pd.read_feather(f"data/ts_term_cluster_{clid}.feather") - - pn.options.figure_size = (11.7,8.27) - p = pn.ggplot(ts) - p = p + pn.geom_line(pn.aes('week','value',group='subreddit')) - p = p + pn.facet_wrap('~ subreddit') - p.save(f"plots/ts_term_cluster_{clid}.png") - - - fig, ax = pyplot.subplots(figsize=(11.7,8.27)) - g = sns.FacetGrid(ts,row='subreddit') - g.map_dataframe(sns.scatterplot,'week','value',data=ts,ax=ax) diff --git a/timeseries/cluster_timeseries.py b/timeseries/cluster_timeseries.py deleted file mode 100644 index 2286ab0..0000000 --- a/timeseries/cluster_timeseries.py +++ /dev/null @@ -1,37 +0,0 @@ -import pandas as pd -import numpy as np -from pyspark.sql import functions as f -from pyspark.sql import SparkSession -from .choose_clusters import load_clusters, load_densities -import fire -from pathlib import Path - -def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather", - author_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather", - term_densities_path="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather", - author_densities_path="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", - output="data/subreddit_timeseries.parquet"): - - spark = SparkSession.builder.getOrCreate() - - df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet") - - df = df.withColumn('week', f.date_trunc('week', f.col("CreatedAt"))) - - # time of unique authors by series by week - ts = df.select(['subreddit','week','author']).distinct().groupby(['subreddit','week']).count() - - ts = ts.repartition('subreddit') - - if term_densities_path is not None and author_densities_path is not None: - densities = load_densities(term_densities_path, author_densities_path) - spk_densities = spark.createDataFrame(densities) - ts = ts.join(spk_densities, on='subreddit', how='inner') - - clusters = load_clusters(term_clusters_path, author_clusters_path) - spk_clusters = 
spark.createDataFrame(clusters) - ts = ts.join(spk_clusters, on='subreddit', how='inner') - ts.write.parquet(output, mode='overwrite') - -if __name__ == "__main__": - fire.Fire(build_cluster_timeseries) diff --git a/tsne_subreddit_fit.feather b/tsne_subreddit_fit.feather deleted file mode 100644 index 74f6d8c..0000000 --- a/tsne_subreddit_fit.feather +++ /dev/null @@ -1 +0,0 @@ -/annex/objects/SHA256E-s60874--d536adb0ec637fca262c4e1ec908dd8b4a5d1464047b583cd1a99cc6dba87191 diff --git a/visualization/Makefile b/visualization/Makefile deleted file mode 100644 index 97a7038..0000000 --- a/visualization/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -all: subreddit_author_tf_similarities_10000.html #comment_authors_10000.html - -# wang_tsne_10000.html -# wang_tsne_10000.html:/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather tsne_vis.py -# python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather --output=wang_tsne_10000.html - -# comment_authors_10000.html:/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather tsne_vis.py -# python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather --output=comment_authors_10000.html - -subreddit_author_tf_similarities_10000.html:/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather tsne_vis.py - start_spark_and_run.sh 1 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather --output=subreddit_author_tf_similarities_10000.html diff --git a/visualization/data/term_affinityprop_10000.feather b/visualization/data/term_affinityprop_10000.feather deleted file mode 120000 index 188939f..0000000 --- a/visualization/data/term_affinityprop_10000.feather +++ /dev/null @@ -1 +0,0 @@ -../../.git/annex/objects/Qk/wG/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784 \ No newline at end of file diff --git a/visualization/data/term_affinityprop_3000.feather b/visualization/data/term_affinityprop_3000.feather deleted file mode 120000 index c9b4233..0000000 --- a/visualization/data/term_affinityprop_3000.feather +++ /dev/null @@ -1 +0,0 @@ -../../.git/annex/objects/w7/2f/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e \ No newline at end of file diff --git a/visualization/data/term_tsne_10000.feather b/visualization/data/term_tsne_10000.feather deleted file mode 120000 index 764f2e0..0000000 --- a/visualization/data/term_tsne_10000.feather +++ /dev/null @@ -1 +0,0 @@ -../../.git/annex/objects/WX/v3/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543 \ No newline at end of file diff --git a/visualization/data/term_tsne_3000.feather 
b/visualization/data/term_tsne_3000.feather deleted file mode 120000 index 21f156f..0000000 --- a/visualization/data/term_tsne_3000.feather +++ /dev/null @@ -1 +0,0 @@ -../../.git/annex/objects/mq/2z/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf \ No newline at end of file diff --git a/visualization/subreddit_author_tf_similarities_10000.html b/visualization/subreddit_author_tf_similarities_10000.html deleted file mode 100644 index eac12c5..0000000 --- a/visualization/subreddit_author_tf_similarities_10000.html +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - - - - -
\ No newline at end of file diff --git a/visualization/subreddit_author_tf_similarities_10000_viewport.html b/visualization/subreddit_author_tf_similarities_10000_viewport.html deleted file mode 100644 index c2e9a33..0000000 --- a/visualization/subreddit_author_tf_similarities_10000_viewport.html +++ /dev/null @@ -1,35 +0,0 @@
- - - \ No newline at end of file diff --git a/visualization/tsne_vis.py b/visualization/tsne_vis.py deleted file mode 100644 index eb6a6be..0000000 --- a/visualization/tsne_vis.py +++ /dev/null @@ -1,187 +0,0 @@ -import pyarrow -import altair as alt -alt.data_transformers.disable_max_rows() -alt.data_transformers.enable('default') -from sklearn.neighbors import NearestNeighbors -import pandas as pd -from numpy import random -import fire -import numpy as np - -def base_plot(plot_data): - -# base = base.encode(alt.Color(field='color',type='nominal',scale=alt.Scale(scheme='category10'))) - - cluster_dropdown = alt.binding_select(options=[str(c) for c in sorted(set(plot_data.cluster))]) - - # subreddit_dropdown = alt.binding_select(options=sorted(plot_data.subreddit)) - - cluster_click_select = alt.selection_single(on='click',fields=['cluster'], bind=cluster_dropdown, name=' ') - # cluster_select = alt.selection_single(fields=['cluster'], bind=cluster_dropdown, name='cluster') - # cluster_select_and = cluster_click_select & cluster_select - # - # subreddit_select = alt.selection_single(on='click',fields=['subreddit'],bind=subreddit_dropdown,name='subreddit_click') - - base_scale = alt.Scale(scheme={"name":'category10', - "extent":[0,100], - "count":10}) - - color = alt.condition(cluster_click_select , - alt.Color(field='color',type='nominal',scale=base_scale), - alt.value("lightgray")) - - - base = alt.Chart(plot_data).mark_text().encode( - alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=(-65,65))), - alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=(-65,65))), - color=color, - text='subreddit') - - base = base.add_selection(cluster_click_select) - - - return base - -def zoom_plot(plot_data): - chart = base_plot(plot_data) - - chart = chart.interactive() - chart = chart.properties(width=1275,height=800) - - return chart - -def viewport_plot(plot_data): - selector1 = alt.selection_interval(encodings=['x','y'],init={'x':(-65,65),'y':(-65,65)}) - selectorx2 = alt.selection_interval(encodings=['x'],init={'x':(30,40)}) - selectory2 = alt.selection_interval(encodings=['y'],init={'y':(-20,0)}) - - base = base_plot(plot_data) - - viewport = base.mark_point(fillOpacity=0.2,opacity=0.2).encode( - alt.X('x',axis=alt.Axis(grid=False)), - alt.Y('y',axis=alt.Axis(grid=False)), - ) - - viewport = viewport.properties(width=600,height=400) - - viewport1 = viewport.add_selection(selector1) - - viewport2 = viewport.encode( - alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selector1)), - alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selector1)) - ) - - viewport2 = viewport2.add_selection(selectorx2) - viewport2 = viewport2.add_selection(selectory2) - - sr = base.encode(alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selectorx2)), - alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selectory2)) - ) - - - sr = sr.properties(width=1275,height=600) - - - chart = (viewport1 | viewport2) & sr - - - return chart - -def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4): - isolate_color = 101 - - cluster_sizes = clusters.groupby('cluster').count() - singletons = set(cluster_sizes.loc[cluster_sizes.subreddit == 1].reset_index().cluster) - - tsne_data = tsne_data.merge(clusters,on='subreddit') - - centroids = tsne_data.groupby('cluster').agg({'x':np.mean,'y':np.mean}) - - color_ids = np.arange(n_colors) - - distances = np.empty(shape=(centroids.shape[0],centroids.shape[0])) - - groups = tsne_data.groupby('cluster') - - points = 
np.array(tsne_data.loc[:,['x','y']]) - centers = np.array(centroids.loc[:,['x','y']]) - - # point x centroid - point_center_distances = np.linalg.norm((points[:,None,:] - centers[None,:,:]),axis=-1) - - # distances is cluster x point - for gid, group in groups: - c_dists = point_center_distances[group.index.values,:].min(axis=0) - distances[group.cluster.values[0],] = c_dists - - # nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(centroids) - # distances, indices = nbrs.kneighbors() - - nearest = distances.argpartition(n_neighbors,0) - indices = nearest[:n_neighbors,:].T - # neighbor_distances = np.copy(distances) - # neighbor_distances.sort(0) - # neighbor_distances = neighbor_distances[0:n_neighbors,:] - - # nbrs = NearestNeighbors(n_neighbors=n_neighbors,metric='precomputed').fit(distances) - # distances, indices = nbrs.kneighbors() - - color_assignments = np.repeat(-1,len(centroids)) - - for i in range(len(centroids)): - if (centroids.iloc[i].name == -1) or (i in singletons): - color_assignments[i] = isolate_color - else: - knn = indices[i] - knn_colors = color_assignments[knn] - available_colors = color_ids[list(set(color_ids) - set(knn_colors))] - - if(len(available_colors) > 0): - color_assignments[i] = available_colors[0] - else: - raise Exception("Can't color this many neighbors with this many colors") - - centroids = centroids.reset_index() - colors = centroids.loc[:,['cluster']] - colors['color'] = color_assignments - - tsne_data = tsne_data.merge(colors,on='cluster') - return(tsne_data) - -def build_visualization(tsne_data, clusters, output): - - # tsne_data = "/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather" - # clusters = "/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather" - - tsne_data = pd.read_feather(tsne_data) - tsne_data = tsne_data.rename(columns={'_subreddit':'subreddit'}) - clusters = pd.read_feather(clusters) - - tsne_data = assign_cluster_colors(tsne_data,clusters,10,8) - - sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index() - sr_per_cluster = sr_per_cluster.rename(columns={'subreddit':'cluster_size'}) - - tsne_data = tsne_data.merge(sr_per_cluster,on='cluster') - - term_zoom_plot = zoom_plot(tsne_data) - - term_zoom_plot.save(output) - - term_viewport_plot = viewport_plot(tsne_data) - - term_viewport_plot.save(output.replace(".html","_viewport.html")) - -if __name__ == "__main__": - fire.Fire(build_visualization) - -# commenter_data = pd.read_feather("tsne_author_fit.feather") -# clusters = pd.read_feather('author_3000_clusters.feather') -# commenter_data = assign_cluster_colors(commenter_data,clusters,10,8) -# commenter_zoom_plot = zoom_plot(commenter_data) -# commenter_viewport_plot = viewport_plot(commenter_data) -# commenter_zoom_plot.save("subreddit_commenters_tsne_3000.html") -# commenter_viewport_plot.save("subreddit_commenters_tsne_3000_viewport.html") - -# chart = chart.properties(width=10000,height=10000) -# chart.save("test_tsne_whole.svg") From 9345f9de9437d5965ad4ee5874bc24199e077d48 Mon Sep 17 00:00:00 2001 From: Nathan TeBlunthuis Date: Wed, 31 May 2023 09:47:21 -0700 Subject: [PATCH 22/22] make pass keyword arg to dataframe.drop --- clustering/clustering_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clustering/clustering_base.py b/clustering/clustering_base.py index 98a260e..2f37b68 100644 --- a/clustering/clustering_base.py +++ b/clustering/clustering_base.py @@ -68,7 +68,7 @@ class clustering_job: def 
read_distance_mat(self, similarities, use_threads=True): print(similarities) df = pd.read_feather(similarities, use_threads=use_threads) - mat = np.array(df.drop('_subreddit',1)) + mat = np.array(df.drop('_subreddit',axis=1)) n = mat.shape[0] mat[range(n),range(n)] = 1 return (df._subreddit,1-mat)
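The change in this last patch is narrow but load-bearing: recent pandas releases make the axis argument of DataFrame.drop keyword-only, so the old positional call df.drop('_subreddit', 1) raises a TypeError while df.drop('_subreddit', axis=1) behaves the same on both old and new versions. A minimal sketch of the read_distance_mat logic under that assumption, using a toy similarity frame rather than the real feather files:

import numpy as np
import pandas as pd

# toy stand-in for the similarity feathers read above: a '_subreddit' label
# column plus one similarity column per subreddit
df = pd.DataFrame({'_subreddit': ['a', 'b'],
                   'a': [0.98, 0.25],
                   'b': [0.25, 0.97]})

mat = np.array(df.drop('_subreddit', axis=1))  # keyword form, as in the patch
n = mat.shape[0]
mat[range(n), range(n)] = 1    # pin the diagonal to exactly 1
distances = 1 - mat            # convert similarities to distances
print(df._subreddit.values)
print(distances)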