Merge remote-tracking branch 'origin/icwsm_dataverse'
commit 3d192ab82f

__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
from .timeseries import load_clusters, load_densities, build_cluster_timeseries
@@ -1,74 +0,0 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import FloatType
import zlib

def zlib_entropy_rate(s):
    sb = s.encode()
    if len(sb) == 0:
        return None
    else:
        return len(zlib.compress(s.encode(),level=6))/len(s.encode())

zlib_entropy_rate_udf = f.udf(zlib_entropy_rate,FloatType())

spark = SparkSession.builder.getOrCreate()

df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_author.parquet",compression='snappy')

df = df.withColumn("saidbot",f.lower(f.col("body")).like("%bot%"))

# df = df.filter(df.subreddit=='seattle')
# df = df.cache()
botreplies = df.filter(f.lower(df.body).rlike(".*[good|bad] bot.*"))
botreplies = botreplies.select([f.col("parent_id").substr(4,100).alias("bot_comment_id"),f.lower(f.col("body")).alias("good_bad_bot"),f.col("link_id").alias("gbbb_link_id")])
botreplies = botreplies.groupby(['bot_comment_id']).agg(f.count('good_bad_bot').alias("N_goodbad_votes"),
                                                        f.sum((f.lower(f.col('good_bad_bot')).like('%good bot%').astype("double"))).alias("n_good_votes"),
                                                        f.sum((f.lower(f.col('good_bad_bot')).like('%bad bot%').astype("double"))).alias("n_bad_votes"))

comments_by_author = df.select(['author','id','saidbot']).groupBy('author').agg(f.count('id').alias("N_comments"),
                                                                                f.mean(f.col('saidbot').astype("double")).alias("prop_saidbot"),
                                                                                f.sum(f.col('saidbot').astype("double")).alias("n_saidbot"))

# pd_comments_by_author = comments_by_author.toPandas()
# pd_comments_by_author['frac'] = 500 / pd_comments_by_author['N_comments']
# pd_comments_by_author.loc[pd_comments_by_author.frac > 1, 'frac'] = 1
# fractions = pd_comments_by_author.loc[:,['author','frac']]
# fractions = fractions.set_index('author').to_dict()['frac']

# sampled_author_comments = df.sampleBy("author",fractions).groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments'))
df = df.withColumn("randn",f.randn(seed=1968))

win = Window.partitionBy("author").orderBy("randn")

df = df.withColumn("randRank",f.rank().over(win))
sampled_author_comments = df.filter(f.col("randRank") <= 1000)
sampled_author_comments = sampled_author_comments.groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments'))

author_entropy_rates = sampled_author_comments.select(['author',zlib_entropy_rate_udf(f.col('comments')).alias("entropy_rate")])

parents = df.join(botreplies, on=df.id==botreplies.bot_comment_id,how='right_outer')

win1 = Window.partitionBy("author")
parents = parents.withColumn("first_bot_reply",f.min(f.col("CreatedAt")).over(win1))

first_bot_reply = parents.filter(f.col("first_bot_reply")==f.col("CreatedAt"))
first_bot_reply = first_bot_reply.withColumnRenamed("CreatedAt","FB_CreatedAt")
first_bot_reply = first_bot_reply.withColumnRenamed("id","FB_id")

comments_since_first_bot_reply = df.join(first_bot_reply,on = 'author',how='right_outer').filter(f.col("CreatedAt")>=f.col("first_bot_reply"))
comments_since_first_bot_reply = comments_since_first_bot_reply.groupBy("author").agg(f.count("id").alias("N_comments_since_firstbot"))

bots = parents.groupby(['author']).agg(f.sum('N_goodbad_votes').alias("N_goodbad_votes"),
                                       f.sum(f.col('n_good_votes')).alias("n_good_votes"),
                                       f.sum(f.col('n_bad_votes')).alias("n_bad_votes"),
                                       f.count(f.col('author')).alias("N_bot_posts"))

bots = bots.join(comments_by_author,on="author",how='left_outer')
bots = bots.join(comments_since_first_bot_reply,on="author",how='left_outer')
bots = bots.join(author_entropy_rates,on='author',how='left_outer')

bots = bots.orderBy("N_goodbad_votes",ascending=False)
bots = bots.repartition(1)
bots.write.parquet("/gscratch/comdata/output/reddit_good_bad_bot.parquet",mode='overwrite')
@@ -1,55 +1,36 @@
-#srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
+srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40 /bin/bash -c
-srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
+similarity_data=../../data/reddit_similarity
-similarity_data=/gscratch/comdata/output/reddit_similarity
+clustering_data=../../data/reddit_clustering
-clustering_data=/gscratch/comdata/output/reddit_clustering
+kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
-selection_grid="--max_iter=3000 --convergence_iter=15,30,100 --damping=0.5,0.6,0.7,0.8,0.85,0.9,0.95,0.97,0.99, --preference_quantile=0.1,0.3,0.5,0.7,0.9"
+hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
-#selection_grid="--max_iter=3000 --convergence_iter=[15] --preference_quantile=[0.5] --damping=[0.99]"
+affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
-all:$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv
-# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
-# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS

-$(clustering_data)/subreddit_comment_authors_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_authors_10k.feather clustering.py
+authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_10k.feather $(clustering_data)/subreddit_comment_authors_10k $(clustering_data)/subreddit_comment_authors_10k/selection_data.csv $(selection_grid) -J 20
+authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI

-$(clustering_data)/subreddit_comment_terms_10k/selection_data.csv:selection.py $(similarity_data)/subreddit_comment_terms_10k.feather clustering.py
+all:authors_tf_10k_lsi
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_10k.feather $(clustering_data)/subreddit_comment_terms_10k $(clustering_data)/subreddit_comment_terms_10k/selection_data.csv $(selection_grid) -J 20

-$(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv:clustering.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather
+authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
-	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_10k.feather $(clustering_data)/subreddit_comment_authors-tf_10k $(clustering_data)/subreddit_comment_authors-tf_10k/selection_data.csv $(selection_grid) -J 20

-# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
+## LSI Models
-# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
+${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py
+	$(srun_singularity) -c "source ~/.bashrc; python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)"

-# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_terms_30k.feather clustering.py
+${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py
-# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
+	$(srun_singularity) -c "source ~/.bashrc; python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)"

-# $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather
+${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
-# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8 && touch $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
+	$(srun_singularity) -c "source ~/.bashrc; python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)"

+${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
+	$(srun_singularity) -c "source ~/.bashrc; python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2"

-# $(clustering_data)/subreddit_comment_authors_100k.feather:clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather
+${authors_tf_10k_input_lsi}:
-# $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather $(clustering_data)/subreddit_comment_authors_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85
+	$(MAKE) -C ../similarities

-# $(clustering_data)/comment_terms_100k.feather:clustering.py $(similarity_data)/subreddit_comment_terms_100k.feather
+clean:
-# $(srun_singularity) python3 clustering.py $(similarity_data)/comment_terms_10000.feather $(clustering_data)/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5
+	rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
+	rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
+	rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv

-# $(clustering_data)/subreddit_comment_author-tf_100k.feather:clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.feather
+PHONY: clean
-# $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.parquet $(clustering_data)/subreddit_comment_author-tf_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.5 --damping=0.85


-# it's pretty difficult to get a result that isn't one huge megacluster. A sign that it's bullcrap
-# /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
-# ./clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.9 --damping=0.85

-# /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet

-# start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet --output=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather


-# /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather

-# python3 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather --output=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather

-# /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
-# # $srun_cdsc python3
-# start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --output=/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
Binary file not shown.
clustering/affinity_clustering.py (new file, 129 lines)
@@ -0,0 +1,129 @@
from sklearn.cluster import AffinityPropagation
from dataclasses import dataclass
from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep
from pathlib import Path
from itertools import product, starmap
import fire
import sys
import numpy as np

# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
@dataclass
class affinity_clustering_result(clustering_result):
    damping:float
    convergence_iter:int
    preference_quantile:float
    preference:float
    max_iter:int

class affinity_job(clustering_job):
    def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
        super().__init__(infile,
                         outpath,
                         name,
                         call=self._affinity_clustering,
                         preference_quantile=preference_quantile,
                         damping=damping,
                         max_iter=max_iter,
                         convergence_iter=convergence_iter,
                         random_state=1968,
                         verbose=verbose)
        self.damping=damping
        self.max_iter=max_iter
        self.convergence_iter=convergence_iter
        self.preference_quantile=preference_quantile

    def _affinity_clustering(self, mat, preference_quantile, *args, **kwargs):
        mat = 1-mat
        preference = np.quantile(mat, preference_quantile)
        self.preference = preference
        print(f"preference is {preference}")
        print("data loaded")
        sys.stdout.flush()
        clustering = AffinityPropagation(*args,
                                         preference=preference,
                                         affinity='precomputed',
                                         copy=False,
                                         **kwargs).fit(mat)
        return clustering

    def get_info(self):
        result = super().get_info()
        self.result = affinity_clustering_result(**result.__dict__,
                                                 damping=self.damping,
                                                 max_iter=self.max_iter,
                                                 convergence_iter=self.convergence_iter,
                                                 preference_quantile=self.preference_quantile,
                                                 preference=self.preference)
        return self.result

class affinity_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):

        super().__init__(affinity_job,
                         inpath,
                         outpath,
                         self.namer,
                         *args,
                         **kwargs)

    def namer(self,
              damping,
              max_iter,
              convergence_iter,
              preference_quantile):

        return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"

def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], n_cores=10):
    """Run affinity clustering once or more with different parameters.

    Usage:
    affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv>

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit clusterings.
    dampings: one or more numbers in [0.5, 1). damping parameter in affinity propagation clustering.
    preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
    convergence_iters: one or more integers of number of iterations without improvement before stopping.
    max_iters: one or more numbers of different maximum iterations.
    """
    obj = affinity_grid_sweep(inpath,
                              outpath,
                              map(float,dampings),
                              map(int,max_iters),
                              map(int,convergence_iters),
                              map(float,preference_quantiles))
    obj.run(n_cores)
    obj.save(savefile)

def test_select_affinity_clustering():
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                           "test_hdbscan_author30k",
    #                           min_cluster_sizes=[2],
    #                           min_samples=[1,2],
    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                           cluster_selection_methods=['eom','leaf'],
    #                           lsi_dimensions='all')
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_affinity";
    dampings=[0.8,0.9]
    max_iters=[100000]
    convergence_iters=[15]
    preference_quantiles=[0.5,0.7]

    gs = affinity_lsi_grid_sweep(inpath, 'all', outpath, dampings, max_iters, convergence_iters, preference_quantiles)
    gs.run(20)
    gs.save("test_affinity/lsi_sweep.csv")


if __name__ == "__main__":
    fire.Fire(run_affinity_grid_sweep)
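The sweep above is normally driven through fire from the Makefile targets; purely as a rough illustration, the sketch below shows the equivalent call from Python. The paths are placeholders invented for the example, not paths used by this commit.

    from affinity_clustering import run_affinity_grid_sweep

    # One affinity_job is created per parameter combination; each job writes
    # <outpath>/<name>.feather and its diagnostics are collected into the savefile CSV.
    run_affinity_grid_sweep(
        savefile="output/affinity/selection_data.csv",      # hypothetical path
        inpath="similarities/comment_authors_10k.feather",  # hypothetical path
        outpath="output/affinity",
        dampings=[0.8, 0.9],
        max_iters=[3000],
        convergence_iters=[30],
        preference_quantiles=[0.5],
        n_cores=4,
    )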
clustering/affinity_clustering_lsi.py (new file, 99 lines)
@@ -0,0 +1,99 @@
import fire
from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep
from grid_sweep import grid_sweep
from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin
from dataclasses import dataclass

@dataclass
class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
    pass


class affinity_lsi_job(affinity_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(infile,
                         outpath,
                         name,
                         *args,
                         **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        result = super().get_info()
        self.result = affinity_clustering_result_lsi(**result.__dict__,
                                                     lsi_dimensions=self.lsi_dims)
        return self.result

class affinity_lsi_grid_sweep(lsi_grid_sweep):
    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 dampings=[0.9],
                 max_iters=[10000],
                 convergence_iters=[30],
                 preference_quantiles=[0.5]):

        super().__init__(affinity_lsi_job,
                         _affinity_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         dampings,
                         max_iters,
                         convergence_iters,
                         preference_quantiles)


class _affinity_lsi_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 *args,
                 **kwargs):
        self.lsi_dim = lsi_dim
        self.jobtype = affinity_lsi_job
        super().__init__(self.jobtype,
                         inpath,
                         outpath,
                         self.namer,
                         [self.lsi_dim],
                         *args,
                         **kwargs)

    def namer(self, *args, **kwargs):
        s = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s

def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all', n_cores=30):
    """Run affinity clustering once or more with different parameters.

    Usage:
    affinity_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit clusterings.
    dampings: one or more numbers in [0.5, 1). damping parameter in affinity propagation clustering.
    preference_quantiles: one or more numbers in (0,1) for selecting the 'preference' parameter.
    convergence_iters: one or more integers of number of iterations without improvement before stopping.
    max_iters: one or more numbers of different maximum iterations.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    """

    obj = affinity_lsi_grid_sweep(inpath,
                                  lsi_dimensions,
                                  outpath,
                                  map(float,dampings),
                                  map(int,max_iters),
                                  map(int,convergence_iters),
                                  map(float,preference_quantiles))

    obj.run(n_cores)
    obj.save(savefile)

if __name__ == "__main__":
    fire.Fire(run_affinity_lsi_grid_sweep)
@@ -6,21 +6,20 @@ import numpy as np
from sklearn.cluster import AffinityPropagation
import fire
from pathlib import Path
+from multiprocessing import cpu_count
+from dataclasses import dataclass
+from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat

-def read_similarity_mat(similarities, use_threads=True):
+def affinity_clustering(similarities, output, *args, **kwargs):
-    df = pd.read_feather(similarities, use_threads=use_threads)
-    mat = np.array(df.drop('_subreddit',1))
-    n = mat.shape[0]
-    mat[range(n),range(n)] = 1
-    return (df._subreddit,mat)

-def affinity_clustering(similarities, *args, **kwargs):
    subreddits, mat = read_similarity_mat(similarities)
-    return _affinity_clustering(mat, subreddits, *args, **kwargs)
+    clustering = _affinity_clustering(mat, *args, **kwargs)
+    cluster_data = process_clustering_result(clustering, subreddits)
+    cluster_data['algorithm'] = 'affinity'
+    return(cluster_data)

def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
    '''
-    similarities: feather file with a dataframe of similarity scores
+    similarities: matrix of similarity scores
    preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits.
    damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits by author.
    '''
@@ -40,25 +39,14 @@ def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000,
                                     verbose=verbose,
                                     random_state=random_state).fit(mat)

+    cluster_data = process_clustering_result(clustering, subreddits)
-    print(f"clustering took {clustering.n_iter_} iterations")
+    output = Path(output)
-    clusters = clustering.labels_
+    output.parent.mkdir(parents=True,exist_ok=True)

-    print(f"found {len(set(clusters))} clusters")

-    cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_})

-    cluster_sizes = cluster_data.groupby("cluster").count()
-    print(f"the largest cluster has {cluster_sizes.subreddit.max()} members")

-    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")

-    print(f"{(cluster_sizes.subreddit==1).sum()} clusters have 1 member")

-    sys.stdout.flush()
    cluster_data.to_feather(output)
    print(f"saved {output}")
    return clustering



if __name__ == "__main__":
    fire.Fire(affinity_clustering)
clustering/clustering_base.py (new file, 151 lines)
@@ -0,0 +1,151 @@
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from dataclasses import dataclass
from sklearn.metrics import silhouette_score, silhouette_samples
from collections import Counter

# this is meant to be an interface, not created directly
class clustering_job:
    def __init__(self, infile, outpath, name, call, *args, **kwargs):
        self.outpath = Path(outpath)
        self.call = call
        self.args = args
        self.kwargs = kwargs
        self.infile = Path(infile)
        self.name = name
        self.hasrun = False

    def run(self):
        self.subreddits, self.mat = self.read_distance_mat(self.infile)
        self.clustering = self.call(self.mat, *self.args, **self.kwargs)
        self.cluster_data = self.process_clustering(self.clustering, self.subreddits)
        self.outpath.mkdir(parents=True, exist_ok=True)
        self.cluster_data.to_feather(self.outpath/(self.name + ".feather"))

        self.hasrun = True
        self.cleanup()

    def cleanup(self):
        self.cluster_data = None
        self.mat = None
        self.clustering=None
        self.subreddits=None

    def get_info(self):
        if not self.hasrun:
            self.run()

        self.result = clustering_result(outpath=str(self.outpath.resolve()),
                                        silhouette_score=self.score,
                                        name=self.name,
                                        n_clusters=self.n_clusters,
                                        n_isolates=self.n_isolates,
                                        silhouette_samples = self.silsampout
                                        )
        return self.result

    def silhouette(self):
        counts = Counter(self.clustering.labels_)
        singletons = [key for key, value in counts.items() if value == 1]
        isolates = (self.clustering.labels_ == -1) | (np.isin(self.clustering.labels_,np.array(singletons)))
        scoremat = self.mat[~isolates][:,~isolates]
        if self.n_clusters > 1:
            score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed')
            silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed')
            silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp})
            self.outpath.mkdir(parents=True, exist_ok=True)
            silsampout = self.outpath / ("silhouette_samples-" + self.name + ".feather")
            self.silsampout = silsampout.resolve()
            silhouette_samp.to_feather(self.silsampout)
        else:
            score = None
            self.silsampout = None

        return score

    def read_distance_mat(self, similarities, use_threads=True):
        print(similarities)
        df = pd.read_feather(similarities, use_threads=use_threads)
        mat = np.array(df.drop('_subreddit',axis=1))
        n = mat.shape[0]
        mat[range(n),range(n)] = 1
        return (df._subreddit,1-mat)

    def process_clustering(self, clustering, subreddits):

        if hasattr(clustering,'n_iter_'):
            print(f"clustering took {clustering.n_iter_} iterations")

        clusters = clustering.labels_
        self.n_clusters = len(set(clusters))

        print(f"found {self.n_clusters} clusters")
        cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_})

        self.score = self.silhouette()
        print(f"silhouette_score:{self.score}")

        cluster_sizes = cluster_data.groupby("cluster").count().reset_index()
        print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members")

        print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
        n_isolates1 = (cluster_sizes.subreddit==1).sum()

        print(f"{n_isolates1} clusters have 1 member")

        n_isolates2 = cluster_sizes.loc[cluster_sizes.cluster==-1,:]['subreddit'].to_list()
        if len(n_isolates2) > 0:
            n_isolates2 = n_isolates2[0]
            print(f"{n_isolates2} subreddits are in cluster -1",flush=True)

        if n_isolates1 == 0:
            self.n_isolates = n_isolates2
        else:
            self.n_isolates = n_isolates1

        return cluster_data

class twoway_clustering_job(clustering_job):
    def __init__(self, infile, outpath, name, call1, call2, args1, args2):
        self.outpath = Path(outpath)
        self.call1 = call1
        self.args1 = args1
        self.call2 = call2
        self.args2 = args2
        self.infile = Path(infile)
        self.name = name
        self.hasrun = False
        self.args = args1|args2

    def run(self):
        self.subreddits, self.mat = self.read_distance_mat(self.infile)
        self.step1 = self.call1(self.mat, **self.args1)
        self.clustering = self.call2(self.mat, self.step1, **self.args2)
        self.cluster_data = self.process_clustering(self.clustering, self.subreddits)
        self.hasrun = True
        self.after_run()
        self.cleanup()

    def after_run(self):
        self.score = self.silhouette()
        self.outpath.mkdir(parents=True, exist_ok=True)
        print(self.outpath/(self.name+".feather"))
        self.cluster_data.to_feather(self.outpath/(self.name + ".feather"))

    def cleanup(self):
        super().cleanup()
        self.step1 = None

@dataclass
class clustering_result:
    outpath:Path
    silhouette_score:float
    name:str
    n_clusters:int
    n_isolates:int
    silhouette_samples:str
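For orientation, a toy sketch of the input format clustering_job.read_distance_mat expects: a feather file whose "_subreddit" column labels the rows of an otherwise square similarity matrix. The file name and values below are invented for illustration only.

    import pandas as pd

    # Toy similarity matrix; read_distance_mat() drops "_subreddit", forces the
    # diagonal to 1, and returns (labels, 1 - similarities), i.e. a distance matrix.
    toy = pd.DataFrame({
        "_subreddit": ["askreddit", "science", "politics"],
        "askreddit":  [1.0, 0.3, 0.2],
        "science":    [0.3, 1.0, 0.1],
        "politics":   [0.2, 0.1, 1.0],
    })
    toy.to_feather("toy_similarities.feather")  # hypothetical file; requires pyarrow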
@@ -1,34 +0,0 @@
import fire
import pyarrow
import pandas as pd
from numpy import random
import numpy as np
from sklearn.manifold import TSNE

similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet"

def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20):
    '''
    similarities: feather file with a dataframe of similarity scores
    learning_rate: parameter controlling how fast the model converges. Too low and you get outliers. Too high and you get a ball.
    perplexity: number of neighbors to use. the default of 50 is often good.

    '''
    df = pd.read_feather(similarities)

    n = df.shape[0]
    mat = np.array(df.drop('subreddit',1),dtype=np.float64)
    mat[range(n),range(n)] = 1
    mat[mat > 1] = 1
    dist = 2*np.arccos(mat)/np.pi
    tsne_model = TSNE(2,learning_rate=750,perplexity=50,n_iter=10000,metric='precomputed',early_exaggeration=20,n_jobs=-1)
    tsne_fit_model = tsne_model.fit(dist)

    tsne_fit_whole = tsne_fit_model.fit_transform(dist)

    plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':df.subreddit})

    plot_data.to_feather(output)

if __name__ == "__main__":
    fire.Fire(fit_tsne)
clustering/grid_sweep.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from pathlib import Path
from multiprocessing import Pool, cpu_count
from itertools import product, chain
import pandas as pd

class grid_sweep:
    def __init__(self, jobtype, inpath, outpath, namer, *args):
        self.jobtype = jobtype
        self.namer = namer
        print(*args)
        grid = list(product(*args))
        inpath = Path(inpath)
        outpath = Path(outpath)
        self.hasrun = False
        self.grid = [(inpath,outpath,namer(*g)) + g for g in grid]
        self.jobs = [jobtype(*g) for g in self.grid]

    def run(self, cores=20):
        if cores is not None and cores > 1:
            with Pool(cores) as pool:
                infos = pool.map(self.jobtype.get_info, self.jobs)
        else:
            infos = map(self.jobtype.get_info, self.jobs)

        self.infos = pd.DataFrame(infos)
        self.hasrun = True

    def save(self, outcsv):
        if not self.hasrun:
            self.run()
        outcsv = Path(outcsv)
        outcsv.parent.mkdir(parents=True, exist_ok=True)
        self.infos.to_csv(outcsv)


class twoway_grid_sweep(grid_sweep):
    def __init__(self, jobtype, inpath, outpath, namer, args1, args2, *args, **kwargs):
        self.jobtype = jobtype
        self.namer = namer
        prod1 = product(* args1.values())
        prod2 = product(* args2.values())
        grid1 = [dict(zip(args1.keys(), pargs)) for pargs in prod1]
        grid2 = [dict(zip(args2.keys(), pargs)) for pargs in prod2]
        grid = product(grid1, grid2)
        inpath = Path(inpath)
        outpath = Path(outpath)
        self.hasrun = False
        self.grid = [(inpath,outpath,namer(**(g[0] | g[1])), g[0], g[1], *args) for g in grid]
        self.jobs = [jobtype(*g) for g in self.grid]
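A toy sketch of how grid_sweep expands its parameter lists into one job per combination and collects each job's get_info() into a DataFrame. The echo_job class here is hypothetical and not part of the commit.

    from grid_sweep import grid_sweep

    class echo_job:
        # Matches the calling convention grid_sweep uses: (inpath, outpath, name, *params).
        def __init__(self, inpath, outpath, name, alpha, beta):
            self.info = {"name": name, "alpha": alpha, "beta": beta}
        def get_info(self):
            return self.info

    namer = lambda alpha, beta: f"a-{alpha}_b-{beta}"
    sweep = grid_sweep(echo_job, "in.feather", "out", namer, [0.1, 0.2], [1, 2])
    sweep.run(cores=1)   # 2 x 2 = 4 jobs, run serially; cores > 1 uses a process Pool
    print(sweep.infos)   # one row of diagnostics per parameter combination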
clustering/hdbscan_clustering.py (new file, 159 lines)
@@ -0,0 +1,159 @@
from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
from itertools import product, starmap, chain
import pandas as pd
from multiprocessing import cpu_count
import fire

def test_select_hdbscan_clustering():
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                           "test_hdbscan_author30k",
    #                           min_cluster_sizes=[2],
    #                           min_samples=[1,2],
    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                           cluster_selection_methods=['eom','leaf'],
    #                           lsi_dimensions='all')
    inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI"
    outpath = "test_hdbscan";
    min_cluster_sizes=[2,3,4];
    min_samples=[1,2,3];
    cluster_selection_epsilons=[0,0.1,0.3,0.5];
    cluster_selection_methods=[1];
    lsi_dimensions='all'
    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
    gs.run(20)
    gs.save("test_hdbscan/lsi_sweep.csv")
    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom')
    # job1.run()
    # print(job1.get_info())

# df = pd.read_csv("test_hdbscan/selection_data.csv")
# test_select_hdbscan_clustering()
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)
class hdbscan_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):

        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)

    def namer(self,
              min_cluster_size,
              min_samples,
              cluster_selection_epsilon,
              cluster_selection_method):
        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"

@dataclass
class hdbscan_clustering_result(clustering_result):
    min_cluster_size:int
    min_samples:int
    cluster_selection_epsilon:float
    cluster_selection_method:str

class hdbscan_job(clustering_job):
    def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
        super().__init__(infile,
                         outpath,
                         name,
                         call=hdbscan_job._hdbscan_clustering,
                         min_cluster_size=min_cluster_size,
                         min_samples=min_samples,
                         cluster_selection_epsilon=cluster_selection_epsilon,
                         cluster_selection_method=cluster_selection_method
                         )

        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.cluster_selection_method = cluster_selection_method
        # self.mat = 1 - self.mat

    def _hdbscan_clustering(mat, *args, **kwargs):
        print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
        print(mat)
        clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                    core_dist_n_jobs=cpu_count(),
                                    *args,
                                    **kwargs,
                                    )

        clustering = clusterer.fit(mat.astype('double'))

        return(clustering)

    def get_info(self):
        result = super().get_info()
        self.result = hdbscan_clustering_result(**result.__dict__,
                                                min_cluster_size=self.min_cluster_size,
                                                min_samples=self.min_samples,
                                                cluster_selection_epsilon=self.cluster_selection_epsilon,
                                                cluster_selection_method=self.cluster_selection_method)
        return self.result

def run_hdbscan_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
    """Run hdbscan clustering once or more with different parameters.

    Usage:
    hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit clusterings.
    min_cluster_sizes: one or more integers indicating the minimum cluster size
    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_methods: "eom" or "leaf"; eom gives larger clusters.
    """
    obj = hdbscan_grid_sweep(inpath,
                             outpath,
                             map(int,min_cluster_sizes),
                             map(int,min_samples),
                             map(float,cluster_selection_epsilons),
                             cluster_selection_methods)
    obj.run()
    obj.save(savefile)

def KNN_distances_plot(mat,outname,k=2):
    nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:,-1]
    df = pd.DataFrame({'dist':d2})
    df = df.sort_values("dist",ascending=False)
    df['idx'] = np.arange(0,d2.shape[0]) + 1
    p = pn.qplot(x='idx',y='dist',data=df,geom='line') + pn.scales.scale_y_continuous(minor_breaks = np.arange(0,50)/50,
                                                                                      breaks = np.arange(0,10)/10)
    p.save(outname,width=16,height=10)

def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)

    KNN_distances_plot(mat,k=2,outname='terms_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat,k=2,outname='authors_knn_dist2.png')

    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')

if __name__ == "__main__":
    fire.Fire(run_hdbscan_grid_sweep)

# test_select_hdbscan_clustering()
#fire.Fire(select_hdbscan_clustering)
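As a rough sketch only (the paths are hypothetical, not from the commit), a single hdbscan_job can also be run directly, which is what run_hdbscan_grid_sweep does for every parameter combination:

    from hdbscan_clustering import hdbscan_job

    job = hdbscan_job(infile="similarities/comment_authors_10k.feather",  # hypothetical
                      outpath="output/hdbscan",                           # hypothetical
                      name="mcs-2_ms-1_cse-0_csm-eom",
                      min_cluster_size=2,
                      min_samples=1,
                      cluster_selection_epsilon=0,
                      cluster_selection_method='eom')
    info = job.get_info()   # runs the clustering and writes <outpath>/<name>.feather
    print(info.n_clusters, info.silhouette_score)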
clustering/hdbscan_clustering_lsi.py (new file, 101 lines)
@@ -0,0 +1,101 @@
from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result
from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin
from grid_sweep import grid_sweep
import fire
from dataclasses import dataclass

@dataclass
class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
    pass

class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(
            infile,
            outpath,
            name,
            *args,
            **kwargs)
        super().set_lsi_dims(lsi_dims)

    def get_info(self):
        partial_result = super().get_info()
        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
                                                    lsi_dimensions=self.lsi_dims)
        return self.result

class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 min_cluster_sizes,
                 min_samples,
                 cluster_selection_epsilons,
                 cluster_selection_methods
                 ):

        super().__init__(hdbscan_lsi_job,
                         _hdbscan_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         min_cluster_sizes,
                         min_samples,
                         cluster_selection_epsilons,
                         cluster_selection_methods)


class _hdbscan_lsi_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 *args,
                 **kwargs):
        print(args)
        print(kwargs)

        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)


    def namer(self, *args, **kwargs):
        s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s

def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=[1], lsi_dimensions='all'):
    """Run hdbscan clustering once or more with different parameters.

    Usage:
    hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf"> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit clusterings.
    min_cluster_sizes: one or more integers indicating the minimum cluster size
    min_samples: one or more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    """

    obj = hdbscan_lsi_grid_sweep(inpath,
                                 lsi_dimensions,
                                 outpath,
                                 list(map(int,min_cluster_sizes)),
                                 list(map(int,min_samples)),
                                 list(map(float,cluster_selection_epsilons)),
                                 cluster_selection_methods)

    obj.run(10)
    obj.save(savefile)


if __name__ == "__main__":
    fire.Fire(run_hdbscan_lsi_grid_sweep)
clustering/kmeans_clustering.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from sklearn.cluster import KMeans
import fire
from pathlib import Path
from dataclasses import dataclass
from clustering_base import clustering_result, clustering_job
from grid_sweep import grid_sweep

@dataclass
class kmeans_clustering_result(clustering_result):
    n_clusters:int
    n_init:int
    max_iter:int

class kmeans_job(clustering_job):
    def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
        super().__init__(infile,
                         outpath,
                         name,
                         call=kmeans_job._kmeans_clustering,
                         n_clusters=n_clusters,
                         n_init=n_init,
                         max_iter=max_iter,
                         random_state=random_state,
                         verbose=verbose)

        self.n_clusters=n_clusters
        self.n_init=n_init
        self.max_iter=max_iter

    def _kmeans_clustering(mat, *args, **kwargs):

        clustering = KMeans(*args,
                            **kwargs,
                            ).fit(mat)

        return clustering


    def get_info(self):
        result = super().get_info()
        self.result = kmeans_clustering_result(**result.__dict__,
                                               n_init=self.n_init,
                                               max_iter=self.max_iter)
        return self.result


class kmeans_grid_sweep(grid_sweep):

    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):
        super().__init__(kmeans_job, inpath, outpath, self.namer, *args, **kwargs)

    def namer(self,
              n_clusters,
              n_init,
              max_iter):
        return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"

def test_select_kmeans_clustering():
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_kmeans";
    n_clusters=[200,300,400];
    n_init=[1,2,3];
    max_iter=[100000]

    gs = kmeans_lsi_grid_sweep(inpath, 'all', outpath, n_clusters, n_init, max_iter)
    gs.run(1)

    cluster_selection_epsilons=[0,0.1,0.3,0.5];
    cluster_selection_methods=['eom'];
    lsi_dimensions='all'
    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
    gs.run(20)
    gs.save("test_hdbscan/lsi_sweep.csv")

def run_kmeans_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000]):
    """Run kmeans clustering once or more with different parameters.

    Usage:
    kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>

    Keyword arguments:
    savefile: path to save the metadata and diagnostics
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit kmeans clusterings.
    n_clusters: one or more numbers of kmeans clusters to select.
    n_inits: one or more numbers of different initializations to use for each clustering.
    max_iters: one or more numbers of different maximum iterations.
    """

    obj = kmeans_grid_sweep(inpath,
                            outpath,
                            map(int,n_clusters),
                            map(int,n_inits),
                            map(int,max_iters))

    obj.run(1)
    obj.save(savefile)

if __name__ == "__main__":
    fire.Fire(run_kmeans_grid_sweep)
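The selection_data CSV written by obj.save() has one row of diagnostics per parameter combination; as a sketch only (the path below is a placeholder), it might be inspected afterwards like this:

    import pandas as pd

    selection = pd.read_csv("output/kmeans/selection_data.csv", index_col=0)  # hypothetical path
    best = selection.sort_values("silhouette_score", ascending=False).head(5)
    print(best[["name", "n_clusters", "silhouette_score"]])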
clustering/kmeans_clustering_lsi.py (new file, 93 lines)
@ -0,0 +1,93 @@
|
|||||||
|
import fire
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from kmeans_clustering import kmeans_job, kmeans_clustering_result, kmeans_grid_sweep
|
||||||
|
from lsi_base import lsi_mixin, lsi_result_mixin, lsi_grid_sweep
|
||||||
|
from grid_sweep import grid_sweep
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
|
||||||
|
pass
|
||||||
|
|
||||||
|
class kmeans_lsi_job(kmeans_job, lsi_mixin):
|
||||||
|
def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
|
||||||
|
super().__init__(infile,
|
||||||
|
outpath,
|
||||||
|
name,
|
||||||
|
*args,
|
||||||
|
**kwargs)
|
||||||
|
super().set_lsi_dims(lsi_dims)
|
||||||
|
|
||||||
|
def get_info(self):
|
||||||
|
result = super().get_info()
|
||||||
|
self.result = kmeans_clustering_result_lsi(**result.__dict__,
|
||||||
|
lsi_dimensions=self.lsi_dims)
|
||||||
|
return self.result
|
||||||
|
|
||||||
|
class _kmeans_lsi_grid_sweep(grid_sweep):
|
||||||
|
def __init__(self,
|
||||||
|
inpath,
|
||||||
|
outpath,
|
||||||
|
lsi_dim,
|
||||||
|
*args,
|
||||||
|
**kwargs):
|
||||||
|
print(args)
|
||||||
|
print(kwargs)
|
||||||
|
self.lsi_dim = lsi_dim
|
||||||
|
self.jobtype = kmeans_lsi_job
|
||||||
|
super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
|
||||||
|
|
||||||
|
def namer(self, *args, **kwargs):
|
||||||
|
s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
|
||||||
|
s += f"_lsi-{self.lsi_dim}"
|
||||||
|
return s
|
||||||
|
|
||||||
|
class kmeans_lsi_grid_sweep(lsi_grid_sweep):
|
||||||
|
|
||||||
|
def __init__(self,
|
||||||
|
inpath,
|
||||||
|
lsi_dims,
|
||||||
|
outpath,
|
||||||
|
n_clusters,
|
||||||
|
n_inits,
|
||||||
|
max_iters
|
||||||
|
):
|
||||||
|
|
||||||
|
super().__init__(kmeans_lsi_job,
|
||||||
|
_kmeans_lsi_grid_sweep,
|
||||||
|
inpath,
|
||||||
|
lsi_dims,
|
||||||
|
outpath,
|
||||||
|
n_clusters,
|
||||||
|
n_inits,
|
||||||
|
max_iters)
|
||||||
|
|
||||||
|
def run_kmeans_lsi_grid_sweep(savefile, inpath, outpath, n_clusters=[500], n_inits=[1], max_iters=[3000], lsi_dimensions="all"):
|
||||||
|
"""Run kmeans clustering once or more with different parameters.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
kmeans_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH d--lsi_dimensions=<"all"|csv number of LSI dimensions to use> --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>
|
||||||
|
|
||||||
|
Keword arguments:
|
||||||
|
savefile: path to save the metadata and diagnostics
|
||||||
|
inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
|
||||||
|
outpath: path to output fit kmeans clusterings.
|
||||||
|
lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
|
||||||
|
n_clusters: one or more numbers of kmeans clusters to select.
|
||||||
|
n_inits: one or more numbers of different initializations to use for each clustering.
|
||||||
|
max_iters: one or more numbers of different maximum interations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
obj = kmeans_lsi_grid_sweep(inpath,
|
||||||
|
lsi_dimensions,
|
||||||
|
outpath,
|
||||||
|
list(map(int,n_clusters)),
|
||||||
|
list(map(int,n_inits)),
|
||||||
|
list(map(int,max_iters))
|
||||||
|
)
|
||||||
|
|
||||||
|
obj.run(1)
|
||||||
|
obj.save(savefile)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
fire.Fire(run_kmeans_lsi_grid_sweep)
|
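If the sweep is driven from Python rather than through fire, a minimal sketch might look like the following; the module path, data locations, and parameter values here are assumptions, not settings used in this commit.

# A minimal sketch, assuming it is run from the clustering/ directory and that
# the hypothetical folders below exist; values are illustrative only.
from kmeans_clustering_lsi import kmeans_lsi_grid_sweep

sweep = kmeans_lsi_grid_sweep(
    inpath="data/similarities/lsi",      # hypothetical folder of <dim>.feather similarity files
    lsi_dims=[300, 600],                 # LSI dimensions to sweep over
    outpath="data/clustering/kmeans",    # where fitted clusterings are written
    n_clusters=[100, 500],
    n_inits=[3],
    max_iters=[3000],
)
sweep.run(1)                             # run the grid of jobs
sweep.save("kmeans_lsi_selection.csv")   # write fit diagnostics for model selection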
44	clustering/lsi_base.py	Normal file
@@ -0,0 +1,44 @@
from clustering_base import clustering_job, clustering_result
from grid_sweep import grid_sweep, twoway_grid_sweep
from dataclasses import dataclass
from itertools import chain
from pathlib import Path

class lsi_mixin():
    def set_lsi_dims(self, lsi_dims):
        self.lsi_dims = lsi_dims

@dataclass
class lsi_result_mixin:
    lsi_dimensions:int

class lsi_grid_sweep(grid_sweep):
    def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs):
        self.jobtype = jobtype
        self.subsweep = subsweep
        inpath = Path(inpath)
        if lsi_dimensions == 'all':
            lsi_paths = list(inpath.glob("*.feather"))
        else:
            lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]

        print(lsi_paths)
        lsi_nums = [int(p.stem) for p in lsi_paths]
        self.hasrun = False
        self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
        self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))

class twoway_lsi_grid_sweep(twoway_grid_sweep):
    def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, args1, args2):
        self.jobtype = jobtype
        self.subsweep = subsweep
        inpath = Path(inpath)
        if lsi_dimensions == 'all':
            lsi_paths = list(inpath.glob("*.feather"))
        else:
            lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]

        lsi_nums = [int(p.stem) for p in lsi_paths]
        self.hasrun = False
        self.subgrids = [self.subsweep(lsi_path, outpath, lsi_dim, args1, args2) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
        self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
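A small sketch of the naming convention lsi_grid_sweep depends on, using a made-up folder: each feather file's stem is read as its number of LSI dimensions.

# Sketch only; the folder and file names below are hypothetical.
from pathlib import Path

inpath = Path("data/similarities/lsi")        # hypothetical input folder
lsi_paths = sorted(inpath.glob("*.feather"))  # e.g. 100.feather, 300.feather, 600.feather
lsi_nums = [int(p.stem) for p in lsi_paths]   # -> [100, 300, 600]
print(dict(zip(lsi_nums, lsi_paths)))         # dimension -> similarity matrix path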
33	clustering/pick_best_clustering.py	Executable file
@@ -0,0 +1,33 @@
#!/usr/bin/env python3
import fire
import pandas as pd
from pathlib import Path
import shutil
selection_data="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/clustering/comment_authors_compex_LSI/selection_data.csv"

outpath = 'test_best.feather'
min_clusters=50; max_isolates=7500; min_cluster_size=2

# pick the best clustering according to silhouette score subject to constraints
def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
    df = pd.read_csv(selection_data,index_col=0)
    df = df.sort_values("silhouette_score",ascending=False)

    # not sure I fixed the bug underlying this fully or not.
    df['n_isolates_str'] = df.n_isolates.str.strip("[]")
    df['n_isolates_0'] = df['n_isolates_str'].apply(lambda l: len(l) == 0)
    df.loc[df.n_isolates_0,'n_isolates'] = 0
    df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l))

    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)]

    best_cluster = best_cluster.iloc[0]

    best_lsi_dimensions = best_cluster.lsi_dimensions
    print(best_cluster.to_dict())
    best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")
    shutil.copy(best_path,output)
    print(f"lsi dimensions:{best_lsi_dimensions}")

if __name__ == "__main__":
    fire.Fire(pick_best_clustering)
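Called from Python instead of the command line, usage might look like this sketch; the paths and thresholds are hypothetical, and it assumes the selection CSV has the columns referenced above (silhouette_score, n_isolates, n_clusters, min_cluster_size, outpath, name).

# Sketch only; not the authors' actual paths or constraint values.
from pick_best_clustering import pick_best_clustering

pick_best_clustering(
    selection_data="data/clustering/selection_data.csv",  # hypothetical sweep diagnostics CSV
    output="best_clustering.feather",                     # where the winning clustering is copied
    min_clusters=50,       # require at least this many clusters
    max_isolates=7500,     # tolerate at most this many unclustered subreddits
    min_cluster_size=2,    # only consider fits with this minimum cluster size
)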
@@ -1,101 +1,38 @@
-from sklearn.metrics import silhouette_score
-from sklearn.cluster import AffinityPropagation
-from functools import partial
-from clustering import _affinity_clustering, read_similarity_mat
-from dataclasses import dataclass
-from multiprocessing import Pool, cpu_count, Array, Process
-from pathlib import Path
-from itertools import product, starmap
-import numpy as np
-import pandas as pd
-import fire
-import sys
-
-# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
-
-@dataclass
-class clustering_result:
-    outpath:Path
-    damping:float
-    max_iter:int
-    convergence_iter:int
-    preference_quantile:float
-    silhouette_score:float
-    alt_silhouette_score:float
-    name:str
-
-def sim_to_dist(mat):
-    dist = 1-mat
-    dist[dist < 0] = 0
-    np.fill_diagonal(dist,0)
-    return dist
-
-def do_clustering(damping, convergence_iter, preference_quantile, name, mat, subreddits, max_iter, outdir:Path, random_state, verbose, alt_mat, overwrite=False):
-    if name is None:
-        name = f"damping-{damping}_convergenceIter-{convergence_iter}_preferenceQuantile-{preference_quantile}"
-    print(name)
-    sys.stdout.flush()
-    outpath = outdir / (str(name) + ".feather")
-    print(outpath)
-    clustering = _affinity_clustering(mat, subreddits, outpath, damping, max_iter, convergence_iter, preference_quantile, random_state, verbose)
-    mat = sim_to_dist(clustering.affinity_matrix_)
-
-    score = silhouette_score(mat, clustering.labels_, metric='precomputed')
-
-    if alt_mat is not None:
-        alt_distances = sim_to_dist(alt_mat)
-        alt_score = silhouette_score(alt_mat, clustering.labels_, metric='precomputed')
-
-    res = clustering_result(outpath=outpath,
-                            damping=damping,
-                            max_iter=max_iter,
-                            convergence_iter=convergence_iter,
-                            preference_quantile=preference_quantile,
-                            silhouette_score=score,
-                            alt_silhouette_score=score,
-                            name=str(name))
-
-    return res
-
-# alt similiarities is for checking the silhouette coefficient of an alternative measure of similarity (e.g., topic similarities for user clustering).
-
-def select_affinity_clustering(similarities, outdir, outinfo, damping=[0.9], max_iter=100000, convergence_iter=[30], preference_quantile=[0.5], random_state=1968, verbose=True, alt_similarities=None, J=None):
-
-    damping = list(map(float,damping))
-    convergence_iter = convergence_iter = list(map(int,convergence_iter))
-    preference_quantile = list(map(float,preference_quantile))
-
-    if type(outdir) is str:
-        outdir = Path(outdir)
-
-    outdir.mkdir(parents=True,exist_ok=True)
-
-    subreddits, mat = read_similarity_mat(similarities,use_threads=True)
-
-    if alt_similarities is not None:
-        alt_mat = read_similarity_mat(alt_similarities,use_threads=True)
-    else:
-        alt_mat = None
-
-    if J is None:
-        J = cpu_count()
-    pool = Pool(J)
-
-    # get list of tuples: the combinations of hyperparameters
-    hyper_grid = product(damping, convergence_iter, preference_quantile)
-    hyper_grid = (t + (str(i),) for i, t in enumerate(hyper_grid))
-
-    _do_clustering = partial(do_clustering, mat=mat, subreddits=subreddits, outdir=outdir, max_iter=max_iter, random_state=random_state, verbose=verbose, alt_mat=alt_mat)
-
-    # similarities = Array('d', mat)
-    # call pool.starmap
-    print("running clustering selection")
-    clustering_data = pool.starmap(_do_clustering, hyper_grid)
-    clustering_data = pd.DataFrame(list(clustering_data))
-    clustering_data.to_csv(outinfo)
-
-    return clustering_data
-
-if __name__ == "__main__":
-    x = fire.Fire(select_affinity_clustering)
+import pandas as pd
+import plotnine as pn
+from pathlib import Path
+from clustering.fit_tsne import fit_tsne
+from visualization.tsne_vis import build_visualization
+
+df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
+
+# plot silhouette_score as a function of isolates
+df = df.sort_values("silhouette_score")
+
+df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
+p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
+p.save("isolates_x_score.png")
+
+p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
+p.save("clusters_x_isolates.png")
+
+# the best result for hdbscan seems like this one: it has a decent number of
+# i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from ommitted variables
+best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+best_lsi = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]
+
+tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
+
+if not tnse_data.exists():
+    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
+             tnse_data)
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
+                    "./authors-tf_lsi850_best_eom.html")
+
+build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
+                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
+                    "./authors-tf_lsi850_best_leaf.html")
4	clustering/validation.py	Normal file
@@ -0,0 +1,4 @@
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from functools import partial
# silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying.
28	datasets/Makefile	Normal file
@@ -0,0 +1,28 @@
all: ../../data/reddit_comments_by_subreddit.parquet ../../data/reddit_submissions_by_subreddit.parquet

../../data/reddit_comments_by_subreddit.parquet:../../data/temp/reddit_comments.parquet
	../start_spark_and_run.sh 4 comments_2_parquet_part2.py

../../data/temp/reddit_comments.parquet: comments_task_list.sh run_comments_jobs.sbatch
	mkdir -p comments_jobs
	mkdir -p ../../data/temp/
	sbatch --wait --array=1-$(shell cat comments_task_list.sh | wc -l) run_comments_jobs.sbatch 0

temp_reddit_comments.parquet: ../../data/temp/reddit_comments.parquet

comments_task_list.sh: comments_2_parquet_part1.py
	srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 bash -c "source ~/.bashrc && python3 comments_2_parquet_part1.py gen_task_list --overwrite=False"

submissions_task_list.sh: submissions_2_parquet_part1.py
	srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 python3 submissions_2_parquet_part1.py gen_task_list

../../data/reddit_submissions_by_subreddit.parquet:../../data/temp/reddit_submissions.parquet
	../start_spark_and_run.sh 4 submissions_2_parquet_part2.py

../../data/temp/reddit_submissions.parquet: submissions_task_list.sh run_submissions_jobs.sbatch
	mkdir -p submissions_jobs
	rm -rf ../../data/temp/reddit_submissions.parquet
	mkdir -p ../../data/temp/
	sbatch --wait --array=1-$(shell cat submissions_task_list.sh | wc -l) run_submissions_jobs.sbatch 0

temp_reddit_submissions.parquet: ../../data/temp/reddit_submissions.parquet
@@ -1,26 +0,0 @@
-#!/bin/bash
-## parallel_sql_job.sh
-#SBATCH --job-name=tf_subreddit_comments
-## Allocation Definition
-#SBATCH --account=comdata-ckpt
-#SBATCH --partition=ckpt
-## Resources
-## Nodes. This should always be 1 for parallel-sql.
-#SBATCH --nodes=1
-## Walltime (12 hours)
-#SBATCH --time=12:00:00
-## Memory per node
-#SBATCH --mem=32G
-#SBATCH --cpus-per-task=4
-#SBATCH --ntasks=1
-#SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit
-source ./bin/activate
-module load parallel_sql
-echo $(which perl)
-conda list pyarrow
-which python3
-#Put here commands to load other modules (e.g. matlab etc.)
-#Below command means that parallel_sql will get tasks from the database
-#and run them on the node (in parallel). So a 16 core node will have
-#16 tasks running at one time.
-parallel-sql --sql -a parallel --exit-on-term --jobs 4
@@ -1,10 +1,10 @@
+#!/usr/bin/env bash
 ## needs to be run by hand since i don't have a nice way of waiting on a parallel-sql job to complete
 
-#!/usr/bin/env bash
 echo "#!/usr/bin/bash" > job_script.sh
 #echo "source $(pwd)/../bin/activate" >> job_script.sh
 echo "python3 $(pwd)/comments_2_parquet_part1.py" >> job_script.sh
 
-srun -p comdata -A comdata --nodes=1 --mem=120G --time=48:00:00 --pty job_script.sh
+srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 --pty job_script.sh
 
 start_spark_and_run.sh 1 $(pwd)/comments_2_parquet_part2.py
@@ -1,12 +1,15 @@
 #!/usr/bin/env python3
+import os
 import json
 from datetime import datetime
 from multiprocessing import Pool
 from itertools import islice
-from helper import find_dumps, open_fileset
+from helper import open_input_file, find_dumps
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
+from pathlib import Path
+import fire
 
 def parse_comment(comment, names= None):
     if names is None:
@@ -44,72 +47,65 @@ def parse_comment(comment, names= None):
     return tuple(row)
 
 
-# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')])
-
-dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments/"
-
-files = list(find_dumps(dumpdir, base_pattern="RC_20*"))
-
-pool = Pool(28)
-
-stream = open_fileset(files)
-
-N = int(1e4)
-
-rows = pool.imap_unordered(parse_comment, stream, chunksize=int(N/28))
-
-schema = pa.schema([
-    pa.field('id', pa.string(), nullable=True),
-    pa.field('subreddit', pa.string(), nullable=True),
-    pa.field('link_id', pa.string(), nullable=True),
-    pa.field('parent_id', pa.string(), nullable=True),
-    pa.field('created_utc', pa.timestamp('ms'), nullable=True),
-    pa.field('author', pa.string(), nullable=True),
-    pa.field('ups', pa.int64(), nullable=True),
-    pa.field('downs', pa.int64(), nullable=True),
-    pa.field('score', pa.int64(), nullable=True),
-    pa.field('edited', pa.bool_(), nullable=True),
-    pa.field('time_edited', pa.timestamp('ms'), nullable=True),
-    pa.field('subreddit_type', pa.string(), nullable=True),
-    pa.field('subreddit_id', pa.string(), nullable=True),
-    pa.field('stickied', pa.bool_(), nullable=True),
-    pa.field('is_submitter', pa.bool_(), nullable=True),
-    pa.field('body', pa.string(), nullable=True),
-    pa.field('error', pa.string(), nullable=True),
-])
-
-from pathlib import Path
-p = Path("/gscratch/comdata/output/reddit_comments.parquet_temp2")
-
-if not p.is_dir():
-    if p.exists():
-        p.unlink()
-    p.mkdir()
-else:
-    list(map(Path.unlink,p.glob('*')))
-
-part_size = int(1e7)
-part = 1
-n_output = 0
-writer = pq.ParquetWriter(f"/gscratch/comdata/output/reddit_comments.parquet_temp2/part_{part}.parquet",schema=schema,compression='snappy',flavor='spark')
-
-while True:
-    if n_output > part_size:
-        if part > 1:
-            writer.close()
-        part = part + 1
-        n_output = 0
-        writer = pq.ParquetWriter(f"/gscratch/comdata/output/reddit_comments.parquet_temp2/part_{part}.parquet",schema=schema,compression='snappy',flavor='spark')
-
-    n_output += N
-    chunk = islice(rows,N)
-    pddf = pd.DataFrame(chunk, columns=schema.names)
-    table = pa.Table.from_pandas(pddf,schema=schema)
-    if table.shape[0] == 0:
-        break
-    writer.write_table(table)
+# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','../../data/spark_tmp')])
+
+def parse_dump(partition):
+
+    dumpdir = f"../../data/reddit_dumps/comments/{partition}"
+
+    stream = open_input_file(dumpdir)
+    rows = map(parse_comment, stream)
+
+    schema = pa.schema([
+        pa.field('id', pa.string(), nullable=True),
+        pa.field('subreddit', pa.string(), nullable=True),
+        pa.field('link_id', pa.string(), nullable=True),
+        pa.field('parent_id', pa.string(), nullable=True),
+        pa.field('created_utc', pa.timestamp('ms'), nullable=True),
+        pa.field('author', pa.string(), nullable=True),
+        pa.field('ups', pa.int64(), nullable=True),
+        pa.field('downs', pa.int64(), nullable=True),
+        pa.field('score', pa.int64(), nullable=True),
+        pa.field('edited', pa.bool_(), nullable=True),
+        pa.field('time_edited', pa.timestamp('ms'), nullable=True),
+        pa.field('subreddit_type', pa.string(), nullable=True),
+        pa.field('subreddit_id', pa.string(), nullable=True),
+        pa.field('stickied', pa.bool_(), nullable=True),
+        pa.field('is_submitter', pa.bool_(), nullable=True),
+        pa.field('body', pa.string(), nullable=True),
+        pa.field('error', pa.string(), nullable=True),
+    ])
+
+    p = Path("../../data/temp/reddit_comments.parquet")
+    p.mkdir(exist_ok=True,parents=True)
+
+    N=10000
+    with pq.ParquetWriter(f"../../data/temp/reddit_comments.parquet/{partition}.parquet",
+                          schema=schema,
+                          compression='snappy',
+                          flavor='spark') as writer:
+
+        while True:
+            chunk = islice(rows,N)
+            pddf = pd.DataFrame(chunk, columns=schema.names)
+            table = pa.Table.from_pandas(pddf,schema=schema)
+            if table.shape[0] == 0:
+                break
+            writer.write_table(table)
+
+        writer.close()
+
+
+def gen_task_list(dumpdir="../../data/raw_data/reddit_dumps/comments", overwrite=True):
+    files = list(find_dumps(dumpdir,base_pattern="RC_20*.*"))
+    with open("comments_task_list.sh",'w') as of:
+        for fpath in files:
+            partition = os.path.split(fpath)[1]
+            if (not Path(f"../../data/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True):
+                of.write(f'python3 comments_2_parquet_part1.py parse_dump {partition}\n')
+
+
+if __name__ == '__main__':
+    fire.Fire({'parse_dump':parse_dump,
+               'gen_task_list':gen_task_list})
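The chunked-write pattern introduced in parse_dump above generalizes beyond Reddit dumps; a minimal self-contained sketch with toy rows follows (field names, file name, and chunk size are illustrative only).

# A minimal sketch of consuming an iterator in fixed-size chunks and appending
# each chunk to a single parquet file; the data here is made up.
from itertools import islice
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([pa.field('id', pa.string()), pa.field('score', pa.int64())])
rows = ({'id': str(i), 'score': i % 7} for i in range(25_000))  # any row iterator works

N = 10_000  # rows per chunk; bounds memory regardless of input size
with pq.ParquetWriter("toy.parquet", schema=schema, compression='snappy') as writer:
    while True:
        chunk = list(islice(rows, N))    # take the next N rows (empty once exhausted)
        if not chunk:
            break
        table = pa.Table.from_pandas(pd.DataFrame(chunk), schema=schema, preserve_index=False)
        writer.write_table(table)        # appended as another row group in the same file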
@@ -2,12 +2,19 @@
 
 # spark script to make sorted, and partitioned parquet files
 
+import pyspark
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 
 spark = SparkSession.builder.getOrCreate()
 
-df = spark.read.parquet("/gscratch/comdata/output/reddit_comments.parquet_temp2",compression='snappy')
+conf = pyspark.SparkConf().setAppName("Reddit submissions to parquet")
+conf = conf.set("spark.sql.shuffle.partitions",2400)
+conf = conf.set('spark.sql.crossJoin.enabled',"true")
+conf = conf.set('spark.debug.maxToStringFields',200)
+sc = spark.sparkContext
+
+df = spark.read.parquet("/gscratch/comdata/output/temp/reddit_comments.parquet",compression='snappy')
 
 df = df.withColumn("subreddit_2", f.lower(f.col('subreddit')))
 df = df.drop('subreddit')
@@ -18,12 +25,13 @@ df = df.withColumn("Month",f.month(f.col("CreatedAt")))
 df = df.withColumn("Year",f.year(f.col("CreatedAt")))
 df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt")))
 
-df = df.repartition('subreddit')
-df2 = df.sort(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True)
-df2 = df2.sortWithinPartitions(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True)
-df2.write.parquet("/gscratch/comdata/users/nathante/reddit_comments_by_subreddit.parquet_new", mode='overwrite', compression='snappy')
-
-df = df.repartition('author')
-df3 = df.sort(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True)
-df3 = df3.sortWithinPartitions(["author","CreatedAt","subreddit","link_id","parent_id","Year","Month","Day"],ascending=True)
-df3.write.parquet("/gscratch/comdata/users/nathante/reddit_comments_by_author.parquet_new", mode='overwrite',compression='snappy')
+# df = df.repartition(1200,'subreddit')
+# df2 = df.sort(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True)
+# df2 = df2.sortWithinPartitions(["subreddit","CreatedAt","link_id","parent_id","Year","Month","Day"],ascending=True)
+# df2.write.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_subreddit.parquet", mode='overwrite', compression='snappy')
+
+#df = spark.read.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_subreddit.parquet")
+df = df.repartition(2400,'author','subreddit',"Year","Month","Day")
+df3 = df.sort(["author","subreddit","Year","Month","Day","CreatedAt","link_id","parent_id"],ascending=True)
+df3 = df3.sortWithinPartitions(["author","subreddit","Year","Month","Day","CreatedAt","link_id","parent_id"],ascending=True)
+df3.write.parquet("/gscratch/scrubbed/comdata/reddit_comments_by_author.parquet", mode='overwrite',compression='snappy')
@@ -24,8 +24,7 @@ def open_fileset(files):
     for fh in files:
         print(fh)
         lines = open_input_file(fh)
-        for line in lines:
-            yield line
+        yield from lines
 
 def open_input_file(input_filename):
     if re.match(r'.*\.7z$', input_filename):
@@ -39,7 +38,7 @@ def open_input_file(input_filename):
     elif re.match(r'.*\.xz', input_filename):
         cmd = ["xzcat",'-dk', '-T 20',input_filename]
     elif re.match(r'.*\.zst',input_filename):
-        cmd = ['zstd','-dck', input_filename]
+        cmd = ['/kloneusr/bin/zstd','-dck', input_filename, '--memory=2048MB --stdout']
     elif re.match(r'.*\.gz',input_filename):
         cmd = ['gzip','-dc', input_filename]
     try:
@@ -1,4 +0,0 @@
-#!/usr/bin/bash
-start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/users/nathante/subreddit_term_similarity_weekly_5000.parquet --topN=5000
-stop-all.sh
24	datasets/run_comments_jobs.sbatch	Normal file
@@ -0,0 +1,24 @@
#!/bin/bash
## tf reddit comments
#SBATCH --job-name="cdsc_reddit; parse comment dumps"
## Allocation Definition
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/datasets
#SBATCH --output=comments_jobs/%A_%a.out
#SBATCH --error=comments_jobs/%A_%a.out
. /opt/ohpc/admin/lmod/lmod/init/profile
source ~/.bashrc
TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
TASK_CALL=$(sed -n ${TASK_NUM}p ./comments_task_list.sh)
${TASK_CALL}
23	datasets/run_submissions_jobs.sbatch	Normal file
@@ -0,0 +1,23 @@
#!/bin/bash
## tf reddit comments
#SBATCH --job-name="cdsc_reddit; parse submission dumps"
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/cdsc_reddit/datasets
#SBATCH --output=submissions_jobs/%A_%a.out
#SBATCH --error=submissions_jobs/%A_%a.out

TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
TASK_CALL=$(sed -n ${TASK_NUM}p ./submissions_task_list.sh)
${TASK_CALL}
4	datasets/submissions_2_parquet.sh	Normal file → Executable file
@@ -1,8 +1,8 @@
+#!/usr/bin/env bash
 ## this should be run manually since we don't have a nice way to wait on parallel_sql jobs
 
-#!/usr/bin/env bash
-
-./parse_submissions.sh
+srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 python3 $(pwd)/submissions_2_parquet_part1.py gen_task_list
 
 start_spark_and_run.sh 1 $(pwd)/submissions_2_parquet_part2.py
 
@@ -3,26 +3,23 @@
 # two stages:
 # 1. from gz to arrow parquet (this script)
 # 2. from arrow parquet to spark parquet (submissions_2_parquet_part2.py)
 
 from datetime import datetime
-from multiprocessing import Pool
+from pathlib import Path
 from itertools import islice
 from helper import find_dumps, open_fileset
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
-import simdjson
 import fire
 import os
+import json
 
-parser = simdjson.Parser()
 
 def parse_submission(post, names = None):
     if names is None:
         names = ['id','author','subreddit','title','created_utc','permalink','url','domain','score','ups','downs','over_18','has_media','selftext','retrieved_on','num_comments','gilded','edited','time_edited','subreddit_type','subreddit_id','subreddit_subscribers','name','is_self','stickied','quarantine','error']
 
     try:
-        post = parser.parse(post)
+        post = json.loads(post)
     except (ValueError) as e:
         # print(e)
         # print(post)
@@ -61,7 +58,7 @@ def parse_dump(partition):
 
     N=10000
-    stream = open_fileset([f"/gscratch/comdata/raw_data/reddit_dumps/submissions/{partition}"])
+    stream = open_fileset([f"/gscratch/comdata/raw_data/submissions/{partition}"])
     rows = map(parse_submission,stream)
     schema = pa.schema([
         pa.field('id', pa.string(),nullable=True),
@@ -92,8 +89,7 @@ def parse_dump(partition):
         pa.field('quarantine',pa.bool_(),nullable=True),
         pa.field('error',pa.string(),nullable=True)])
 
-    if not os.path.exists("/gscratch/comdata/output/temp/reddit_submissions.parquet/"):
-        os.mkdir("/gscratch/comdata/output/temp/reddit_submissions.parquet/")
+    Path("/gscratch/comdata/output/temp/reddit_submissions.parquet/").mkdir(exist_ok=True,parents=True)
 
     with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_submissions.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer:
         while True:
@@ -106,9 +102,9 @@ def parse_dump(partition):
 
     writer.close()
 
-def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/submissions"):
+def gen_task_list(dumpdir="/gscratch/comdata/raw_data/submissions"):
     files = list(find_dumps(dumpdir,base_pattern="RS_20*.*"))
-    with open("parse_submissions_task_list",'w') as of:
+    with open("submissions_task_list.sh",'w') as of:
         for fpath in files:
             partition = os.path.split(fpath)[1]
             of.write(f'python3 submissions_2_parquet_part1.py parse_dump {partition}\n')
@@ -29,14 +29,14 @@ df = df.withColumn("Day",f.dayofmonth(f.col("CreatedAt")))
 df = df.withColumn("subreddit_hash",f.sha2(f.col("subreddit"), 256)[0:3])
 
 # next we gotta resort it all.
-df = df.repartition("subreddit")
-df2 = df.sort(["subreddit","CreatedAt","id"],ascending=True)
+df = df.repartition(800,"subreddit","Year","Month")
+df2 = df.sort(["subreddit","Year","Month","CreatedAt","id"],ascending=True)
 df2 = df.sortWithinPartitions(["subreddit","CreatedAt","id"],ascending=True)
 df2.write.parquet("/gscratch/comdata/output/temp/reddit_submissions_by_subreddit.parquet2", mode='overwrite',compression='snappy')
 
 
 # # we also want to have parquet files sorted by author then reddit.
-df = df.repartition("author")
-df3 = df.sort(["author","CreatedAt","id"],ascending=True)
+df = df.repartition(800,"author","subreddit","Year","Month")
+df3 = df.sort(["author","Year","Month","CreatedAt","id"],ascending=True)
 df3 = df.sortWithinPartitions(["author","CreatedAt","id"],ascending=True)
 df3.write.parquet("/gscratch/comdata/output/temp/reddit_submissions_by_author.parquet2", mode='overwrite',compression='snappy')
@@ -1,10 +1,7 @@
-all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather
-
-/gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
-	start_spark_and_run.sh 1 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum
-
-/gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
-	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum
-
-/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
-	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
+all: ../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather
+
+../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py ../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
+	../start_spark_and_run.sh 1 overlap_density.py authors --inpath="../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum
+
+../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather:
+	$(MAKE) -C ../similarities
@@ -1,4 +1,6 @@
 #!/usr/bin/bash
+source ~/.bashrc
+echo $(hostname)
 start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --outpath=/gscratch/comdata/output/reddit_density/comment_authors_10000.feather --agg=pd.DataFrame.sum
+spark-submit --verbose --master spark://$(hostname):43015 overlap_density.py authors --inpath=../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
 stop-all.sh
@@ -1,11 +1,12 @@
 import pandas as pd
 from pandas.core.groupby import DataFrameGroupBy as GroupBy
+from pathlib import Path
 import fire
 import numpy as np
 import sys
-sys.path.append("..")
-sys.path.append("../similarities")
-from similarities.similarities_helper import reindex_tfidf, reindex_tfidf_time_interval
+# sys.path.append("..")
+# sys.path.append("../similarities")
+# from similarities.similarities_helper import pull_tfidf
 
 # this is the mean of the ratio of the overlap to the focal size.
 # mean shared membership per focal community member
@@ -13,10 +14,12 @@
 
 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
     df = pd.read_feather(inpath)
-    df = df.drop('subreddit',1)
+    df = df.drop('_subreddit',1)
     np.fill_diagonal(df.values,0)
     df = agg(df, 0).reset_index()
     df = df.rename({0:'overlap_density'},axis='columns')
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     df.to_feather(outpath)
     return df
 
@@ -25,6 +28,8 @@ def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
     # exclude the diagonal
     df = df.loc[df.subreddit != df.variable]
     res = agg(df.groupby(['subreddit','week'])).reset_index()
+    outpath = Path(outpath)
+    outpath.parent.mkdir(parents=True, exist_ok = True)
     res.to_feather(outpath)
     return res
 
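A toy illustration of the overlap-density computation in overlap_density above, with made-up similarity values and agg left at its pd.DataFrame.sum default:

# Sketch only: assumes the input is a square subreddit-by-subreddit similarity matrix.
import numpy as np
import pandas as pd

sims = pd.DataFrame(
    [[1.0, 0.2, 0.4],
     [0.2, 1.0, 0.1],
     [0.4, 0.1, 1.0]],
    index=["a", "b", "c"], columns=["a", "b", "c"])

np.fill_diagonal(sims.values, 0)   # a community's similarity with itself is excluded
density = sims.sum(axis=0).rename("overlap_density").reset_index()
print(density)                     # one density score per subreddit: summed similarity to all others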
@@ -6,9 +6,9 @@ from os import path
 import hashlib
 
 shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text
-shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text
+#shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text
 
-shasums = shasums1 + shasums2
+shasums = shasums1
 dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments"
 
 for l in shasums.strip().split('\n'):
@@ -1,12 +1,12 @@
 #!/bin/bash
 
-user_agent='nathante teblunthuis <nathante@uw.edu>'
+user_agent='"nathante teblunthuis <nathante@uw.edu>"'
 output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments'
 base_url='https://files.pushshift.io/reddit/comments/'
 
-wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
-wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
-wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RC_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RC_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RC_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
 
 
 ./check_comments_shas.py
@@ -1,14 +1,14 @@
 #!/bin/bash
 
-user_agent='nathante teblunthuis <nathante@uw.edu>'
+user_agent='"nathante teblunthuis <nathante@uw.edu>"'
 output_dir='/gscratch/comdata/raw_data/reddit_dumps/submissions'
 base_url='https://files.pushshift.io/reddit/submissions/'
 
-wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
-wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
-wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
-wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
-wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
-wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
+wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
+wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
+wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
+wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
 
 ./check_submission_shas.py
34	dumps/remove_duplicate_comments.py	Normal file
@@ -0,0 +1,34 @@
from pathlib import Path
from itertools import chain, groupby

dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/comments")

zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")
all_files = sorted(list(chain(zst_files, bz2_files, xz_files)))
groups = groupby(all_files, key = lambda p: p.stem)

kept_paths = []
removed_paths = []

priority = ['.zst','.xz','.bz2']

for stem, files in groups:
    keep_file = None
    remove_files = []
    for f in files:
        if keep_file is None:
            keep_file = f
        elif priority.index(keep_file.suffix) > priority.index(f.suffix):
            remove_files.append(keep_file)
            keep_file = f
        else:
            remove_files.append(f)
    kept_paths.append(keep_file)
    removed_paths.extend(remove_files)

(dumpdir / "to_remove").mkdir()

for f in removed_paths:
    f.rename(f.parent / "to_remove" / f.name)
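The suffix-priority rule above can be illustrated on made-up filenames:

# Sketch only: the filenames here are hypothetical; lower index in `priority`
# means a more preferred compression format, so only that copy is kept.
from itertools import groupby
from pathlib import Path

priority = ['.zst', '.xz', '.bz2']
files = sorted([Path("RC_2015-01.bz2"), Path("RC_2015-01.zst"), Path("RC_2016-07.xz")])

for stem, group in groupby(files, key=lambda p: p.stem):
    keep = min(group, key=lambda p: priority.index(p.suffix))
    print(stem, "->", keep)   # RC_2015-01 -> RC_2015-01.zst ; RC_2016-07 -> RC_2016-07.xz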
34	dumps/remove_duplicate_submissions.py	Normal file
@@ -0,0 +1,34 @@
from pathlib import Path
from itertools import chain, groupby

dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/submissions")

zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")
all_files = sorted(list(chain(zst_files, bz2_files, xz_files)))
groups = groupby(all_files, key = lambda p: p.stem)

kept_paths = []
removed_paths = []

priority = ['.zst','.xz','.bz2']

for stem, files in groups:
    keep_file = None
    remove_files = []
    for f in files:
        if keep_file is None:
            keep_file = f
        elif priority.index(keep_file.suffix) > priority.index(f.suffix):
            remove_files.append(keep_file)
            keep_file = f
        else:
            remove_files.append(f)
    kept_paths.append(keep_file)
    removed_paths.extend(remove_files)

(dumpdir / "to_remove").mkdir()

for f in removed_paths:
    f.rename(f.parent / "to_remove" / f.name)
@@ -1,17 +0,0 @@
-import pyarrow.dataset as ds
-
-# A pyarrow dataset abstracts reading, writing, or filtering a parquet file. It does not read dataa into memory.
-#dataset = ds.dataset(pathlib.Path('/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet/'), format='parquet', partitioning='hive')
-dataset = ds.dataset('/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/', format='parquet')
-
-# let's get all the comments to two subreddits:
-subreddits_to_pull = ['seattle','seattlewa']
-
-# a table is a low-level structured data format. This line pulls data into memory. Setting metadata_n_threads > 1 gives a little speed boost.
-table = dataset.to_table(filter = ds.field('subreddit').isin(subreddits_to_pull), columns=['id','subreddit','CreatedAt','author','ups','downs','score','subreddit_id','stickied','title','url','is_self','selftext'])
-
-# Since data from just these 2 subreddits fits in memory we can just turn our table into a pandas dataframe.
-df = table.to_pandas()
-
-# We should save this smaller dataset so we don't have to wait 15 min to pull from parquet next time.
-df.to_csv("mydataset.csv")
@@ -1,38 +0,0 @@
-import pyarrow.dataset as ds
-from itertools import groupby
-
-# A pyarrow dataset abstracts reading, writing, or filtering a parquet file. It does not read dataa into memory.
-
-dataset = ds.dataset('/gscratch/comdata/output/reddit_submissions_by_author.parquet', format='parquet')
-
-# let's get all the comments to two subreddits:
-subreddits_to_pull = ['seattlewa','seattle']
-
-# instead of loading the data into a pandas dataframe all at once we can stream it.
-scan_tasks = dataset.scan(filter = ds.field('subreddit').isin(subreddits_to_pull), columns=['id','subreddit','CreatedAt','author','ups','downs','score','subreddit_id','stickied','title','url','is_self','selftext'])
-
-# simple function to execute scantasks and generate rows
-def iterate_rows(scan_tasks):
-    for st in scan_tasks:
-        for rb in st.execute():
-            df = rb.to_pandas()
-            for t in df.itertuples():
-                yield t
-
-row_iter = iterate_rows(scan_tasks)
-
-# now we can use python's groupby function to read one author at a time
-# note that the same author can appear more than once since the record batches may not be in the correct order.
-author_submissions = groupby(row_iter, lambda row: row.author)
-
-count_dict = {}
-
-for auth, posts in author_submissions:
-    if auth in count_dict:
-        count_dict[auth] = count_dict[auth] + 1
-    else:
-        count_dict[auth] = 1
-
-# since it's partitioned and sorted by author, we get one group for each author
-any([ v != 1 for k,v in count_dict.items()])
25	ngrams/Makefile	Normal file
@@ -0,0 +1,25 @@
outputdir=../../data/reddit_ngrams/
inputdir=../../data/reddit_comments_by_subreddit.parquet
authors_tfdir=${outputdir}/comment_authors.parquet
srun=sbatch --wait --verbose run_job.sbatch

all: ${outputdir}/comment_authors_sorted.parquet/_SUCCESS

tf_task_list_1: tf_comments.py
	${srun} bash -c "python3 tf_comments.py gen_task_list --mwe_pass='first' --outputdir=${outputdir} --tf_task_list=$@ --inputdir=${inputdir}"

${outputdir}/comment_terms.parquet:tf_task_list_1
	mkdir -p sbatch_log
	sbatch --wait --verbose --array=1-$(shell cat $< | wc -l) run_array.sbatch 0 $<

${outputdir}/comment_authors.parquet:${outputdir}/comment_terms.parquet
	-

${outputdir}/comment_authors_sorted.parquet:${outputdir}/comment_authors.parquet sort_tf_comments.py
	../start_spark_and_run.sh 3 sort_tf_comments.py --inparquet=$< --outparquet=$@ --colname=author

${outputdir}/comment_authors_sorted.parquet/_SUCCESS:${outputdir}/comment_authors_sorted.parquet

${inputdir}:
	$(MAKE) -C ../datasets
19	ngrams/run_array.sbatch	Executable file
@@ -0,0 +1,19 @@
#!/bin/bash
#SBATCH --job-name=reddit_comment_term_frequencies
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=9g
#SBATCH --ntasks=1
#SBATCH --export=ALL
#SBATCH --time=48:00:00
#SBATCH --chdir=/gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/ngrams
#SBATCH --error="sbatch_log/%A_%a.out"
#SBATCH --output="sbatch_log/%A_%a.out"

TASK_NUM=$(($SLURM_ARRAY_TASK_ID + $1))
TASK_CALL=$(sed -n ${TASK_NUM}p $2)
${TASK_CALL}
18	ngrams/run_job.sbatch	Normal file
@@ -0,0 +1,18 @@
#!/bin/bash
#SBATCH --job-name="simulate measurement error models"
## Allocation Definition
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
## Resources
#SBATCH --nodes=1
## Walltime (4 hours)
#SBATCH --time=4:00:00
## Memory per node
#SBATCH --mem=4G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=1
#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/ngrams/
#SBATCH --output=sbatch_log/%A_%a.out
#SBATCH --error=sbatch_log/%A_%a.err
echo "$@"
"$@"
@@ -1,8 +1,6 @@
 #!/usr/bin/env bash
-module load parallel_sql
 source ./bin/activate
 python3 tf_comments.py gen_task_list
-psu --del --Y
-cat tf_task_list | psu --load
 
 for job in $(seq 1 50); do sbatch checkpoint_parallelsql.sbatch; done;
@@ -2,12 +2,17 @@
 
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
+import fire
 
-spark = SparkSession.builder.getOrCreate()
-df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/")
-
-df = df.repartition(2000,'term')
-df = df.sort(['term','week','subreddit'])
-df = df.sortWithinPartitions(['term','week','subreddit'])
-
-df.write.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_sorted_tf.parquet_temp",mode='overwrite',compression='snappy')
+def main(inparquet, outparquet, colname):
+    spark = SparkSession.builder.getOrCreate()
+    df = spark.read.parquet(inparquet)
+
+    df = df.repartition(2000,colname)
+    df = df.sort([colname,'week','subreddit'])
+    df = df.sortWithinPartitions([colname,'week','subreddit'])
+
+    df.write.parquet(outparquet,mode='overwrite',compression='snappy')
+
+if __name__ == '__main__':
+    fire.Fire(main)
211
ngrams/tf_comments.py
Executable file
@ -0,0 +1,211 @@
#!/usr/bin/env python3
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pyarrow.compute as pc
from itertools import groupby, islice, chain
import fire
from collections import Counter
import os
import re
from nltk import wordpunct_tokenize, MWETokenizer, sent_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
import string
from random import random
from redditcleaner import clean
from pathlib import Path
from datetime import datetime

# compute term frequencies for comments in each subreddit by week
def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', inputdir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", mwe_pass = 'first', excluded_users=None):

    dataset = ds.dataset(Path(inputdir)/partition, format='parquet')
    outputdir = Path(outputdir)
    samppath = outputdir / "reddit_comment_ngrams_10p_sample"

    if not samppath.exists():
        samppath.mkdir(parents=True, exist_ok=True)

    ngram_output = partition.replace("parquet","txt")

    if excluded_users is not None:
        excluded_users = set(map(str.strip,open(excluded_users)))
        df = df.filter(~ (f.col("author").isin(excluded_users)))

    ngram_path = samppath / ngram_output
    if mwe_pass == 'first':
        if ngram_path.exists():
            ngram_path.unlink()

    dataset = dataset.filter(pc.field("CreatedAt") <= pa.scalar(datetime(2020,4,13)))
    batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])

    schema = pa.schema([pa.field('subreddit', pa.string(), nullable=False),
                        pa.field('term', pa.string(), nullable=False),
                        pa.field('week', pa.date32(), nullable=False),
                        pa.field('tf', pa.int64(), nullable=False)]
                       )

    author_schema = pa.schema([pa.field('subreddit', pa.string(), nullable=False),
                               pa.field('author', pa.string(), nullable=False),
                               pa.field('week', pa.date32(), nullable=False),
                               pa.field('tf', pa.int64(), nullable=False)]
                              )

    dfs = (b.to_pandas() for b in batches)

    def add_week(df):
        df['week'] = (df.CreatedAt - pd.to_timedelta(df.CreatedAt.dt.dayofweek, unit='d')).dt.date
        return(df)

    dfs = (add_week(df) for df in dfs)

    def iterate_rows(dfs):
        for df in dfs:
            for row in df.itertuples():
                yield row

    rows = iterate_rows(dfs)

    subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))

    mwe_path = outputdir / "multiword_expressions.feather"

    if mwe_pass != 'first':
        mwe_dataset = pd.read_feather(mwe_path)
        mwe_dataset = mwe_dataset.sort_values(['phrasePWMI'],ascending=False)
        mwe_phrases = list(mwe_dataset.phrase)
        mwe_phrases = [tuple(s.split(' ')) for s in mwe_phrases]
        mwe_tokenizer = MWETokenizer(mwe_phrases)
        mwe_tokenize = mwe_tokenizer.tokenize

    else:
        mwe_tokenize = MWETokenizer().tokenize

    def remove_punct(sentence):
        new_sentence = []
        for token in sentence:
            new_token = ''
            for c in token:
                if c not in string.punctuation:
                    new_token += c
            if len(new_token) > 0:
                new_sentence.append(new_token)
        return new_sentence

    stopWords = set(stopwords.words('english'))

    # we follow the approach described in datta, phelan, adar 2017
    def my_tokenizer(text):
        # remove stopwords, punctuation, urls, lower case
        # lowercase
        text = text.lower()

        # redditcleaner removes reddit markdown (newlines, quotes, bullet points, links, strikethrough, spoiler, code, superscript, table, headings)
        text = clean(text)

        # sentence tokenize
        sentences = sent_tokenize(text)

        # wordpunct_tokenize
        sentences = map(wordpunct_tokenize, sentences)

        # remove punctuation
        sentences = map(remove_punct, sentences)

        # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
        # they say that they extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
        # here we take a 10 percent sample of sentences
        if mwe_pass == 'first':

            # remove sentences with less than 2 words
            sentences = filter(lambda sentence: len(sentence) > 2, sentences)
            sentences = list(sentences)
            for sentence in sentences:
                if random() <= 0.1:
                    grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
                    with open(ngram_path,'a') as gram_file:
                        for ng in grams:
                            gram_file.write(' '.join(ng) + '\n')
                for token in sentence:
                    if token not in stopWords:
                        yield token

        else:
            # remove stopWords
            sentences = map(mwe_tokenize, sentences)
            sentences = map(lambda s: filter(lambda token: token not in stopWords, s), sentences)
            for sentence in sentences:
                for token in sentence:
                    yield token

    def tf_comments(subreddit_weeks):
        for key, posts in subreddit_weeks:
            subreddit, week = key
            tfs = Counter([])
            authors = Counter([])
            for post in posts:
                tokens = my_tokenizer(post.body)
                tfs.update(tokens)
                authors.update([post.author])

            for term, tf in tfs.items():
                yield [True, subreddit, term, week, tf]

            for author, tf in authors.items():
                yield [False, subreddit, author, week, tf]

    outrows = tf_comments(subreddit_weeks)

    outchunksize = 10000

    termtf_outputdir = (outputdir / "comment_terms.parquet")
    termtf_outputdir.mkdir(parents=True, exist_ok=True)
    authortf_outputdir = (outputdir / "comment_authors.parquet")
    authortf_outputdir.mkdir(parents=True, exist_ok=True)
    termtf_path = termtf_outputdir / partition
    authortf_path = authortf_outputdir / partition
    with pq.ParquetWriter(termtf_path, schema=schema, compression='snappy', flavor='spark') as writer, \
         pq.ParquetWriter(authortf_path, schema=author_schema, compression='snappy', flavor='spark') as author_writer:

        while True:

            chunk = islice(outrows,outchunksize)
            chunk = (c for c in chunk if c[1] is not None)
            pddf = pd.DataFrame(chunk, columns=["is_token"] + schema.names)
            author_pddf = pddf.loc[pddf.is_token == False, schema.names]
            pddf = pddf.loc[pddf.is_token == True, schema.names]
            author_pddf = author_pddf.rename({'term':'author'}, axis='columns')
            author_pddf = author_pddf.loc[:,author_schema.names]
            table = pa.Table.from_pandas(pddf,schema=schema)
            author_table = pa.Table.from_pandas(author_pddf,schema=author_schema)
            do_break = True

            if table.shape[0] != 0:
                writer.write_table(table)
                do_break = False
            if author_table.shape[0] != 0:
                author_writer.write_table(author_table)
                do_break = False

            if do_break:
                break

    writer.close()
    author_writer.close()


def gen_task_list(mwe_pass='first', inputdir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", outputdir='/gscratch/comdata/output/reddit_ngrams/', tf_task_list='tf_task_list', excluded_users_file=None):
    files = os.listdir(inputdir)
    with open(tf_task_list,'w') as outfile:
        for f in files:
            if f.endswith(".parquet"):
                outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} --inputdir {inputdir} --outputdir {outputdir} --excluded_users {excluded_users_file} {f}\n")

if __name__ == "__main__":
    fire.Fire({"gen_task_list":gen_task_list,
               "weekly_tf":weekly_tf})
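As a quick illustration of the week binning done by add_week above (comments are keyed to the Monday of their week by subtracting the weekday offset), here is a minimal standalone sketch; the timestamps are made-up examples, not data from the repository.

import pandas as pd

# toy frame standing in for one arrow batch converted with to_pandas()
df = pd.DataFrame({"CreatedAt": pd.to_datetime(["2020-03-04", "2020-03-08"])})

# same logic as add_week(): every comment in the same Mon-Sun week gets the same date key
df["week"] = (df.CreatedAt - pd.to_timedelta(df.CreatedAt.dt.dayofweek, unit="d")).dt.date
print(df)
#    CreatedAt        week
# 0 2020-03-04  2020-03-02
# 1 2020-03-08  2020-03-02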
22
run_array.sbatch
Normal file
@ -0,0 +1,22 @@
#!/bin/bash
## tf reddit comments
#SBATCH --job-name="wikia ecology; fit var models"
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/wikia_ecology
#SBATCH --output=var_jobs/%A_%a.out
#SBATCH --error=var_jobs/%A_%a.out
TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
TASK_CALL=$(sed -n ${TASK_NUM}p ./var_jobs.sh)
${TASK_CALL}
@ -1,25 +1,28 @@
-all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms.parquet
+srun=srun -p compute-bigmem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40
+srun_huge=srun -p compute-hugemem -A comdata --mem=724g --time=200:00:00 -c 40

-# all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet
+similarity_data=../../data/reddit_similarity
+tfidf_data=${similarity_data}/tfidf
+lsi_components=[10,50,100,200,300,400,500,600,700,850]

+lsi_similarities: ${similarity_data}/subreddit_comment_authors-tf_10k_LSI

-# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+all: ${similarity_data}/subreddit_comment_authors-tf_10k.feather
-# start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.feather

-/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv
+${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
-	start_spark_and_run.sh 1 tfidf.py terms --topN=10000
+	${srun_huge} /bin/bash -c "source ~/.bashrc; python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$<"

-/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv
+${similarity_data}/subreddits_by_num_comments_nonsfw.csv: ../../data/reddit_submissions_by_subreddit.parquet ../../data/reddit_comments_by_subreddit.parquet
-	start_spark_and_run.sh 1 tfidf.py authors --topN=10000
+	../start_spark_and_run.sh 3 top_subreddits_by_comments.py

-/gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+${tfidf_data}/comment_authors_100k.parquet: ../../data/reddit_ngrams/comment_authors_sorted.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
-	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
+	../start_spark_and_run.sh 3 tfidf.py authors --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_authors_100k.parquet

-/gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet
+../../data/reddit_ngrams/comment_authors_sorted.parquet:
-	start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
+	$(MAKE) -C ../ngrams

-# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet
+../../data/reddit_submissions_by_subreddit.parquet:
-# start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet
+	$(MAKE) -C ../datasets

-/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
+../../data/reddit_comments_by_subreddit.parquet:
-	start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
+	$(MAKE) -C ../datasets
Binary file not shown.
@ -2,11 +2,14 @@ import pandas as pd
import fire
from pathlib import Path
from similarities_helper import similarities, column_similarities
+from functools import partial

def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):

    return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)

+# change so that these take in an input as an optional argument (for speed, but also for idf).
-def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
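For orientation, the column_similarities helper these wrappers delegate to computes subreddit-by-subreddit cosine similarity. A minimal sketch with a toy dense matrix follows; the numbers are invented and the real pipeline runs on a sparse tf-idf matrix.

import numpy as np
from sklearn.metrics import pairwise_distances

# three toy "subreddits" described by two term weights each
subreddit_vectors = np.array([[1.0, 0.0],
                              [0.5, 0.5],
                              [0.0, 1.0]])

sims = 1 - pairwise_distances(subreddit_vectors, metric='cosine')
print(np.round(sims, 2))
# the diagonal is 1.0; the middle vector is about 0.71 similar to each of the others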
@ -1,4 +1,6 @@
#!/usr/bin/bash
+source ~/.bashrc
+echo $(hostname)
start_spark_cluster.sh
-spark-submit --master spark://$(hostname):18899 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
+spark-submit --verbose --master spark://$(hostname):43015 tfidf.py authors --topN=100000 --inpath=../../data/reddit_ngrams/comment_authors_sorted.parquet --outpath=../../data/reddit_similarity/tfidf/comment_authors_100k.parquet
stop-all.sh
86
similarities/lsi_similarities.py
Normal file
86
similarities/lsi_similarities.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import fire
|
||||||
|
from pathlib import Path
|
||||||
|
from similarities_helper import *
|
||||||
|
#from similarities_helper import similarities, lsi_column_similarities
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
# inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet"
|
||||||
|
# term_colname='authors'
|
||||||
|
# outfile='/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_test_compex_LSI'
|
||||||
|
# n_components=[10,50,100]
|
||||||
|
# included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt"
|
||||||
|
# n_iter=5
|
||||||
|
# random_state=1968
|
||||||
|
# algorithm='randomized'
|
||||||
|
# topN = None
|
||||||
|
# from_date=None
|
||||||
|
# to_date=None
|
||||||
|
# min_df=None
|
||||||
|
# max_df=None
|
||||||
|
|
||||||
|
def lsi_similarities(inpath, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack',lsi_model=None):
|
||||||
|
print(n_components,flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
if lsi_model is None:
|
||||||
|
if type(n_components) == list:
|
||||||
|
lsi_model = Path(outfile) / f'{max(n_components)}_{term_colname}_LSIMOD.pkl'
|
||||||
|
else:
|
||||||
|
lsi_model = Path(outfile) / f'{n_components}_{term_colname}_LSIMOD.pkl'
|
||||||
|
|
||||||
|
simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm,lsi_model_save=lsi_model)
|
||||||
|
|
||||||
|
return similarities(inpath=inpath, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
|
||||||
|
|
||||||
|
# change so that these take in an input as an optional argument (for speed, but also for idf).
|
||||||
|
def term_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',outfile=None, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, algorithm='arpack', n_components=300,n_iter=5,random_state=1968):
|
||||||
|
|
||||||
|
res = lsi_similarities(inpath,
|
||||||
|
'term',
|
||||||
|
outfile,
|
||||||
|
min_df,
|
||||||
|
max_df,
|
||||||
|
included_subreddits,
|
||||||
|
topN,
|
||||||
|
from_date,
|
||||||
|
to_date,
|
||||||
|
n_components=n_components,
|
||||||
|
algorithm = algorithm
|
||||||
|
)
|
||||||
|
return res
|
||||||
|
|
||||||
|
def author_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,algorithm='arpack',n_components=300,n_iter=5,random_state=1968):
|
||||||
|
return lsi_similarities(inpath,
|
||||||
|
'author',
|
||||||
|
outfile,
|
||||||
|
min_df,
|
||||||
|
max_df,
|
||||||
|
included_subreddits,
|
||||||
|
topN,
|
||||||
|
from_date=from_date,
|
||||||
|
to_date=to_date,
|
||||||
|
n_components=n_components
|
||||||
|
)
|
||||||
|
|
||||||
|
def author_tf_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,algorithm='arpack',n_components=300,n_iter=5,random_state=1968):
|
||||||
|
return lsi_similarities(inpath,
|
||||||
|
'author',
|
||||||
|
outfile,
|
||||||
|
min_df,
|
||||||
|
max_df,
|
||||||
|
included_subreddits,
|
||||||
|
topN,
|
||||||
|
from_date=from_date,
|
||||||
|
to_date=to_date,
|
||||||
|
tfidf_colname='relative_tf',
|
||||||
|
n_components=n_components,
|
||||||
|
algorithm=algorithm
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
fire.Fire({'term':term_lsi_similarities,
|
||||||
|
'author':author_lsi_similarities,
|
||||||
|
'author-tf':author_tf_similarities})
|
||||||
|
|
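A minimal sketch of how these entry points get invoked: the argument values below mirror the Makefile rule earlier in this diff, while the import path is assumed for illustration.

from lsi_similarities import author_tf_similarities  # assumed import path

author_tf_similarities(
    inpath="../../data/reddit_similarity/tfidf/comment_authors_100k.parquet",
    outfile="../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI",
    topN=10000,
    min_df=10,
    n_components=[10, 50, 100, 200, 300, 400, 500, 600, 700, 850],
)
# fire exposes the same call as, roughly:
#   python3 lsi_similarities.py author-tf --inpath=... --outfile=... --topN=10000 --min_df=10 --n_components=[...]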
@ -2,143 +2,190 @@ from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as f
from enum import Enum
+from multiprocessing import cpu_count, Pool
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from tempfile import TemporaryDirectory
import pyarrow
import pyarrow.dataset as ds
+from sklearn.metrics import pairwise_distances
from scipy.sparse import csr_matrix, issparse
+from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
import pathlib
from datetime import datetime
from pathlib import Path
+import pickle

class tf_weight(Enum):
    MaxTF = 1
    Norm05 = 2

-infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet"
+# infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet"
+# cache_file = "/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet"

-def reindex_tfidf_time_interval(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
+# subreddits missing after this step don't have any terms that have a high enough idf
-    term = term_colname
+# try rewriting without merges
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'

-    spark = SparkSession.builder.getOrCreate()
+# does reindex_tfidf, but without reindexing.
-    conf = spark.sparkContext.getConf()
+def reindex_tfidf(*args, **kwargs):
-    print(exclude_phrases)
+    df, tfidf_ds, ds_filter = _pull_or_reindex_tfidf(*args, **kwargs, reindex=True)
-    tfidf_weekly = spark.read.parquet(infile)

-    # create the time interval
+    print("assigning names")
-    if from_date is not None:
+    subreddit_names = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id'])
-        if type(from_date) is str:
+    batches = subreddit_names.to_batches()
-            from_date = datetime.fromisoformat(from_date)
+    with Pool(cpu_count()) as pool:
+        chunks = pool.imap_unordered(pull_names,batches)
+        subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
+    subreddit_names = subreddit_names.set_index("subreddit_id")

-        tfidf_weekly = tfidf_weekly.filter(tfidf_weekly.week >= from_date)
+    new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
+    new_ids = new_ids.set_index('subreddit_id')
-    if to_date is not None:
+    subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
-        if type(to_date) is str:
+    subreddit_names = subreddit_names.drop("subreddit_id",axis=1)
-            to_date = datetime.fromisoformat(to_date)
-        tfidf_weekly = tfidf_weekly.filter(tfidf_weekly.week < to_date)

-    tfidf = tfidf_weekly.groupBy(["subreddit","week", term_id, term]).agg(f.sum("tf").alias("tf"))
-    tfidf = _calc_tfidf(tfidf, term_colname, tf_weight.Norm05)
-    tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, max_df, included_subreddits)
-    tfidf = spark.read_parquet(tempdir.name)
-    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
-    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
+    return(df, subreddit_names)
-    return(tempdir, subreddit_names)

-def reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False):
+def pull_tfidf(*args, **kwargs):
-    spark = SparkSession.builder.getOrCreate()
+    df, _, _ = _pull_or_reindex_tfidf(*args, **kwargs, reindex=False)
-    conf = spark.sparkContext.getConf()
+    return df
-    print(exclude_phrases)

-    tfidf = spark.read.parquet(infile)
+def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=None, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True):
+    print(f"loading tfidf {infile}, week {week}, min_df {min_df}, max_df {max_df}", flush=True)

+    if week is not None:
+        tfidf_ds = ds.dataset(infile, partitioning='hive')
+    else:
+        tfidf_ds = ds.dataset(infile)

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
-        included_subreddits = set(map(str.strip,map(str.lower,open(included_subreddits))))
+        included_subreddits = set(map(str.strip,open(included_subreddits)))

-    if exclude_phrases == True:
+    ds_filter = ds.field("subreddit").isin(included_subreddits)
-        tfidf = tfidf.filter(~f.col(term_colname).contains("_"))

-    print("creating temporary parquet with matrix indicies")
+    if min_df is not None:
-    tempdir = prep_tfidf_entries(tfidf, term_colname, min_df, max_df, included_subreddits)
+        ds_filter &= ds.field("count") >= min_df

-    tfidf = spark.read.parquet(tempdir.name)
+    if max_df is not None:
-    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
+        ds_filter &= ds.field("count") <= max_df
-    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
-    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
+    if week is not None:
-    spark.stop()
+        ds_filter &= ds.field("week") == week
-    return (tempdir, subreddit_names)

+    if from_date is not None:
+        ds_filter &= ds.field("week") >= from_date

+    if to_date is not None:
+        ds_filter &= ds.field("week") <= to_date

+    term = term_colname
+    term_id = term + '_id'
+    term_id_new = term + '_id_new'

+    projection = {
+        'subreddit_id':ds.field('subreddit_id'),
+        term_id:ds.field(term_id),
+        'relative_tf':ds.field("relative_tf").cast('float32')
+        }

+    if not rescale_idf:
+        projection = {
+            'subreddit_id':ds.field('subreddit_id'),
+            term_id:ds.field(term_id),
+            'relative_tf':ds.field('relative_tf').cast('float32'),
+            'tf_idf':ds.field('tf_idf').cast('float32')}

+    print(projection, flush=True)
+    print(ds_filter, flush=True)
+    df = tfidf_ds.to_table(filter=ds_filter,columns=projection)

+    df = df.to_pandas(split_blocks=True,self_destruct=True)
+    print("assigning indexes",flush=True)
+    if reindex:
+        print("assigning indexes",flush=True)
+        df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + 1
+    else:
+        df['subreddit_id_new'] = df['subreddit_id']

+    if reindex:
+        grouped = df.groupby(term_id)
+        df[term_id_new] = grouped.ngroup() + 1
+    else:
+        df[term_id_new] = df[term_id]

+    if rescale_idf:
+        print("computing idf", flush=True)
+        df['new_count'] = grouped[term_id].transform('count')
+        N_docs = df.subreddit_id_new.max() + 1
+        df['idf'] = np.log(N_docs/(1+df.new_count),dtype='float32') + 1
+        if tf_family == tf_weight.MaxTF:
+            df["tf_idf"] = df.relative_tf * df.idf
+        else: # tf_fam = tf_weight.Norm05
+            df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf

+    return (df, tfidf_ds, ds_filter)

-def similarities(infile, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
+def pull_names(batch):
+    return(batch.to_pandas().drop_duplicates())

+def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'):
    '''
    tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities.
    '''
-    if from_date is not None or to_date is not None:
-        tempdir, subreddit_names = reindex_tfidf_time_interval(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False, from_date=from_date, to_date=to_date)
-    else:
-        tempdir, subreddit_names = reindex_tfidf(infile, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=False)

-    print("loading matrix")
+    def proc_sims(sims, outfile):
+        if issparse(sims):
+            sims = sims.todense()

+        print(f"shape of sims:{sims.shape}")
+        print(f"len(subreddit_names.subreddit.values):{len(subreddit_names.subreddit.values)}",flush=True)
+        sims = pd.DataFrame(sims)
+        sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
+        sims['_subreddit'] = subreddit_names.subreddit.values

+        p = Path(outfile)

+        output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
+        output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
+        output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))
+        p.parent.mkdir(exist_ok=True, parents=True)

+        sims.to_feather(outfile)

+    term = term_colname
+    term_id = term + '_id'
+    term_id_new = term + '_id_new'

+    entries, subreddit_names = reindex_tfidf(inpath, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date)
+    mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)))

+    print("loading matrix")

    # mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname)
-    mat = read_tfidf_matrix(tempdir.name, term_colname, tfidf_colname)
    print(f'computing similarities on mat. mat.shape:{mat.shape}')
-    print(f"size of mat is:{mat.data.nbytes}")
+    print(f"size of mat is:{mat.data.nbytes}",flush=True)
    sims = simfunc(mat)
    del mat

-    if issparse(sims):
+    if hasattr(sims,'__next__'):
-        sims = sims.todense()
+        for simmat, name in sims:
+            proc_sims(simmat, Path(outfile)/(str(name) + ".feather"))
-    print(f"shape of sims:{sims.shape}")
+    else:
-    print(f"len(subreddit_names.subreddit.values):{len(subreddit_names.subreddit.values)}")
+        proc_sims(sims, outfile)
-    sims = pd.DataFrame(sims)
-    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
-    sims['subreddit'] = subreddit_names.subreddit.values

-    p = Path(outfile)

-    output_feather = Path(str(p).replace("".join(p.suffixes), ".feather"))
-    output_csv = Path(str(p).replace("".join(p.suffixes), ".csv"))
-    output_parquet = Path(str(p).replace("".join(p.suffixes), ".parquet"))

-    sims.to_feather(outfile)
-    tempdir.cleanup()

-def read_tfidf_matrix_weekly(path, term_colname, week, tfidf_colname='tf_idf'):
-    term = term_colname
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'

-    dataset = ds.dataset(path,format='parquet')
-    entries = dataset.to_table(columns=[tfidf_colname,'subreddit_id_new', term_id_new],filter=ds.field('week')==week).to_pandas()
-    return(csr_matrix((entries[tfidf_colname], (entries[term_id_new]-1, entries.subreddit_id_new-1))))

-def read_tfidf_matrix(path, term_colname, tfidf_colname='tf_idf'):
-    term = term_colname
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'
-    dataset = ds.dataset(path,format='parquet')
-    print(f"tfidf_colname:{tfidf_colname}")
-    entries = dataset.to_table(columns=[tfidf_colname, 'subreddit_id_new',term_id_new]).to_pandas()
-    return(csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1))))

def write_weekly_similarities(path, sims, week, names):
    sims['week'] = week
    p = pathlib.Path(path)
    if not p.is_dir():
-        p.mkdir()
+        p.mkdir(exist_ok=True,parents=True)

    # reformat as a pairwise list
-    sims = sims.melt(id_vars=['subreddit','week'],value_vars=names.subreddit.values)
+    sims = sims.melt(id_vars=['_subreddit','week'],value_vars=names.subreddit.values)
    sims.to_parquet(p / week.isoformat())

def column_overlaps(mat):
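A small worked example of the idf rescaling added in _pull_or_reindex_tfidf above; a sketch with made-up numbers, not data from the pipeline.

import numpy as np

N_docs = 100          # subreddits in the matrix (toy value)
count = 9             # subreddits containing the term (toy value)
relative_tf = 0.5     # term frequency relative to the subreddit's most frequent term

idf = np.log(N_docs / (1 + count)) + 1            # ~3.30
tf_idf_maxtf = relative_tf * idf                  # tf_weight.MaxTF  -> ~1.65
tf_idf_norm05 = (0.5 + 0.5 * relative_tf) * idf   # tf_weight.Norm05 -> ~2.48
print(round(idf, 2), round(tf_idf_maxtf, 2), round(tf_idf_norm05, 2))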
@ -150,136 +197,74 @@ def column_overlaps(mat):
    return intersection / den

+def test_lsi_sims():
+    term = "term"
+    term_id = term + '_id'
+    term_id_new = term + '_id_new'

+    t1 = time.perf_counter()
+    entries, subreddit_names = reindex_tfidf("/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k_repartitioned.parquet",
+                                             term_colname='term',
+                                             min_df=2000,
+                                             topN=10000
+                                             )
+    t2 = time.perf_counter()
+    print(f"first load took:{t2 - t1}s")

+    entries, subreddit_names = reindex_tfidf("/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet",
+                                             term_colname='term',
+                                             min_df=2000,
+                                             topN=10000
+                                             )
+    t3=time.perf_counter()

+    print(f"second load took:{t3 - t2}s")

+    mat = csr_matrix((entries['tf_idf'],(entries[term_id_new], entries.subreddit_id_new)))
+    sims = list(lsi_column_similarities(mat, [10,50]))
+    sims_og = sims
+    sims_test = list(lsi_column_similarities(mat,[10,50],algorithm='randomized',n_iter=10))

+# n_components is the latent dimensionality. sklearn recommends 100. More might be better
+# if n_components is a list we'll return a list of similarities with different latent dimensionalities
+# if algorithm is 'randomized' instead of 'arpack' then n_iter gives the number of iterations.
+# this function takes the svd and then the column similarities of it
+def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None):
+    # first compute the lsi of the matrix
+    # then take the column similarities

+    if type(n_components) is int:
+        n_components = [n_components]

+    n_components = sorted(n_components,reverse=True)

+    svd_components = n_components[0]

+    if lsi_model_load is not None and Path(lsi_model_load).exists():
+        print("loading LSI")
+        mod = pickle.load(open(lsi_model_load ,'rb'))
+        lsi_model_save = lsi_model_load

+    else:
+        print("running LSI",flush=True)
+        svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter)
+        mod = svd.fit(tfidfmat.T)

+    if lsi_model_save is not None:
+        Path(lsi_model_save).parent.mkdir(exist_ok=True, parents=True)
+        pickle.dump(mod, open(lsi_model_save,'wb'))

+    print(n_components, flush=True)
+    lsimat = mod.transform(tfidfmat.T)
+    for n_dims in n_components:
+        print("computing similarities", flush=True)
+        sims = column_similarities(lsimat[:,np.arange(n_dims)])
+        yield (sims, n_dims)

def column_similarities(mat):
-    norm = np.matrix(np.power(mat.power(2).sum(axis=0),0.5,dtype=np.float32))
+    return 1 - pairwise_distances(mat,metric='cosine')
-    mat = mat.multiply(1/norm)
-    sims = mat.T @ mat
-    return(sims)

-def prep_tfidf_entries_weekly(tfidf, term_colname, min_df, max_df, included_subreddits):
-    term = term_colname
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'

-    if min_df is None:
-        min_df = 0.1 * len(included_subreddits)
-    tfidf = tfidf.filter(f.col('count') >= min_df)
-    if max_df is not None:
-        tfidf = tfidf.filter(f.col('count') <= max_df)

-    tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits))

-    # we might not have the same terms or subreddits each week, so we need to make unique ids for each week.
-    sub_ids = tfidf.select(['subreddit_id','week']).distinct()
-    sub_ids = sub_ids.withColumn("subreddit_id_new",f.row_number().over(Window.partitionBy('week').orderBy("subreddit_id")))
-    tfidf = tfidf.join(sub_ids,['subreddit_id','week'])

-    # only use terms in at least min_df included subreddits in a given week
-    new_count = tfidf.groupBy([term_id,'week']).agg(f.count(term_id).alias('new_count'))
-    tfidf = tfidf.join(new_count,[term_id,'week'],how='inner')

-    # reset the term ids
-    term_ids = tfidf.select([term_id,'week']).distinct()
-    term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.partitionBy('week').orderBy(term_id)))
-    tfidf = tfidf.join(term_ids,[term_id,'week'])

-    tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
-    tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float'))

-    tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.')

-    tfidf = tfidf.repartition('week')

-    tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy')
-    return(tempdir)

-def prep_tfidf_entries(tfidf, term_colname, min_df, max_df, included_subreddits):
-    term = term_colname
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'

-    if min_df is None:
-        min_df = 0.1 * len(included_subreddits)
-    tfidf = tfidf.filter(f.col('count') >= min_df)
-    if max_df is not None:
-        tfidf = tfidf.filter(f.col('count') <= max_df)

-    tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits))

-    # reset the subreddit ids
-    sub_ids = tfidf.select('subreddit_id').distinct()
-    sub_ids = sub_ids.withColumn("subreddit_id_new", f.row_number().over(Window.orderBy("subreddit_id")))
-    tfidf = tfidf.join(sub_ids,'subreddit_id')

-    # only use terms in at least min_df included subreddits
-    new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count'))
-    tfidf = tfidf.join(new_count,term_id,how='inner')

-    # reset the term ids
-    term_ids = tfidf.select([term_id]).distinct()
-    term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.orderBy(term_id)))
-    tfidf = tfidf.join(term_ids,term_id)

-    tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
-    tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float'))

-    tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.')

-    tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy')
-    return tempdir

-# try computing cosine similarities using spark
-def spark_cosine_similarities(tfidf, term_colname, min_df, included_subreddits, similarity_threshold):
-    term = term_colname
-    term_id = term + '_id'
-    term_id_new = term + '_id_new'

-    if min_df is None:
-        min_df = 0.1 * len(included_subreddits)

-    tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits))
-    tfidf = tfidf.cache()

-    # reset the subreddit ids
-    sub_ids = tfidf.select('subreddit_id').distinct()
-    sub_ids = sub_ids.withColumn("subreddit_id_new",f.row_number().over(Window.orderBy("subreddit_id")))
-    tfidf = tfidf.join(sub_ids,'subreddit_id')

-    # only use terms in at least min_df included subreddits
-    new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count'))
-    tfidf = tfidf.join(new_count,term_id,how='inner')

-    # reset the term ids
-    term_ids = tfidf.select([term_id]).distinct()
-    term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.orderBy(term_id)))
-    tfidf = tfidf.join(term_ids,term_id)

-    tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
-    tfidf = tfidf.withColumn("tf_idf", tfidf.relative_tf * tfidf.idf)

-    # step 1 make an rdd of entires
-    # sorted by (dense) spark subreddit id
-    n_partitions = int(len(included_subreddits)*2 / 5)

-    entries = tfidf.select(f.col(term_id_new)-1,f.col("subreddit_id_new")-1,"tf_idf").rdd.repartition(n_partitions)

-    # put like 10 subredis in each partition

-    # step 2 make it into a distributed.RowMatrix
-    coordMat = CoordinateMatrix(entries)

-    coordMat = CoordinateMatrix(coordMat.entries.repartition(n_partitions))

-    # this needs to be an IndexedRowMatrix()
-    mat = coordMat.toRowMatrix()

-    #goal: build a matrix of subreddit columns and tf-idfs rows
-    sim_dist = mat.columnSimilarities(threshold=similarity_threshold)

-    return (sim_dist, tfidf)

def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
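To make the LSI step concrete, here is a minimal sketch that mirrors lsi_column_similarities above on a toy sparse matrix: fit a TruncatedSVD on the transpose (one row per subreddit), project, then take cosine similarities of the projections. The matrix values are invented for illustration.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import pairwise_distances

# toy term-by-subreddit tf-idf matrix (5 terms x 4 subreddits)
tfidfmat = csr_matrix(np.array([[1., 0., 0., 1.],
                                [2., 1., 0., 0.],
                                [0., 3., 1., 0.],
                                [0., 0., 2., 2.],
                                [1., 1., 1., 1.]]))

svd = TruncatedSVD(n_components=2, random_state=1968, algorithm='randomized', n_iter=10)
lsimat = svd.fit_transform(tfidfmat.T)            # rows are now subreddits in latent space
sims = 1 - pairwise_distances(lsimat, metric='cosine')
print(np.round(sims, 2))                          # subreddit-by-subreddit similarity matrix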
@ -306,20 +291,20 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
    idf = idf.withColumn('idf',f.log(idf.subreddits_in_week) / (1+f.col('count'))+1)

    # collect the dictionary to make a pydict of terms to indexes
-    terms = idf.select([term,'week']).distinct() # terms are distinct
+    terms = idf.select([term]).distinct() # terms are distinct
-    terms = terms.withColumn(term_id,f.row_number().over(Window.partitionBy('week').orderBy(term))) # term ids are distinct
+    terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct

    # make subreddit ids
-    subreddits = df.select(['subreddit','week']).distinct()
+    subreddits = df.select(['subreddit']).distinct()
-    subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.partitionBy("week").orderBy("subreddit")))
+    subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))

-    df = df.join(subreddits,on=['subreddit','week'])
+    df = df.join(subreddits,on=['subreddit'])

    # map terms to indexes in the tfs and the idfs
-    df = df.join(terms,on=[term,'week']) # subreddit-term-id is unique
+    df = df.join(terms,on=[term]) # subreddit-term-id is unique
-    idf = idf.join(terms,on=[term,'week'])
+    idf = idf.join(terms,on=[term])

    # join on subreddit/term to create tf/dfs indexed by term
    df = df.join(idf, on=[term_id, term,'week'])
@ -331,9 +316,11 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
    else: # tf_fam = tf_weight.Norm05
        df = df.withColumn("tf_idf", (0.5 + 0.5 * df.relative_tf) * df.idf)

-    return df
+    df = df.repartition('week')
+    dfwriter = df.write.partitionBy("week")
+    return dfwriter

-def _calc_tfidf(df, term_colname, tf_family):
+def _calc_tfidf(df, term_colname, tf_family, min_df=None, max_df=None):
    term = term_colname
    term_id = term + '_id'
@ -342,7 +329,7 @@ def _calc_tfidf(df, term_colname, tf_family):
    df = df.join(max_subreddit_terms, on='subreddit')

-    df = df.withColumn("relative_tf", df.tf / df.sr_max_tf)
+    df = df.withColumn("relative_tf", (df.tf / df.sr_max_tf))

    # group by term. term is unique
    idf = df.groupby([term]).count()
@ -351,7 +338,13 @@
    idf = idf.withColumn('idf',f.log(N_docs/(1+f.col('count')))+1)

    # collect the dictionary to make a pydict of terms to indexes
-    terms = idf.select(term).distinct() # terms are distinct
+    terms = idf
+    if min_df is not None:
+        terms = terms.filter(f.col('count')>=min_df)
+    if max_df is not None:
+        terms = terms.filter(f.col('count')<=max_df)

+    terms = terms.select(term).distinct() # terms are distinct
    terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct

    # make subreddit ids
@ -361,12 +354,12 @@
    df = df.join(subreddits,on='subreddit')

    # map terms to indexes in the tfs and the idfs
-    df = df.join(terms,on=term) # subreddit-term-id is unique
+    df = df.join(terms,on=term,how='inner') # subreddit-term-id is unique
-    idf = idf.join(terms,on=term)
+    idf = idf.join(terms,on=term,how='inner')

    # join on subreddit/term to create tf/dfs indexed by term
-    df = df.join(idf, on=[term_id, term])
+    df = df.join(idf, on=[term_id, term],how='inner')

    # agg terms by subreddit to make sparse tf/df vectors
    if tf_family == tf_weight.MaxTF:
@ -377,18 +370,36 @@
    return df

-def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
+def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05, min_df=None, max_df=None):
    term = term_colname
    term_id = term + '_id'
    # aggregate counts by week. now subreddit-term is distinct
    df = df.filter(df.subreddit.isin(include_subs))
    df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf'))

-    df = _calc_tfidf(df, term_colname, tf_family)
+    df = _calc_tfidf(df, term_colname, tf_family, min_df, max_df)
+    df = df.repartition('subreddit')
+    dfwriter = df.write
+    return dfwriter

-    return df
+def select_topN_subreddits(topN, path="../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
-def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
    rankdf = pd.read_csv(path)
    included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
    return included_subreddits

+def repartition_tfidf(inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet",
+                      outpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k_repartitioned.parquet"):
+    spark = SparkSession.builder.getOrCreate()
+    df = spark.read.parquet(inpath)
+    df = df.repartition(400,'subreddit')
+    df.write.parquet(outpath,mode='overwrite')

+def repartition_tfidf_weekly(inpath="/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet",
+                             outpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_repartitioned.parquet"):
+    spark = SparkSession.builder.getOrCreate()
+    df = spark.read.parquet(inpath)
+    df = df.repartition(400,'subreddit','week')
+    dfwriter = df.write.partitionBy("week")
+    dfwriter.parquet(outpath,mode='overwrite')
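A minimal sketch of what select_topN_subreddits does with the ranking CSV; the frame below is a toy stand-in for subreddits_by_num_comments_nonsfw.csv and uses invented rows.

import pandas as pd

rankdf = pd.DataFrame({'subreddit': ['askreddit', 'news', 'tinysub'],
                       'comments_rank': [1, 2, 90000]})

topN = 10000
included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN, 'subreddit'].values)
print(included_subreddits)   # contains 'askreddit' and 'news', not 'tinysub'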
@ -2,35 +2,45 @@
import fire
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
-from similarities_helper import build_tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits
+from similarities_helper import tfidf_dataset, build_weekly_tfidf_dataset, select_topN_subreddits
+from functools import partial

-def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits):
+def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=None, min_df=None, max_df=None):
    spark = SparkSession.builder.config(map={'spark.executor.memory':'900g','spark.executor.cores':128}).getOrCreate()
    df = spark.read.parquet(inpath)

    df = df.filter(~ f.col(term_colname).isin(exclude))

    if included_subreddits is not None:
-        include_subs = set(map(str.strip,map(str.lower, open(included_subreddits))))
+        include_subs = set(map(str.strip,open(included_subreddits)))
    else:
        include_subs = select_topN_subreddits(topN)

-    df = func(df, include_subs, term_colname)
+    include_subs = spark.sparkContext.broadcast(include_subs)

-    df.write.parquet(outpath,mode='overwrite',compression='snappy')
+    # term_id = term_colname + "_id"

+    if included_terms is not None:
+        terms_df = spark.read.parquet(included_terms)
+        terms_df = terms_df.select(term_colname).distinct()
+        df = df.join(terms_df, on=term_colname, how='left_semi')

+    dfwriter = func(df, include_subs.value, term_colname)

+    dfwriter.parquet(outpath,mode='overwrite',compression='snappy')
    spark.stop()

-def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits):
+def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits, min_df, max_df):
-    return _tfidf_wrapper(build_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits)
+    tfidf_func = partial(tfidf_dataset, max_df=max_df, min_df=min_df)
+    return _tfidf_wrapper(tfidf_func, inpath, outpath, topN, term_colname, exclude, included_subreddits)

+def tfidf_weekly(inpath, outpath, static_tfidf_path, topN, term_colname, exclude, included_subreddits):
+    return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits, included_terms=static_tfidf_path)

-def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddits):
-    return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits)

def tfidf_post_comment_authors(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/post_authors.parquet',
                               topN=25000,
                               included_subreddits=None):

    return tfidf("/gscratch/comdata/output/reddit_ngrams/post_comment_authors.parquet",
                 outpath,
                 topN,
@ -41,49 +51,64 @@ def tfidf_post_comment_authors(outpath='/gscratch/comdata/output/reddit_similari

def tfidf_authors(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                  outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
-                  topN=25000,
+                  topN=None,
-                  included_subreddits=None):
+                  included_subreddits=None,
+                  min_df=None,
+                  max_df=None):

    return tfidf(inpath,
                 outpath,
                 topN,
                 'author',
                 ['[deleted]','AutoModerator'],
-                 included_subreddits=included_subreddits
+                 included_subreddits=included_subreddits,
+                 min_df=min_df,
+                 max_df=max_df
                 )

-def tfidf_terms(outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
+def tfidf_terms(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
-                topN=25000,
+                outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
-                included_subreddits=None):
+                topN=None,
+                included_subreddits=None,
+                min_df=None,
+                max_df=None):

-    return tfidf("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
+    return tfidf(inpath,
                 outpath,
                 topN,
                 'term',
                 [],
-                 included_subreddits=included_subreddits
+                 included_subreddits=included_subreddits,
+                 min_df=min_df,
+                 max_df=max_df
                 )

def tfidf_authors_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
+                         static_tfidf_path="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet",
                         outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
-                         topN=25000,
+                         topN=None,
                         included_subreddits=None):

    return tfidf_weekly(inpath,
                        outpath,
+                        static_tfidf_path,
                        topN,
                        'author',
                        ['[deleted]','AutoModerator'],
                        included_subreddits=included_subreddits
                        )

-def tfidf_terms_weekly(outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
+def tfidf_terms_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
-                       topN=25000,
+                       static_tfidf_path="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet",
+                       outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
+                       topN=None,
                       included_subreddits=None):

-    return tfidf_weekly("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
+    return tfidf_weekly(inpath,
                        outpath,
+                        static_tfidf_path,
                        topN,
                        'term',
                        [],
@ -1,16 +1,20 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from datetime import datetime
from pathlib import Path

spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext.getConf()

submissions = spark.read.parquet("../../data/reddit_submissions_by_subreddit.parquet")

submissions = submissions.filter(f.col("CreatedAt") <= datetime(2020,4,13))

prop_nsfw = submissions.select(['subreddit','over_18']).groupby('subreddit').agg(f.mean(f.col('over_18').astype('double')).alias('prop_nsfw'))

df = spark.read.parquet("../../data/reddit_comments_by_subreddit.parquet")
df = df.filter(f.col("CreatedAt") <= datetime(2020,4,13))
# remove /u/ pages
df = df.filter(~df.subreddit.like("u_%"))
@ -26,4 +30,6 @@ df = df.toPandas()
df = df.sort_values("n_comments")

outpath = Path("../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv")
outpath.parent.mkdir(exist_ok=True, parents=True)
df.to_csv(str(outpath), index=False)
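The hunks above only show the head and tail of this script; the aggregation in the middle is not part of the diff. Purely as an illustration (column names, the NSFW threshold, and the toy data below are assumptions, not taken from the repository), the kind of step it performs, counting comments per subreddit, joining prop_nsfw, and keeping non-NSFW subreddits, could look like this self-contained sketch:

# hypothetical sketch, not from this commit
from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.getOrCreate()

comments = spark.createDataFrame(
    [("askreddit", "c1"), ("askreddit", "c2"), ("gonewild", "c3")],
    ["subreddit", "id"])
prop_nsfw_toy = spark.createDataFrame(
    [("askreddit", 0.0), ("gonewild", 1.0)],
    ["subreddit", "prop_nsfw"])

n_comments = comments.groupby("subreddit").agg(f.count("id").alias("n_comments"))
ranked = (n_comments.join(prop_nsfw_toy, on="subreddit", how="inner")
          .filter(f.col("prop_nsfw") < 0.5)      # threshold is an assumption
          .orderBy(f.desc("n_comments")))
ranked.show()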
@ -1,18 +0,0 @@
from similarities_helper import similarities
import numpy as np
import fire

def wang_similarity(mat):
    non_zeros = (mat != 0).astype(np.float32)
    intersection = non_zeros.T @ non_zeros
    return intersection

infile="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet"; outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather"; min_df=1; included_subreddits=None; topN=10000; exclude_phrases=False; from_date=None; to_date=None

def wang_overlaps(infile, outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather", min_df=1, max_df=None, included_subreddits=None, topN=10000, exclude_phrases=False, from_date=None, to_date=None):

    return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date)

if __name__ == "__main__":
    fire.Fire(wang_overlaps)
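For reference, a self-contained numpy sketch of what wang_similarity computes: treating the input as a term-by-subreddit (or author-by-subreddit) matrix, an orientation assumed here since the matrix construction lives in similarities_helper and is not shown in this diff, it counts, for every pair of columns, how many rows are nonzero in both.

import numpy as np

# toy term-by-subreddit count matrix: 4 terms x 3 subreddits
mat = np.array([[2, 0, 1],
                [0, 3, 1],
                [5, 1, 0],
                [0, 0, 4]], dtype=np.float32)

non_zeros = (mat != 0).astype(np.float32)
overlap = non_zeros.T @ non_zeros  # 3 x 3: shared-nonzero-row counts for each pair of columns
# the diagonal holds the number of distinct nonzero rows per column
print(overlap)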
@ -1,81 +0,0 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
import numpy as np
import pyarrow
import pandas as pd
import fire
from itertools import islice
from pathlib import Path
from similarities_helper import *
from multiprocessing import Pool, cpu_count

def _week_similarities(tempdir, term_colname, week):
    print(f"loading matrix: {week}")
    mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week)
    print('computing similarities')
    sims = column_similarities(mat)
    del mat

    names = subreddit_names.loc[subreddit_names.week == week]
    sims = pd.DataFrame(sims.todense())

    sims = sims.rename({i: sr for i, sr in enumerate(names.subreddit.values)}, axis=1)
    sims['_subreddit'] = names.subreddit.values

    write_weekly_similarities(outfile, sims, week, names)

#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet')
def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, included_subreddits = None, topN = 500):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    tfidf = spark.read.parquet(tfidf_path)

    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(open(included_subreddits))

    print(f"computing weekly similarities for {len(included_subreddits)} subreddits")

    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, max_df=None, included_subreddits=included_subreddits)

    tfidf = spark.read.parquet(tempdir.name)

    # the ids can change each week.
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new','week']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()

    weeks = sorted(list(subreddit_names.week.drop_duplicates()))
    # do this step in parallel if we have the memory for it.
    # should be doable with pool.map

    def week_similarities_helper(week):
        _week_similarities(tempdir, term_colname, week)

    with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine?
        list(pool.map(week_similarities_helper, weeks))

def author_cosine_similarities_weekly(outfile, min_df=2, included_subreddits=None, topN=500):
    return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
                                      outfile,
                                      'author',
                                      min_df,
                                      included_subreddits,
                                      topN)

def term_cosine_similarities_weekly(outfile, min_df=None, included_subreddits=None, topN=500):
    return cosine_similarities_weekly('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
                                      outfile,
                                      'term',
                                      min_df,
                                      included_subreddits,
                                      topN)

if __name__ == "__main__":
    fire.Fire({'authors':author_cosine_similarities_weekly,
               'terms':term_cosine_similarities_weekly})
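column_similarities is imported from similarities_helper, which this diff does not touch. As a rough, self-contained sketch of the kind of computation it presumably performs (column-wise cosine similarity over a sparse tf-idf matrix, with rows as terms and columns as subreddits; both the helper's behavior and the orientation are assumptions here):

import numpy as np
from scipy.sparse import csr_matrix

# toy tf-idf matrix: 3 terms x 3 subreddits
mat = csr_matrix(np.array([[0.0, 1.2, 0.0],
                           [2.3, 0.0, 0.7],
                           [0.5, 0.5, 0.0]]))

# scale each column to unit length, then cosine similarity is a plain dot product
col_norms = np.sqrt(np.asarray(mat.multiply(mat).sum(axis=0))).ravel()
normalized = mat.multiply(1.0 / col_norms.reshape(1, -1)).tocsr()
sims = (normalized.T @ normalized).toarray()  # subreddit x subreddit cosine similarities
print(np.round(sims, 3))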
21
start_spark_and_run.sh
Executable file
@ -0,0 +1,21 @@
#!/usr/bin/env bash

# Script to start a spark cluster and run a script on klone
source $SPARK_CONF_DIR/spark-env.sh
echo "#!/usr/bin/bash" > job_script.sh
echo "source ~/.bashrc" >> job_script.sh
echo "export PYSPARK_PYTHON=python3" >> job_script.sh
echo "export JAVA_HOME=/gscratch/comdata/local/open-jdk" >> job_script.sh
echo "export SPARK_CONF_DIR=/gscratch/comdata/local/spark_config" >> job_script.sh
echo "echo \$(hostname)" >> job_script.sh
echo "source $SPARK_CONF_DIR/spark-env.sh" >> job_script.sh
echo "start_spark_cluster.sh" >> job_script.sh
echo "spark-submit --verbose --master spark://\$(hostname):$SPARK_MASTER_PORT $2 ${@:3}" >> job_script.sh
echo "stop-all.sh" >> job_script.sh
#echo "singularity instance stop --all" >> job_script.sh
chmod +x job_script.sh

let "cpus = $1 * 40"
salloc -p compute-bigmem -A comdata --nodes=$1 --time=48:00:00 -c 40 --mem=362G --exclusive srun -n1 job_script.sh
26
start_spark_cluster.sh
Executable file
@ -0,0 +1,26 @@
#!/usr/bin/env bash
nodes="$(scontrol show hostnames)"

export SPARK_MASTER_HOST=$(hostname)
echo $SPARK_MASTER_HOST
# singularity instance stop spark-boss
# rm -r $HOME/.singularity/instances/sing/$(hostname)/nathante/spark-boss

# for node in $nodes
# do
#     echo $node
#     ssh $node "singularity instance stop --all -F"
# done

# singularity instance start /gscratch/comdata/users/nathante/cdsc_base.sif spark-boss
#apptainer exec /gscratch/comdata/users/nathante/containers/nathante.sif
start-master.sh
for node in $nodes
do
    # if [ "$node" != "$SPARK_BOSS" ]
    # then
    echo $node
    ssh -t $node start_spark_worker.sh $SPARK_MASTER_HOST
    # fi
done
18
start_spark_worker.sh
Executable file
@ -0,0 +1,18 @@
#!/usr/bin/env bash
# runs on worker node
# instance_name=spark-worker-$(hostname)
# echo $hostname
# instance_url="instance://$instance_name"
# singularity instance list
# singularity instance stop -F "$instance_name"
# singularity instance list
# sleep 5
# ls $HOME/.singularity/instances/sing/$(hostname)/nathante/$instance_name
# rm -r $HOME/.singularity/instances/sing/$(hostname)/nathante/$instance_name
# singularity instance start /gscratch/comdata/users/nathante/cdsc_base.sif $instance_name
source /gscratch/comdata/env/cdsc_klone_bashrc
source $SPARK_CONF_DIR/spark-env.sh
echo $(which python3)
echo $PYSPARK_PYTHON
echo "start-worker.sh spark://$1:$SPARK_MASTER_PORT"
start-worker.sh spark://$1:$SPARK_MASTER_PORT
@ -1,96 +0,0 @@
from pyarrow import dataset as ds
import numpy as np
import pandas as pd
import plotnine as pn
random = np.random.RandomState(1968)

def load_densities(term_density_file="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
                   author_density_file="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather"):

    term_density = pd.read_feather(term_density_file)
    author_density = pd.read_feather(author_density_file)

    term_density.rename({'overlap_density':'term_density','index':'subreddit'},axis='columns',inplace=True)
    author_density.rename({'overlap_density':'author_density','index':'subreddit'},axis='columns',inplace=True)

    density = term_density.merge(author_density,on='subreddit',how='inner')

    return density

def load_clusters(term_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
                  author_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather"):
    term_clusters = pd.read_feather(term_clusters_file)
    author_clusters = pd.read_feather(author_clusters_file)

    # rename, join and return
    term_clusters.rename({'cluster':'term_cluster'},axis='columns',inplace=True)
    author_clusters.rename({'cluster':'author_cluster'},axis='columns',inplace=True)

    clusters = term_clusters.merge(author_clusters,on='subreddit',how='inner')

    return clusters

if __name__ == '__main__':

    df = load_densities()
    cl = load_clusters()

    df['td_rank'] = df.term_density.rank()
    df['ad_rank'] = df.author_density.rank()

    df['td_percentile'] = df.td_rank / df.shape[0]
    df['ad_percentile'] = df.ad_rank / df.shape[0]

    df = df.merge(cl, on='subreddit',how='inner')

    term_cluster_density = df.groupby('term_cluster').agg({'td_rank':['mean','min','max'],
                                                           'ad_rank':['mean','min','max'],
                                                           'td_percentile':['mean','min','max'],
                                                           'ad_percentile':['mean','min','max'],
                                                           'subreddit':['count']})

    author_cluster_density = df.groupby('author_cluster').agg({'td_rank':['mean','min','max'],
                                                               'ad_rank':['mean','min','max'],
                                                               'td_percentile':['mean','min','max'],
                                                               'ad_percentile':['mean','min','max'],
                                                               'subreddit':['count']})

    # which clusters have the most term_density?
    term_cluster_density.iloc[term_cluster_density.td_rank['mean'].sort_values().index]

    # which clusters have the most author_density?
    term_cluster_density.iloc[term_cluster_density.ad_rank['mean'].sort_values(ascending=False).index].loc[term_cluster_density.subreddit['count'] >= 5][0:20]

    high_density_term_clusters = term_cluster_density.loc[(term_cluster_density.td_percentile['mean'] > 0.75) & (term_cluster_density.subreddit['count'] > 5)]

    # let's just use term density instead of author density for now. We can do a second batch with author density next.
    chosen_clusters = high_density_term_clusters.sample(3,random_state=random)

    cluster_info = df.loc[df.term_cluster.isin(chosen_clusters.index.values)]

    chosen_subreddits = cluster_info.subreddit.values

    dataset = ds.dataset("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",format='parquet')
    comments = dataset.to_table(filter=ds.field("subreddit").isin(chosen_subreddits),columns=['id','subreddit','author','CreatedAt'])

    comments = comments.to_pandas()

    comments['week'] = comments.CreatedAt.dt.date - pd.to_timedelta(comments['CreatedAt'].dt.dayofweek, unit='d')

    author_timeseries = comments.loc[:,['subreddit','author','week']].drop_duplicates().groupby(['subreddit','week']).count().reset_index()

    for clid in chosen_clusters.index.values:

        ts = pd.read_feather(f"data/ts_term_cluster_{clid}.feather")

        pn.options.figure_size = (11.7,8.27)
        p = pn.ggplot(ts)
        p = p + pn.geom_line(pn.aes('week','value',group='subreddit'))
        p = p + pn.facet_wrap('~ subreddit')
        p.save(f"plots/ts_term_cluster_{clid}.png")

    fig, ax = pyplot.subplots(figsize=(11.7,8.27))
    g = sns.FacetGrid(ts,row='subreddit')
    g.map_dataframe(sns.scatterplot,'week','value',data=ts,ax=ax)
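A note on the indexing pattern above: groupby(...).agg({col: [list of functions]}) returns a frame with a two-level column index, which is why the script writes term_cluster_density.td_rank['mean'] and term_cluster_density.subreddit['count']. A minimal, self-contained pandas illustration (toy values, not project data):

import pandas as pd

df = pd.DataFrame({'term_cluster': [0, 0, 1, 1],
                   'td_rank': [1.0, 2.0, 3.0, 4.0],
                   'subreddit': ['a', 'b', 'c', 'd']})

agg = df.groupby('term_cluster').agg({'td_rank': ['mean', 'min', 'max'],
                                      'subreddit': ['count']})

print(agg.td_rank['mean'])                    # per-cluster mean rank (a Series)
print(agg.loc[agg.subreddit['count'] >= 2])   # filter clusters by size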
@ -1,37 +0,0 @@
import pandas as pd
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from choose_clusters import load_clusters, load_densities
import fire
from pathlib import Path

def main(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
         author_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather",
         term_densities_path="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
         author_densities_path="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather",
         output="data/subreddit_timeseries.parquet"):

    clusters = load_clusters(term_clusters_path, author_clusters_path)
    densities = load_densities(term_densities_path, author_densities_path)

    spark = SparkSession.builder.getOrCreate()

    df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")

    df = df.withColumn('week', f.date_trunc('week', f.col("CreatedAt")))

    # time of unique authors by series by week
    ts = df.select(['subreddit','week','author']).distinct().groupby(['subreddit','week']).count()

    ts = ts.repartition('subreddit')
    spk_clusters = spark.createDataFrame(clusters)

    ts = ts.join(spk_clusters, on='subreddit', how='inner')
    spk_densities = spark.createDataFrame(densities)
    ts = ts.join(spk_densities, on='subreddit', how='inner')
    ts.write.parquet(output, mode='overwrite')

if __name__ == "__main__":
    fire.Fire(main)
@ -1 +0,0 @@
/annex/objects/SHA256E-s60874--d536adb0ec637fca262c4e1ec908dd8b4a5d1464047b583cd1a99cc6dba87191
@ -1,11 +0,0 @@
all: subreddit_author_tf_similarities_10000.html #comment_authors_10000.html

# wang_tsne_10000.html
# wang_tsne_10000.html:/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather tsne_vis.py
#	python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather --output=wang_tsne_10000.html

# comment_authors_10000.html:/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather tsne_vis.py
#	python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather --output=comment_authors_10000.html

subreddit_author_tf_similarities_10000.html:/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather tsne_vis.py
	start_spark_and_run.sh 1 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather --output=subreddit_author_tf_similarities_10000.html
@ -1 +0,0 @@
../../.git/annex/objects/Qk/wG/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784
@ -1 +0,0 @@
../../.git/annex/objects/w7/2f/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e
@ -1 +0,0 @@
../../.git/annex/objects/WX/v3/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543
@ -1 +0,0 @@
../../.git/annex/objects/mq/2z/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,175 +0,0 @@
import pyarrow
import altair as alt
alt.data_transformers.disable_max_rows()
alt.data_transformers.enable('default')
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from numpy import random
import fire
import numpy as np

def base_plot(plot_data):

    # base = base.encode(alt.Color(field='color',type='nominal',scale=alt.Scale(scheme='category10')))

    cluster_dropdown = alt.binding_select(options=[str(c) for c in sorted(set(plot_data.cluster))])

    # subreddit_dropdown = alt.binding_select(options=sorted(plot_data.subreddit))

    cluster_click_select = alt.selection_single(on='click',fields=['cluster'], bind=cluster_dropdown, name=' ')
    # cluster_select = alt.selection_single(fields=['cluster'], bind=cluster_dropdown, name='cluster')
    # cluster_select_and = cluster_click_select & cluster_select
    #
    # subreddit_select = alt.selection_single(on='click',fields=['subreddit'],bind=subreddit_dropdown,name='subreddit_click')

    color = alt.condition(cluster_click_select,
                          alt.Color(field='color',type='nominal',scale=alt.Scale(scheme='category10')),
                          alt.value("lightgray"))

    base = alt.Chart(plot_data).mark_text().encode(
        alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=(-65,65))),
        alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=(-65,65))),
        color=color,
        text='subreddit')

    base = base.add_selection(cluster_click_select)

    return base

def zoom_plot(plot_data):
    chart = base_plot(plot_data)

    chart = chart.interactive()
    chart = chart.properties(width=1275,height=800)

    return chart

def viewport_plot(plot_data):
    selector1 = alt.selection_interval(encodings=['x','y'],init={'x':(-65,65),'y':(-65,65)})
    selectorx2 = alt.selection_interval(encodings=['x'],init={'x':(30,40)})
    selectory2 = alt.selection_interval(encodings=['y'],init={'y':(-20,0)})

    base = base_plot(plot_data)

    viewport = base.mark_point(fillOpacity=0.2,opacity=0.2).encode(
        alt.X('x',axis=alt.Axis(grid=False)),
        alt.Y('y',axis=alt.Axis(grid=False)),
    )

    viewport = viewport.properties(width=600,height=400)

    viewport1 = viewport.add_selection(selector1)

    viewport2 = viewport.encode(
        alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selector1)),
        alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selector1))
    )

    viewport2 = viewport2.add_selection(selectorx2)
    viewport2 = viewport2.add_selection(selectory2)

    sr = base.encode(alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selectorx2)),
                     alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selectory2))
                     )

    sr = sr.properties(width=1275,height=600)

    chart = (viewport1 | viewport2) & sr

    return chart

def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4):
    tsne_data = tsne_data.merge(clusters,on='subreddit')

    centroids = tsne_data.groupby('cluster').agg({'x':np.mean,'y':np.mean})

    color_ids = np.arange(n_colors)

    distances = np.empty(shape=(centroids.shape[0],centroids.shape[0]))

    groups = tsne_data.groupby('cluster')

    points = np.array(tsne_data.loc[:,['x','y']])
    centers = np.array(centroids.loc[:,['x','y']])

    # point x centroid
    point_center_distances = np.linalg.norm((points[:,None,:] - centers[None,:,:]),axis=-1)

    # distances is cluster x point
    for gid, group in groups:
        c_dists = point_center_distances[group.index.values,:].min(axis=0)
        distances[group.cluster.values[0],] = c_dists

    # nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(centroids)
    # distances, indices = nbrs.kneighbors()

    nearest = distances.argpartition(n_neighbors,0)
    indices = nearest[:n_neighbors,:].T
    # neighbor_distances = np.copy(distances)
    # neighbor_distances.sort(0)
    # neighbor_distances = neighbor_distances[0:n_neighbors,:]

    # nbrs = NearestNeighbors(n_neighbors=n_neighbors,metric='precomputed').fit(distances)
    # distances, indices = nbrs.kneighbors()

    color_assignments = np.repeat(-1,len(centroids))

    for i in range(len(centroids)):
        knn = indices[i]
        knn_colors = color_assignments[knn]
        available_colors = color_ids[list(set(color_ids) - set(knn_colors))]

        if(len(available_colors) > 0):
            color_assignments[i] = available_colors[0]
        else:
            raise Exception("Can't color this many neighbors with this many colors")

    centroids = centroids.reset_index()
    colors = centroids.loc[:,['cluster']]
    colors['color'] = color_assignments

    tsne_data = tsne_data.merge(colors,on='cluster')
    return(tsne_data)
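The point-to-centroid distance step in assign_cluster_colors relies on numpy broadcasting; a self-contained toy illustration of just that step (the coordinates below are made up, not project data):

import numpy as np

points = np.array([[0.0, 0.0],
                   [1.0, 0.0],
                   [0.0, 1.0],
                   [5.0, 5.0],
                   [6.0, 5.0]])
centers = np.array([[0.5, 0.5],
                    [5.5, 5.0]])

# (n_points, 1, 2) - (1, n_centroids, 2) broadcasts to (n_points, n_centroids, 2);
# the norm over the last axis gives every point-to-centroid distance at once.
point_center_distances = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=-1)
print(point_center_distances.shape)  # (5, 2)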
def build_visualization(tsne_data, clusters, output):

    # tsne_data = "/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather"
    # clusters = "/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather"

    tsne_data = pd.read_feather(tsne_data)
    clusters = pd.read_feather(clusters)

    tsne_data = assign_cluster_colors(tsne_data,clusters,10,8)

    # sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index()
    # sr_per_cluster = sr_per_cluster.rename(columns={'subreddit':'cluster_size'})

    tsne_data = tsne_data.merge(sr_per_cluster,on='cluster')

    term_zoom_plot = zoom_plot(tsne_data)

    term_zoom_plot.save(output)

    term_viewport_plot = viewport_plot(tsne_data)

    term_viewport_plot.save(output.replace(".html","_viewport.html"))

if __name__ == "__main__":
    fire.Fire(build_visualization)

# commenter_data = pd.read_feather("tsne_author_fit.feather")
# clusters = pd.read_feather('author_3000_clusters.feather')
# commenter_data = assign_cluster_colors(commenter_data,clusters,10,8)
# commenter_zoom_plot = zoom_plot(commenter_data)
# commenter_viewport_plot = viewport_plot(commenter_data)
# commenter_zoom_plot.save("subreddit_commenters_tsne_3000.html")
# commenter_viewport_plot.save("subreddit_commenters_tsne_3000_viewport.html")

# chart = chart.properties(width=10000,height=10000)
# chart.save("test_tsne_whole.svg")