
changes for archiving.

This commit is contained in:
Nathan TeBlunthuis 2023-05-23 17:18:19 -07:00
parent 811a0d87c4
commit 07b0dff9bc
47 changed files with 396 additions and 1549 deletions

View File

@ -1,74 +0,0 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import FloatType
import zlib
def zlib_entropy_rate(s):
sb = s.encode()
if len(sb) == 0:
return None
else:
return len(zlib.compress(sb,level=6))/len(sb)
zlib_entropy_rate_udf = f.udf(zlib_entropy_rate,FloatType())
spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_author.parquet",compression='snappy')
df = df.withColumn("saidbot",f.lower(f.col("body")).like("%bot%"))
# df = df.filter(df.subreddit=='seattle')
# df = df.cache()
botreplies = df.filter(f.lower(df.body).rlike(".*(good|bad) bot.*"))
botreplies = botreplies.select([f.col("parent_id").substr(4,100).alias("bot_comment_id"),f.lower(f.col("body")).alias("good_bad_bot"),f.col("link_id").alias("gbbb_link_id")])
botreplies = botreplies.groupby(['bot_comment_id']).agg(f.count('good_bad_bot').alias("N_goodbad_votes"),
f.sum((f.lower(f.col('good_bad_bot')).like('%good bot%').astype("double"))).alias("n_good_votes"),
f.sum((f.lower(f.col('good_bad_bot')).like('%bad bot%').astype("double"))).alias("n_bad_votes"))
comments_by_author = df.select(['author','id','saidbot']).groupBy('author').agg(f.count('id').alias("N_comments"),
f.mean(f.col('saidbot').astype("double")).alias("prop_saidbot"),
f.sum(f.col('saidbot').astype("double")).alias("n_saidbot"))
# pd_comments_by_author = comments_by_author.toPandas()
# pd_comments_by_author['frac'] = 500 / pd_comments_by_author['N_comments']
# pd_comments_by_author.loc[pd_comments_by_author.frac > 1, 'frac'] = 1
# fractions = pd_comments_by_author.loc[:,['author','frac']]
# fractions = fractions.set_index('author').to_dict()['frac']
# sampled_author_comments = df.sampleBy("author",fractions).groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments'))
df = df.withColumn("randn",f.randn(seed=1968))
win = Window.partitionBy("author").orderBy("randn")
df = df.withColumn("randRank",f.rank().over(win))
sampled_author_comments = df.filter(f.col("randRank") <= 1000)
sampled_author_comments = sampled_author_comments.groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments'))
author_entropy_rates = sampled_author_comments.select(['author',zlib_entropy_rate_udf(f.col('comments')).alias("entropy_rate")])
parents = df.join(botreplies, on=df.id==botreplies.bot_comment_id,how='right_outer')
win1 = Window.partitionBy("author")
parents = parents.withColumn("first_bot_reply",f.min(f.col("CreatedAt")).over(win1))
first_bot_reply = parents.filter(f.col("first_bot_reply")==f.col("CreatedAt"))
first_bot_reply = first_bot_reply.withColumnRenamed("CreatedAt","FB_CreatedAt")
first_bot_reply = first_bot_reply.withColumnRenamed("id","FB_id")
comments_since_first_bot_reply = df.join(first_bot_reply,on = 'author',how='right_outer').filter(f.col("CreatedAt")>=f.col("first_bot_reply"))
comments_since_first_bot_reply = comments_since_first_bot_reply.groupBy("author").agg(f.count("id").alias("N_comments_since_firstbot"))
bots = parents.groupby(['author']).agg(f.sum('N_goodbad_votes').alias("N_goodbad_votes"),
f.sum(f.col('n_good_votes')).alias("n_good_votes"),
f.sum(f.col('n_bad_votes')).alias("n_bad_votes"),
f.count(f.col('author')).alias("N_bot_posts"))
bots = bots.join(comments_by_author,on="author",how='left_outer')
bots = bots.join(comments_since_first_bot_reply,on="author",how='left_outer')
bots = bots.join(author_entropy_rates,on='author',how='left_outer')
bots = bots.orderBy("N_goodbad_votes",ascending=False)
bots = bots.repartition(1)
bots.write.parquet("/gscratch/comdata/output/reddit_good_bad_bot.parquet",mode='overwrite')
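As an aside, a minimal standalone sketch (no Spark required) of the compression-ratio heuristic that zlib_entropy_rate applies above; the sample strings are made up for illustration:
import zlib

def entropy_rate(s, level=6):
    # ratio of compressed to raw length; repetitive text compresses well and scores low
    sb = s.encode()
    return len(zlib.compress(sb, level=level)) / len(sb) if sb else None

print(entropy_rate("good bot " * 200))                        # repetitive, low ratio
print(entropy_rate("I am a human writing one varied note."))  # higher ratio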

View File

@ -1,218 +1,36 @@
#srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40 /bin/bash -c
srun_singularity=srun -p compute-bigmem -A comdata --time=48:00:00 --mem=362G -c 40
similarity_data=../../data/reddit_similarity
similarity_data=/gscratch/comdata/output/reddit_similarity
clustering_data=../../data/reddit_clustering
clustering_data=/gscratch/comdata/output/reddit_clustering
kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
umap_hdbscan_selection_grid=--min_cluster_sizes=[2] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf] --n_neighbors=[5,15,25,50,75,100] --learning_rate=[1] --min_dist=[0,0.1,0.25,0.5,0.75,0.9,0.99] --local_connectivity=[1] --densmap=[True,False] --n_components=[2,5,10,15,25]
hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
authors_10k_output=$(clustering_data)/subreddit_comment_authors_10k
authors_10k_output_lsi=$(clustering_data)/subreddit_comment_authors_10k_LSI
authors_tf_10k_input=$(similarity_data)/subreddit_comment_authors-tf_10k.feather
authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI
authors_tf_10k_output=$(clustering_data)/subreddit_comment_authors-tf_10k
authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI
terms_10k_input=$(similarity_data)/subreddit_comment_terms_10k.feather
all:authors_tf_10k_lsi
terms_10k_input_lsi=$(similarity_data)/subreddit_comment_terms_10k_LSI
terms_10k_output=$(clustering_data)/subreddit_comment_terms_10k
terms_10k_output_lsi=$(clustering_data)/subreddit_comment_terms_10k_LSI
all:terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi
terms_10k:${terms_10k_output}/kmeans/selection_data.csv ${terms_10k_output}/affinity/selection_data.csv ${terms_10k_output}/hdbscan/selection_data.csv
authors_10k:${authors_10k_output}/kmeans/selection_data.csv ${authors_10k_output}/hdbscan/selection_data.csv ${authors_10k_output}/affinity/selection_data.csv
authors_tf_10k:${authors_tf_10k_output}/kmeans/selection_data.csv ${authors_tf_10k_output}/hdbscan/selection_data.csv ${authors_tf_10k_output}/affinity/selection_data.csv
terms_10k_lsi:${terms_10k_output_lsi}/kmeans/selection_data.csv ${terms_10k_output_lsi}/affinity/selection_data.csv ${terms_10k_output_lsi}/hdbscan/selection_data.csv
authors_10k_lsi:${authors_10k_output_lsi}/kmeans/selection_data.csv ${authors_10k_output_lsi}/hdbscan/selection_data.csv ${authors_10k_output_lsi}/affinity/selection_data.csv
authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
${authors_10k_output}/kmeans/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/kmeans --savefile=${authors_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
${terms_10k_output}/kmeans/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/kmeans --savefile=${terms_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
${authors_tf_10k_output}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/kmeans --savefile=${authors_tf_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid)
${authors_10k_output}/affinity/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/affinity --savefile=${authors_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
${terms_10k_output}/affinity/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/affinity --savefile=${terms_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
${authors_tf_10k_output}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/affinity --savefile=${authors_tf_10k_output}/affinity/selection_data.csv $(affinity_selection_grid)
${authors_10k_output}/hdbscan/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/hdbscan --savefile=${authors_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
${terms_10k_output}/hdbscan/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/hdbscan --savefile=${terms_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
${authors_tf_10k_output}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/hdbscan --savefile=${authors_tf_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
## LSI Models
${authors_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/kmeans --savefile=${authors_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
${terms_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/kmeans --savefile=${terms_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py
$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
$(srun_singularity) -c "source ~/.bashrc; python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)"
${authors_10k_output_lsi}/affinity/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/affinity --savefile=${authors_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
${terms_10k_output_lsi}/affinity/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/affinity --savefile=${terms_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py
$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
$(srun_singularity) -c "source ~/.bashrc; python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)"
${authors_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/hdbscan --savefile=${authors_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/hdbscan --savefile=${terms_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
$(srun_singularity) -c "source ~/.bashrc; python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)"
${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv:umap_hdbscan_clustering_lsi.py
$(srun_singularity) python3 umap_hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/umap_hdbscan --savefile=${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv $(umap_hdbscan_selection_grid)
${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
$(srun_singularity) -c "source ~/.bashrc; python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2"
${authors_tf_10k_output_lsi}/best_umap_hdbscan_2.feather:${authors_tf_10k_output_lsi}/umap_hdbscan/selection_data.csv pick_best_clustering.py
${authors_tf_10k_input_lsi}:
$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
$(MAKE) -C ../similarities
best_umap_hdbscan.feather:${authors_tf_10k_output_lsi}/best_umap_hdbscan_2.feather
clean:
# {'lsi_dimensions': 700, 'outpath': '/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/umap_hdbscan', 'silhouette_score': 0.27616957, 'name': 'mcs-2_ms-5_cse-0.05_csm-leaf_nn-15_lr-1.0_md-0.1_lc-1_lsi-700', 'n_clusters': 547, 'n_isolates': 2093, 'silhouette_samples': '/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/umap_hdbscan/silhouette_samples-mcs-2_ms-5_cse-0.05_csm-leaf_nn-15_lr-1.0_md-0.1_lc-1_lsi-700.feather', 'min_cluster_size': 2, 'min_samples': 5, 'cluster_selection_epsilon': 0.05, 'cluster_selection_method': 'leaf', 'n_neighbors': 15, 'learning_rate': 1.0, 'min_dist': 0.1, 'local_connectivity': 1, 'n_isolates_str': '2093', 'n_isolates_0': False}
best_umap_grid=--min_cluster_sizes=[2] --min_samples=[5] --cluster_selection_epsilons=[0.05] --cluster_selection_methods=[leaf] --n_neighbors=[15] --learning_rate=[1] --min_dist=[0.1] --local_connectivity=[1] --save_step1=True
umap_hdbscan_coords:
python3 umap_hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/umap_hdbscan --savefile=/dev/null ${best_umap_grid}
clean_affinity:
rm -f ${authors_10k_output}/affinity/selection_data.csv
rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
rm -f ${terms_10k_output}/affinity/selection_data.csv
clean_kmeans:
rm -f ${authors_10k_output}/kmeans/selection_data.csv
rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
rm -f ${terms_10k_output}/kmeans/selection_data.csv
clean_hdbscan:
rm -f ${authors_10k_output}/hdbscan/selection_data.csv
rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
rm -f ${terms_10k_output}/hdbscan/selection_data.csv
clean_authors:
rm -f ${authors_10k_output}/affinity/selection_data.csv
rm -f ${authors_10k_output}/kmeans/selection_data.csv
rm -f ${authors_10k_output}/hdbscan/selection_data.csv
clean_authors_tf:
rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
clean_terms:
rm -f ${terms_10k_output}/affinity/selection_data.csv
rm -f ${terms_10k_output}/kmeans/selection_data.csv
rm -f ${terms_10k_output}/hdbscan/selection_data.csv
clean_lsi_affinity:
rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
clean_lsi_kmeans:
rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
clean_lsi_hdbscan:
rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
clean_lsi_authors:
rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
clean_lsi_authors_tf:
rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
clean_lsi_terms:
PHONY: clean
rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
clean: clean_affinity clean_kmeans clean_hdbscan
PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k best_umap_hdbscan.feather umap_hdbscan_coords
# $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_terms_30k.feather clustering.py
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
# $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather
# $(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8 && touch $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
# $(clustering_data)/subreddit_comment_authors_100k.feather:clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather
# $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather $(clustering_data)/subreddit_comment_authors_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85
# $(clustering_data)/comment_terms_100k.feather:clustering.py $(similarity_data)/subreddit_comment_terms_100k.feather
# $(srun_singularity) python3 clustering.py $(similarity_data)/comment_terms_10000.feather $(clustering_data)/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5
# $(clustering_data)/subreddit_comment_author-tf_100k.feather:clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.feather
# $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.parquet $(clustering_data)/subreddit_comment_author-tf_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.5 --damping=0.85
# it's pretty difficult to get a result that isn't one huge megacluster. A sign that it's bullcrap
# /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
# ./clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.9 --damping=0.85
# /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
# start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet --output=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather
# /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
# python3 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather --output=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather
# /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
# # $srun_cdsc python3
# start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --output=/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather

View File

@ -1,34 +0,0 @@
import fire
import pyarrow
import pandas as pd
from numpy import random
import numpy as np
from sklearn.manifold import TSNE
similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet"
def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20):
'''
similarities: feather file with a dataframe of similarity scores
learning_rate: parameter controlling how fast the model converges. Too low and you get outliers. Too high and you get a ball.
perplexity: number of neighbors to use. the default of 50 is often good.
'''
df = pd.read_feather(similarities)
n = df.shape[0]
mat = np.array(df.drop(columns=['_subreddit']),dtype=np.float64)
mat[range(n),range(n)] = 1
mat[mat > 1] = 1
dist = 2*np.arccos(mat)/np.pi
tsne_model = TSNE(2,learning_rate=learning_rate,perplexity=perplexity,n_iter=n_iter,metric='precomputed',early_exaggeration=early_exaggeration,n_jobs=-1)
tsne_fit_whole = tsne_model.fit_transform(dist)
plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], '_subreddit':df['_subreddit']})
plot_data.to_feather(output)
if __name__ == "__main__":
fire.Fire(fit_tsne)
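A minimal sketch of the similarity-to-distance transform the script applies before t-SNE, shown on a made-up 2x2 cosine-similarity matrix:
import numpy as np

sim = np.array([[1.0, 0.8], [0.8, 1.0]])  # hypothetical cosine similarities
sim = np.clip(sim, -1.0, 1.0)             # guard against values just above 1
dist = 2 * np.arccos(sim) / np.pi         # angular distance in [0, 1], as above
print(dist)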

View File

@ -1,230 +0,0 @@
from clustering_base import clustering_result, clustering_job, twoway_clustering_job
from hdbscan_clustering import hdbscan_clustering_result
import umap
from grid_sweep import twoway_grid_sweep
from dataclasses import dataclass
import hdbscan
from sklearn.neighbors import NearestNeighbors
import plotnine as pn
import numpy as np
from itertools import product, starmap, chain
import pandas as pd
from multiprocessing import cpu_count
import fire
def test_select_hdbscan_clustering():
# select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
# "test_hdbscan_author30k",
# min_cluster_sizes=[2],
# min_samples=[1,2],
# cluster_selection_epsilons=[0,0.05,0.1,0.15],
# cluster_selection_methods=['eom','leaf'],
# lsi_dimensions='all')
inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI"
outpath = "test_umap_hdbscan_lsi"
min_cluster_sizes=[2,3,4]
min_samples=[1,2,3]
cluster_selection_epsilons=[0,0.1,0.3,0.5]
cluster_selection_methods=[1]
lsi_dimensions='all'
n_neighbors = [5,10,15,25,35,70,100]
learning_rate = [0.1,0.5,1,2]
min_dist = [0.5,1,1.5,2]
local_connectivity = [1,2,3,4,5]
hdbscan_params = {"min_cluster_sizes":min_cluster_sizes, "min_samples":min_samples, "cluster_selection_epsilons":cluster_selection_epsilons, "cluster_selection_methods":cluster_selection_methods}
umap_params = {"n_neighbors":n_neighbors, "learning_rate":learning_rate, "min_dist":min_dist, "local_connectivity":local_connectivity}
gs = umap_hdbscan_grid_sweep(inpath, "all", outpath, hdbscan_params,umap_params)
# gs.run(20)
# gs.save("test_hdbscan/lsi_sweep.csv")
# job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom')
# job1.run()
# print(job1.get_info())
# df = pd.read_csv("test_hdbscan/selection_data.csv")
# test_select_hdbscan_clustering()
# check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
# silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
# c = check_clusters.merge(silscores,on='subreddit')# fire.Fire(select_hdbscan_clustering)
class umap_hdbscan_grid_sweep(twoway_grid_sweep):
def __init__(self,
inpath,
outpath,
umap_params,
hdbscan_params):
super().__init__(umap_hdbscan_job, inpath, outpath, self.namer, umap_params, hdbscan_params)
def namer(self,
min_cluster_size,
min_samples,
cluster_selection_epsilon,
cluster_selection_method,
n_components,
n_neighbors,
learning_rate,
min_dist,
local_connectivity,
densmap
):
return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}_nc-{n_components}_nn-{n_neighbors}_lr-{learning_rate}_md-{min_dist}_lc-{local_connectivity}_dm-{densmap}"
@dataclass
class umap_hdbscan_clustering_result(hdbscan_clustering_result):
n_components:int
n_neighbors:int
learning_rate:float
min_dist:float
local_connectivity:int
densmap:bool
class umap_hdbscan_job(twoway_clustering_job):
def __init__(self, infile, outpath, name,
umap_args = {"n_components":2,"n_neighbors":15, "learning_rate":1, "min_dist":1, "local_connectivity":1,'densmap':False},
hdbscan_args = {"min_cluster_size":2, "min_samples":1, "cluster_selection_epsilon":0, "cluster_selection_method":'eom'},
*args,
**kwargs):
super().__init__(infile,
outpath,
name,
call1=umap_hdbscan_job._umap_embedding,
call2=umap_hdbscan_job._hdbscan_clustering,
args1=umap_args,
args2=hdbscan_args,
*args,
**kwargs
)
self.n_components = umap_args['n_components']
self.n_neighbors = umap_args['n_neighbors']
self.learning_rate = umap_args['learning_rate']
self.min_dist = umap_args['min_dist']
self.local_connectivity = umap_args['local_connectivity']
self.densmap = umap_args['densmap']
self.min_cluster_size = hdbscan_args['min_cluster_size']
self.min_samples = hdbscan_args['min_samples']
self.cluster_selection_epsilon = hdbscan_args['cluster_selection_epsilon']
self.cluster_selection_method = hdbscan_args['cluster_selection_method']
def after_run(self):
coords = self.step1.embedding_
self.cluster_data['x'] = coords[:,0]
self.cluster_data['y'] = coords[:,1]
super().after_run()
def _umap_embedding(mat, **umap_args):
print(f"running umap embedding. umap_args:{umap_args}")
umapmodel = umap.UMAP(metric='precomputed', **umap_args)
umapmodel = umapmodel.fit(mat)
return umapmodel
def _hdbscan_clustering(mat, umapmodel, **hdbscan_args):
print(f"running hdbascan clustering. hdbscan_args:{hdbscan_args}")
umap_coords = umapmodel.transform(mat)
clusterer = hdbscan.HDBSCAN(metric='euclidean',
core_dist_n_jobs=cpu_count(),
**hdbscan_args
)
clustering = clusterer.fit(umap_coords)
return(clustering)
def get_info(self):
result = super().get_info()
self.result = umap_hdbscan_clustering_result(**result.__dict__,
min_cluster_size=self.min_cluster_size,
min_samples=self.min_samples,
cluster_selection_epsilon=self.cluster_selection_epsilon,
cluster_selection_method=self.cluster_selection_method,
n_components = self.n_components,
n_neighbors = self.n_neighbors,
learning_rate = self.learning_rate,
min_dist = self.min_dist,
local_connectivity=self.local_connectivity,
densmap=self.densmap
)
return self.result
def run_umap_hdbscan_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1],
densmap=[False],
min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
"""Run umap + hdbscan clustering once or more with different parameters.
Usage:
umap_hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_neighbors=<csv> --learning_rate=<csv> --min_dist=<csv> --local_connectivity=<csv> --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">
Keyword arguments:
savefile: path to save the metadata and diagnostics
inpath: path to feather data containing a labeled matrix of subreddit similarities.
outpath: path to output fit clusterings.
n_neighbors: umap parameter takes integers greater than 1
learning_rate: umap parameter takes positive real values
min_dist: umap parameter takes positive real values
local_connectivity: umap parameter takes positive integers
min_cluster_sizes: one or more integers indicating the minimum cluster size
min_samples: one or more integers indicating the minimum number of samples used in the algorithm
cluster_selection_epsilons: one or more similarity thresholds for the transition from dbscan to hdbscan
cluster_selection_methods: one or more of "eom" or "leaf"; eom gives larger clusters.
"""
umap_args = {'n_neighbors':list(map(int, n_neighbors)),
'learning_rate':list(map(float,learning_rate)),
'min_dist':list(map(float,min_dist)),
'local_connectivity':list(map(int,local_connectivity)),
'n_components':list(map(int, n_components)),
'densmap':list(map(bool,densmap))
}
hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
'min_samples':list(map(int,min_samples)),
'cluster_selection_epsilon':list(map(float,cluster_selection_epsilons)),
'cluster_selection_method':cluster_selection_methods}
obj = umap_hdbscan_grid_sweep(inpath,
outpath,
umap_args,
hdbscan_args)
obj.run(cores=10)
obj.save(savefile)
def KNN_distances_plot(mat,outname,k=2):
nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
distances, indices = nbrs.kneighbors(mat)
d2 = distances[:,-1]
df = pd.DataFrame({'dist':d2})
df = df.sort_values("dist",ascending=False)
df['idx'] = np.arange(0,d2.shape[0]) + 1
p = pn.qplot(x='idx',y='dist',data=df,geom='line') + pn.scales.scale_y_continuous(minor_breaks = np.arange(0,50)/50,
breaks = np.arange(0,10)/10)
p.save(outname,width=16,height=10)
def make_KNN_plots():
similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
subreddits, mat = read_similarity_mat(similarities)
mat = sim_to_dist(mat)
KNN_distances_plot(mat,k=2,outname='terms_knn_dist2.png')
similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
subreddits, mat = read_similarity_mat(similarities)
mat = sim_to_dist(mat)
KNN_distances_plot(mat,k=2,outname='authors_knn_dist2.png')
similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
subreddits, mat = read_similarity_mat(similarities)
mat = sim_to_dist(mat)
KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
if __name__ == "__main__":
fire.Fire(run_umap_hdbscan_grid_sweep)
# test_select_hdbscan_clustering()
#fire.Fire(select_hdbscan_clustering)
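A minimal sketch of the two-step procedure that umap_hdbscan_job wires together (UMAP on a precomputed distance matrix, then HDBSCAN on the resulting coordinates); the random matrix below is only a stand-in for a real subreddit distance matrix:
import numpy as np
import umap
import hdbscan

rng = np.random.default_rng(0)
sim = rng.random((50, 50))
sim = (sim + sim.T) / 2                  # hypothetical symmetric similarities
dist = 1 - sim
np.fill_diagonal(dist, 0)                # distances of points to themselves are 0

coords = umap.UMAP(metric='precomputed', n_neighbors=15, min_dist=0.1,
                   n_components=2).fit_transform(dist)
labels = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1,
                         cluster_selection_method='eom').fit_predict(coords)
print(labels[:10])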

View File

@ -1,113 +0,0 @@
from umap_hdbscan_clustering import umap_hdbscan_job, umap_hdbscan_grid_sweep, umap_hdbscan_clustering_result
from lsi_base import twoway_lsi_grid_sweep, lsi_mixin, lsi_result_mixin
from grid_sweep import twoway_grid_sweep
import fire
from dataclasses import dataclass
@dataclass
class umap_hdbscan_clustering_result_lsi(umap_hdbscan_clustering_result, lsi_result_mixin):
pass
class umap_hdbscan_lsi_job(umap_hdbscan_job, lsi_mixin):
def __init__(self, infile, outpath, name, umap_args, hdbscan_args, lsi_dims):
super().__init__(
infile,
outpath,
name,
umap_args,
hdbscan_args
)
super().set_lsi_dims(lsi_dims)
def get_info(self):
partial_result = super().get_info()
self.result = umap_hdbscan_clustering_result_lsi(**partial_result.__dict__,
lsi_dimensions=self.lsi_dims)
return self.result
class umap_hdbscan_lsi_grid_sweep(twoway_lsi_grid_sweep):
def __init__(self,
inpath,
lsi_dims,
outpath,
umap_args,
hdbscan_args
):
super().__init__(umap_hdbscan_lsi_job,
_umap_hdbscan_lsi_grid_sweep,
inpath,
lsi_dims,
outpath,
umap_args,
hdbscan_args
)
class _umap_hdbscan_lsi_grid_sweep(twoway_grid_sweep):
def __init__(self,
inpath,
outpath,
lsi_dim,
umap_args,
hdbscan_args,
):
self.lsi_dim = lsi_dim
self.jobtype = umap_hdbscan_lsi_job
super().__init__(self.jobtype, inpath, outpath, self.namer, umap_args, hdbscan_args, lsi_dim)
def namer(self, *args, **kwargs):
s = umap_hdbscan_grid_sweep.namer(self, *args, **kwargs)
s += f"_lsi-{self.lsi_dim}"
return s
def run_umap_hdbscan_lsi_grid_sweep(savefile, inpath, outpath, n_neighbors = [15], n_components=[2], learning_rate=[1], min_dist=[1], local_connectivity=[1],
densmap=[False],
min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom'], lsi_dimensions='all'):
"""Run hdbscan clustering once or more with different parameters.
Usage:
hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=[eom]> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
Keword arguments:
savefile: path to save the metadata and diagnostics
inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
outpath: path to output fit clusterings.
min_cluster_sizes: one or more integers indicating the minumum cluster size
min_samples: one ore more integers indicating the minimum number of samples used in the algorithm
cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
cluster_selection_methods: one or more of "eom" or "leaf" eom gives larger clusters.
lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
"""
umap_args = {'n_neighbors':list(map(int, n_neighbors)),
'learning_rate':list(map(float,learning_rate)),
'min_dist':list(map(float,min_dist)),
'local_connectivity':list(map(int,local_connectivity)),
'n_components':list(map(int, n_components)),
'densmap':list(map(bool,densmap))
}
hdbscan_args = {'min_cluster_size':list(map(int,min_cluster_sizes)),
'min_samples':list(map(int,min_samples)),
'cluster_selection_epsilon':list(map(float,cluster_selection_epsilons)),
'cluster_selection_method':cluster_selection_methods}
obj = umap_hdbscan_lsi_grid_sweep(inpath,
lsi_dimensions,
outpath,
umap_args,
hdbscan_args
)
obj.run(10)
obj.save(savefile)
if __name__ == "__main__":
fire.Fire(run_umap_hdbscan_lsi_grid_sweep)

4
clustering/validation.py Normal file
View File

@ -0,0 +1,4 @@
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from functools import partial
# silhouette is the only one that doesn't need the feature matrix, so it's probably the only one worth trying.
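A minimal sketch of the silhouette computation referenced above, run on a precomputed distance matrix so no feature matrix is required; the toy points and labels are made up:
import numpy as np
from sklearn import metrics
from sklearn.metrics import pairwise_distances

X = np.array([[0, 0], [0, 1], [5, 5], [5, 6]])   # toy points
labels = np.array([0, 0, 1, 1])
dist = pairwise_distances(X)                     # stand-in for a saved distance matrix
print(metrics.silhouette_score(dist, labels, metric='precomputed'))
print(metrics.silhouette_samples(dist, labels, metric='precomputed'))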

28
datasets/Makefile Normal file
View File

@ -0,0 +1,28 @@
all: ../../data/reddit_comments_by_subreddit.parquet ../../data/reddit_submissions_by_subreddit.parquet
../../data/reddit_comments_by_subreddit.parquet:../../data/temp/reddit_comments.parquet
../start_spark_and_run.sh 4 comments_2_parquet_part2.py
../../data/temp/reddit_comments.parquet: comments_task_list.sh run_comments_jobs.sbatch
mkdir -p comments_jobs
mkdir -p ../../data/temp/
sbatch --wait --array=1-$(shell cat comments_task_list.sh | wc -l) run_comments_jobs.sbatch 0
temp_reddit_comments.parquet: ../../data/temp/reddit_comments.parquet
comments_task_list.sh: comments_2_parquet_part1.py
srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 bash -c "source ~/.bashrc && python3 comments_2_parquet_part1.py gen_task_list --overwrite=False"
submissions_task_list.sh: submissions_2_parquet_part1.py
srun -p compute-bigmem -A comdata --nodes=1 --mem-per-cpu=9g -c 40 --time=120:00:00 python3 submissions_2_parquet_part1.py gen_task_list
../../data/reddit_submissions_by_subreddit.parquet:../../data/temp/reddit_submissions.parquet
../start_spark_and_run.sh 4 submissions_2_parquet_part2.py
../../data/temp/reddit_submissions.parquet: submissions_task_list.sh run_submissions_jobs.sbatch
mkdir -p submissions_jobs
rm -rf ../../data/temp/reddit_submissions.parquet
mkdir -p ../../data/temp/
sbatch --wait --array=1-$(shell cat submissions_task_list.sh | wc -l) run_submissions_jobs.sbatch 0
temp_reddit_submissions.parquet: ../../data/temp/reddit_submissions.parquet

View File

@ -47,11 +47,11 @@ def parse_comment(comment, names= None):
return tuple(row)
# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','/gscratch/comdata/spark_tmp')])
# conf = sc._conf.setAll([('spark.executor.memory', '20g'), ('spark.app.name', 'extract_reddit_timeline'), ('spark.executor.cores', '26'), ('spark.cores.max', '26'), ('spark.driver.memory','84g'),('spark.driver.maxResultSize','0'),('spark.local.dir','../../data/spark_tmp')])
def parse_dump(partition):
dumpdir = f"/gscratch/comdata/raw_data/reddit_dumps/comments/{partition}"
dumpdir = f"../../data/reddit_dumps/comments/{partition}"
stream = open_input_file(dumpdir)
rows = map(parse_comment, stream)
@ -76,11 +76,11 @@ def parse_dump(partition):
pa.field('error', pa.string(), nullable=True),
])
p = Path("/gscratch/comdata/output/temp/reddit_comments.parquet")
p = Path("../../data/temp/reddit_comments.parquet")
p.mkdir(exist_ok=True,parents=True)
N=10000
with pq.ParquetWriter(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet",
with pq.ParquetWriter(f"../../data/temp/reddit_comments.parquet/{partition}.parquet",
schema=schema,
compression='snappy',
flavor='spark') as writer:
@ -96,12 +96,12 @@ def parse_dump(partition):
writer.close()
def gen_task_list(dumpdir="/gscratch/comdata/raw_data/reddit_dumps/comments", overwrite=True):
def gen_task_list(dumpdir="../../data/raw_data/reddit_dumps/comments", overwrite=True):
files = list(find_dumps(dumpdir,base_pattern="RC_20*.*"))
with open("comments_task_list.sh",'w') as of:
for fpath in files:
partition = os.path.split(fpath)[1]
if (not Path(f"/gscratch/comdata/output/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True):
if (not Path(f"../../data/temp/reddit_comments.parquet/{partition}.parquet").exists()) or (overwrite is True):
of.write(f'python3 comments_2_parquet_part1.py parse_dump {partition}\n')

View File

@ -1,6 +0,0 @@
#!/usr/bin/bash
source ~/.bashrc
echo $(hostname)
start_spark_cluster.sh
spark-submit --verbose --master spark://$(hostname):43015 submissions_2_parquet_part2.py
stop-all.sh

View File

@ -0,0 +1,24 @@
#!/bin/bash
## tf reddit comments
#SBATCH --job-name="cdsc_reddit; parse comment dumps"
## Allocation Definition
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/datasets
#SBATCH --output=comments_jobs/%A_%a.out
#SBATCH --error=comments_jobs/%A_%a.out
. /opt/ohpc/admin/lmod/lmod/init/profile
source ~/.bashrc
TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
TASK_CALL=$(sed -n ${TASK_NUM}p ./comments_task_list.sh)
${TASK_CALL}

View File

@ -0,0 +1,23 @@
#!/bin/bash
## tf reddit comments
#SBATCH --job-name="cdsc_reddit; parse submission dumps"
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/cdsc_reddit/datasets
#SBATCH --output=submissions_jobs/%A_%a.out
#SBATCH --error=submissions_jobs/%A_%a.out
TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
TASK_CALL=$(sed -n ${TASK_NUM}p ./submissions_task_list.sh)
${TASK_CALL}

View File

@ -1,16 +1,7 @@
all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather
all: ../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather
/gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py ../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
start_spark_and_run.sh 1 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum
../start_spark_and_run.sh 1 overlap_density.py authors --inpath="../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum
/gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather:
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum
$(MAKE) -C ../similarities
/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum
/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum

View File

@ -1,4 +1,6 @@
#!/usr/bin/bash
source ~/.bashrc
echo $(hostname)
start_spark_cluster.sh
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
spark-submit --verbose --master spark://$(hostname):43015 overlap_density.py authors --inpath=../../data/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=../../data/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
stop-all.sh

View File

@ -0,0 +1,34 @@
from pathlib import Path
from itertools import chain, groupby
dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/comments")
zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")
all_files = sorted(list(chain(zst_files, bz2_files, xz_files)))
groups = groupby(all_files, key = lambda p: p.stem)
kept_paths = []
removed_paths = []
priority = ['.zst','.xz','.bz2']
for stem, files in groups:
keep_file = None
remove_files = []
for f in files:
if keep_file is None:
keep_file = f
elif priority.index(keep_file.suffix) > priority.index(f.suffix):
remove_files.append(keep_file)
keep_file = f
else:
remove_files.append(f)
kept_paths.append(keep_file)
removed_paths.extend(remove_files)
(dumpdir / "to_remove").mkdir()
for f in removed_paths:
f.rename(f.parent / "to_remove" / f.name)

View File

@ -0,0 +1,34 @@
from pathlib import Path
from itertools import chain, groupby
dumpdir = Path("/gscratch/comdata/raw_data/reddit_dumps/submissions")
zst_files = dumpdir.glob("*.zst")
bz2_files = dumpdir.glob("*.bz2")
xz_files = dumpdir.glob("*.xz")
all_files = sorted(list(chain(zst_files, bz2_files, xz_files)))
groups = groupby(all_files, key = lambda p: p.stem)
kept_paths = []
removed_paths = []
priority = ['.zst','.xz','.bz2']
for stem, files in groups:
keep_file = None
remove_files = []
for f in files:
if keep_file is None:
keep_file = f
elif priority.index(keep_file.suffix) > priority.index(f.suffix):
remove_files.append(keep_file)
keep_file = f
else:
remove_files.append(f)
kept_paths.append(keep_file)
removed_paths.extend(remove_files)
(dumpdir / "to_remove").mkdir()
for f in removed_paths:
f.rename(f.parent / "to_remove" / f.name)

View File

@ -1,17 +0,0 @@
import pyarrow.dataset as ds
# A pyarrow dataset abstracts reading, writing, or filtering a parquet file. It does not read data into memory.
#dataset = ds.dataset(pathlib.Path('/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet/'), format='parquet', partitioning='hive')
dataset = ds.dataset('/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/', format='parquet')
# let's get all the comments to two subreddits:
subreddits_to_pull = ['seattle','seattlewa']
# a table is a low-level structured data format. This line pulls data into memory. Setting metadata_n_threads > 1 gives a little speed boost.
table = dataset.to_table(filter = ds.field('subreddit').isin(subreddits_to_pull), columns=['id','subreddit','CreatedAt','author','ups','downs','score','subreddit_id','stickied','title','url','is_self','selftext'])
# Since data from just these 2 subreddits fits in memory we can just turn our table into a pandas dataframe.
df = table.to_pandas()
# We should save this smaller dataset so we don't have to wait 15 min to pull from parquet next time.
df.to_csv("mydataset.csv")

View File

@ -1,38 +0,0 @@
import pyarrow.dataset as ds
from itertools import groupby
# A pyarrow dataset abstracts reading, writing, or filtering a parquet file. It does not read data into memory.
dataset = ds.dataset('/gscratch/comdata/output/reddit_submissions_by_author.parquet', format='parquet')
# let's get all the comments to two subreddits:
subreddits_to_pull = ['seattlewa','seattle']
# instead of loading the data into a pandas dataframe all at once we can stream it.
scan_tasks = dataset.scan(filter = ds.field('subreddit').isin(subreddits_to_pull), columns=['id','subreddit','CreatedAt','author','ups','downs','score','subreddit_id','stickied','title','url','is_self','selftext'])
# simple function to execute scantasks and generate rows
def iterate_rows(scan_tasks):
for st in scan_tasks:
for rb in st.execute():
df = rb.to_pandas()
for t in df.itertuples():
yield t
row_iter = iterate_rows(scan_tasks)
# now we can use python's groupby function to read one author at a time
# note that the same author can appear more than once since the record batches may not be in the correct order.
author_submissions = groupby(row_iter, lambda row: row.author)
count_dict = {}
for auth, posts in author_submissions:
if auth in count_dict:
count_dict[auth] = count_dict[auth] + 1
else:
count_dict[auth] = 1
# since it's partitioned and sorted by author, we get one group for each author
any([ v != 1 for k,v in count_dict.items()])

25
ngrams/Makefile Normal file
View File

@ -0,0 +1,25 @@
outputdir=../../data/reddit_ngrams/
inputdir=../../data/reddit_comments_by_subreddit.parquet
authors_tfdir=${outputdir}/comment_authors.parquet
srun=sbatch --wait --verbose run_job.sbatch
all: ${outputdir}/comment_authors_sorted.parquet/_SUCCESS
tf_task_list_1: tf_comments.py
${srun} bash -c "python3 tf_comments.py gen_task_list --mwe_pass='first' --outputdir=${outputdir} --tf_task_list=$@ --inputdir=${inputdir}"
${outputdir}/comment_terms.parquet:tf_task_list_1
mkdir -p sbatch_log
sbatch --wait --verbose --array=1-$(shell cat $< | wc -l) run_array.sbatch 0 $<
${outputdir}/comment_authors.parquet:${outputdir}/comment_terms.parquet
-
${outputdir}/comment_authors_sorted.parquet:${outputdir}/comment_authors.parquet sort_tf_comments.py
../start_spark_and_run.sh 3 sort_tf_comments.py --inparquet=$< --outparquet=$@ --colname=author
${outputdir}/comment_authors_sorted.parquet/_SUCCESS:${outputdir}/comment_authors_sorted.parquet
${inputdir}:
$(MAKE) -C ../datasets

19
ngrams/run_array.sbatch Executable file
View File

@ -0,0 +1,19 @@
#!/bin/bash
#SBATCH --job-name=reddit_comment_term_frequencies
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --mem-per-cpu=9g
#SBATCH --ntasks=1
#SBATCH --export=ALL
#SBATCH --time=48:00:00
#SBATCH --chdir=/gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/ngrams
#SBATCH --error="sbatch_log/%A_%a.out"
#SBATCH --output="sbatch_log/%A_%a.out"
TASK_NUM=$(($SLURM_ARRAY_TASK_ID + $1))
TASK_CALL=$(sed -n ${TASK_NUM}p $2)
${TASK_CALL}

18
ngrams/run_job.sbatch Normal file
View File

@ -0,0 +1,18 @@
#!/bin/bash
#SBATCH --job-name="simulate measurement error models"
## Allocation Definition
#SBATCH --account=comdata
#SBATCH --partition=compute-bigmem
## Resources
#SBATCH --nodes=1
## Walltime (4 hours)
#SBATCH --time=4:00:00
## Memory per node
#SBATCH --mem=4G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=1
#SBATCH --chdir /gscratch/comdata/users/nathante/partitioning_reddit/dataverse/cdsc_reddit/ngrams/
#SBATCH --output=sbatch_log/%A_%a.out
#SBATCH --error=sbatch_log/%A_%a.err
echo "$@"
"$@"

View File

@ -3,6 +3,7 @@ import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
+import pyarrow.compute as pc
from itertools import groupby, islice, chain
import fire
from collections import Counter
@ -15,11 +16,12 @@ import string
from random import random
from redditcleaner import clean
from pathlib import Path
+from datetime import datetime
# compute term frequencies for comments in each subreddit by week
-def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', input_dir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", mwe_pass = 'first', excluded_users=None):
+def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/', inputdir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", mwe_pass = 'first', excluded_users=None):
-dataset = ds.dataset(Path(input_dir)/partition, format='parquet')
+dataset = ds.dataset(Path(inputdir)/partition, format='parquet')
outputdir = Path(outputdir)
samppath = outputdir / "reddit_comment_ngrams_10p_sample"
@ -38,6 +40,7 @@ def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/',
if ngram_path.exists():
ngram_path.unlink()
+dataset = dataset.filter(pc.field("CreatedAt") <= pa.scalar(datetime(2020,4,13)))
batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
@ -160,9 +163,9 @@ def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/',
outchunksize = 10000
-termtf_outputdir = (outputdir / "comment_terms")
+termtf_outputdir = (outputdir / "comment_terms.parquet")
termtf_outputdir.mkdir(parents=True, exist_ok=True)
-authortf_outputdir = (outputdir / "comment_authors")
+authortf_outputdir = (outputdir / "comment_authors.parquet")
authortf_outputdir.mkdir(parents=True, exist_ok=True)
termtf_path = termtf_outputdir / partition
authortf_path = authortf_outputdir / partition
@ -196,12 +199,12 @@ def weekly_tf(partition, outputdir = '/gscratch/comdata/output/reddit_ngrams/',
author_writer.close()
-def gen_task_list(mwe_pass='first', outputdir='/gscratch/comdata/output/reddit_ngrams/', tf_task_list='tf_task_list', excluded_users_file=None):
+def gen_task_list(mwe_pass='first', inputdir="/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/", outputdir='/gscratch/comdata/output/reddit_ngrams/', tf_task_list='tf_task_list', excluded_users_file=None):
-files = os.listdir("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet/")
+files = os.listdir(inputdir)
with open(tf_task_list,'w') as outfile:
for f in files:
if f.endswith(".parquet"):
-outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} --outputdir {outputdir} --excluded_users {excluded_users_file} {f}\n")
+outfile.write(f"./tf_comments.py weekly_tf --mwe-pass {mwe_pass} --inputdir {inputdir} --outputdir {outputdir} --excluded_users {excluded_users_file} {f}\n")
if __name__ == "__main__":
fire.Fire({"gen_task_list":gen_task_list,
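With the new --inputdir plumbing, each line that gen_task_list writes to tf_task_list is a self-contained weekly_tf invocation for one partition. A sketch of generating and inspecting the list, assuming it is run from the ngrams directory with the default paths above (the head call is only for illustration):
# illustrative: write tf_task_list with one weekly_tf command per partition, then look at the first entry
python3 tf_comments.py gen_task_list --mwe-pass first
head -n 1 tf_task_list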

View File

@ -1,69 +0,0 @@
#!/usr/bin/env python3
from pyspark.sql import functions as f
from pyspark.sql import Window
from pyspark.sql import SparkSession
import numpy as np
import fire
from pathlib import Path
def main(ngram_dir="/gscratch/comdata/output/reddit_ngrams"):
spark = SparkSession.builder.getOrCreate()
ngram_dir = Path(ngram_dir)
ngram_sample = ngram_dir / "reddit_comment_ngrams_10p_sample"
df = spark.read.text(str(ngram_sample))
df = df.withColumnRenamed("value","phrase")
# count phrase occurrences
phrases = df.groupby('phrase').count()
phrases = phrases.withColumnRenamed('count','phraseCount')
phrases = phrases.filter(phrases.phraseCount > 10)
# count overall
N = phrases.select(f.sum(phrases.phraseCount).alias("phraseCount")).collect()[0].phraseCount
print(f'analyzing PMI on a sample of {N} phrases')
logN = np.log(N)
phrases = phrases.withColumn("phraseLogProb", f.log(f.col("phraseCount")) - logN)
# count term occurrences
phrases = phrases.withColumn('terms',f.split(f.col('phrase'),' '))
terms = phrases.select(['phrase','phraseCount','phraseLogProb',f.explode(phrases.terms).alias('term')])
win = Window.partitionBy('term')
terms = terms.withColumn('termCount',f.sum('phraseCount').over(win))
terms = terms.withColumnRenamed('count','termCount')
terms = terms.withColumn('termLogProb',f.log(f.col('termCount')) - logN)
terms = terms.groupBy(terms.phrase, terms.phraseLogProb, terms.phraseCount).sum('termLogProb')
terms = terms.withColumnRenamed('sum(termLogProb)','termsLogProb')
terms = terms.withColumn("phrasePWMI", f.col('phraseLogProb') - f.col('termsLogProb'))
# join phrases to term counts
df = terms.select(['phrase','phraseCount','phraseLogProb','phrasePWMI'])
df = df.sort(['phrasePWMI'],descending=True)
df = df.sortWithinPartitions(['phrasePWMI'],descending=True)
pwmi_dir = ngram_dir / "reddit_comment_ngrams_pwmi.parquet/"
df.write.parquet(str(pwmi_dir), mode='overwrite', compression='snappy')
df = spark.read.parquet(str(pwmi_dir))
df.write.csv(str(ngram_dir / "reddit_comment_ngrams_pwmi.csv/"),mode='overwrite',compression='none')
df = spark.read.parquet(str(pwmi_dir))
df = df.select('phrase','phraseCount','phraseLogProb','phrasePWMI')
# choosing phrases occurring at least 3500 times in the 10% sample (35000 times) and then with a PWMI of at least 3 yields about 65000 expressions.
#
df = df.filter(f.col('phraseCount') > 3500).filter(f.col("phrasePWMI")>3)
df = df.toPandas()
df.to_feather(ngram_dir / "multiword_expressions.feather")
df.to_csv(ngram_dir / "multiword_expressions.csv")
if __name__ == '__main__':
fire.Fire(main)
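For reference, the quantity this script ranks phrases by is the pointwise mutual information of a phrase against its unigrams, estimated from counts in the 10% ngram sample; a sketch of the formula it implements, with $N$ the total phrase count used as the shared normalizer:
\[ \mathrm{PWMI}(w_1 \dots w_k) = \log\hat{P}(w_1 \dots w_k) - \sum_{i=1}^{k}\log\hat{P}(w_i), \qquad \hat{P}(x) = \frac{\mathrm{count}(x)}{N}, \]
which corresponds to phraseLogProb minus the summed termLogProb columns computed above.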

22
run_array.sbatch Normal file
View File

@ -0,0 +1,22 @@
#!/bin/bash
## tf reddit comments
#SBATCH --job-name="wikia ecology; fit var models"
## Allocation Definition
#SBATCH --account=comdata-ckpt
#SBATCH --partition=ckpt
## Resources
## Nodes. This should always be 1 for parallel-sql.
#SBATCH --nodes=1
## Walltime (12 hours)
#SBATCH --time=24:00:00
## Memory per node
#SBATCH --mem=8G
#SBATCH --cpus-per-task=1
#SBATCH --ntasks=1
#SBATCH
#SBATCH --chdir /gscratch/comdata/users/nathante/wikia_ecology
#SBATCH --output=var_jobs/%A_%a.out
#SBATCH --error=var_jobs/%A_%a.out
TASK_NUM=$(( SLURM_ARRAY_TASK_ID + $1))
TASK_CALL=$(sed -n ${TASK_NUM}p ./var_jobs.sh)
${TASK_CALL}

View File

@ -1,138 +1,28 @@
-#all: /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_130k.parquet
-# srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
-# srun_singularity_huge=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity_huge.sh
srun=srun -p compute-bigmem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40
-srun_huge=srun -p compute-hugemem -A comdata --mem-per-cpu=9g --time=200:00:00 -c 40
+srun_huge=srun -p compute-hugemem -A comdata --mem=724g --time=200:00:00 -c 40
-similarity_data=/gscratch/scrubbed/comdata/reddit_similarity
+similarity_data=../../data/reddit_similarity
tfidf_data=${similarity_data}/tfidf
-tfidf_weekly_data=${similarity_data}/tfidf_weekly
-similarity_weekly_data=${similarity_data}/weekly
-lsi_components=[10,50,100,200,300,400,500,600,700,850,1000,1500]
+lsi_components=[10,50,100,200,300,400,500,600,700,850]
-lsi_similarities: ${similarity_data}/subreddit_comment_terms_10k_LSI ${similarity_data}/subreddit_comment_authors-tf_10k_LSI ${similarity_data}/subreddit_comment_authors_10k_LSI ${similarity_data}/subreddit_comment_terms_30k_LSI ${similarity_data}/subreddit_comment_authors-tf_30k_LSI ${similarity_data}/subreddit_comment_authors_30k_LSI
+lsi_similarities: ${similarity_data}/subreddit_comment_authors-tf_10k_LSI
-all: ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather
+all: ${similarity_data}/subreddit_comment_authors-tf_10k.feather
-#all: ${tfidf_data}/comment_terms_100k.parquet ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_100k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather ${similarity_data}/subreddit_comment_terms_100k.feather ${similarity_data}/subreddit_comment_authors_100k.feather ${similarity_data}/subreddit_comment_authors-tf_100k.feather ${similarity_weekly_data}/comment_terms.parquet
-#${tfidf_weekly_data}/comment_terms_100k.parquet ${tfidf_weekly_data}/comment_authors_100k.parquet ${tfidf_weekly_data}/comment_terms_30k.parquet ${tfidf_weekly_data}/comment_authors_30k.parquet ${similarity_weekly_data}/comment_terms_100k.parquet ${similarity_weekly_data}/comment_authors_100k.parquet ${similarity_weekly_data}/comment_terms_30k.parquet ${similarity_weekly_data}/comment_authors_30k.parquet
-# /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_weekly_130k.parquet
-# all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet
-${similarity_weekly_data}/comment_terms.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_terms.parquet
-${srun} python3 weekly_cosine_similarities.py terms --topN=10000 --outfile=${similarity_weekly_data}/comment_terms.parquet
+${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
+${srun_huge} /bin/bash -c "source ~/.bashrc; python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$<"
+${similarity_data}/subreddits_by_num_comments_nonsfw.csv: ../../data/reddit_submissions_by_subreddit.parquet ../../data/reddit_comments_by_subreddit.parquet
+../start_spark_and_run.sh 3 top_subreddits_by_comments.py
+${tfidf_data}/comment_authors_100k.parquet: ../../data/reddit_ngrams/comment_authors_sorted.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
+../start_spark_and_run.sh 3 tfidf.py authors --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_authors_100k.parquet
+../../data/reddit_ngrams/comment_authors_sorted.parquet:
+$(MAKE) -C ../ngrams
+../../data/reddit_submissions_by_subreddit.parquet:
+$(MAKE) -C ../datasets
+../../data/reddit_comments_by_subreddit.parquet:
+$(MAKE) -C ../datasets
${similarity_data}/subreddit_comment_terms_10k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
${srun} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k.feather --topN=10000
${similarity_data}/subreddit_comment_terms_10k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
${srun_huge} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=200
${similarity_data}/subreddit_comment_terms_30k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
${srun_huge} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=200 --inpath=$<
${similarity_data}/subreddit_comment_terms_30k.feather: ${tfidf_data}/comment_terms_30k.parquet similarities_helper.py
${srun_huge} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k.feather --topN=30000 --inpath=$<
${similarity_data}/subreddit_comment_authors_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py
${srun_huge} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k.feather --topN=30000 --inpath=$<
${similarity_data}/subreddit_comment_authors_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py
${srun_huge} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k.feather --topN=10000 --inpath=$<
${similarity_data}/subreddit_comment_authors_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_huge} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$<
${similarity_data}/subreddit_comment_authors_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_huge} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=10 --inpath=$<
${similarity_data}/subreddit_comment_authors-tf_30k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k.feather --topN=30000 --inpath=$<
${similarity_data}/subreddit_comment_authors-tf_10k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k.feather --topN=10000
${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_huge} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=10 --inpath=$<
${similarity_data}/subreddit_comment_authors-tf_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun_huge} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=10 --inpath=$<
${similarity_data}/subreddit_comment_terms_100k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
${srun} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_100k.feather --topN=100000
${similarity_data}/subreddit_comment_authors_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_100k.feather --topN=100000
${similarity_data}/subreddit_comment_authors-tf_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
${srun} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_100k.feather --topN=100000
${similarity_data}/subreddits_by_num_comments_nonsfw.csv:
start_spark_and_run.sh 3 top_subreddits_by_comments.py
${tfidf_data}/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
# mkdir -p ${tfidf_data}/
start_spark_and_run.sh 3 tfidf.py terms --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_terms_100k.parquet
${tfidf_data}/comment_terms_30k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
# mkdir -p ${tfidf_data}/
start_spark_and_run.sh 3 tfidf.py terms --topN=30000 --inpath=$< --outpath=${tfidf_data}/comment_terms_30k.feather
${tfidf_data}/comment_terms_10k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
# mkdir -p ${tfidf_data}/
start_spark_and_run.sh 3 tfidf.py terms --topN=10000 --inpath=$< --outpath=${tfidf_data}/comment_terms_10k.feather
${tfidf_data}/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
# mkdir -p ${tfidf_data}/
start_spark_and_run.sh 3 tfidf.py authors --topN=100000 --inpath=$< --outpath=${tfidf_data}/comment_authors_100k.parquet
${tfidf_data}/comment_authors_10k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
# mkdir -p ${tfidf_data}/
start_spark_and_run.sh 3 tfidf.py authors --topN=10000 --inpath=$< --outpath=${tfidf_data}/comment_authors_10k.parquet
${tfidf_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
# mkdir -p ${tfidf_data}/
start_spark_and_run.sh 3 tfidf.py authors --topN=30000 --inpath=$< --outpath=${tfidf_data}/comment_authors_30k.parquet
${tfidf_data}/tfidf_weekly/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
start_spark_and_run.sh 3 tfidf.py terms_weekly --topN=100000 --outpath=${similarity_data}/tfidf_weekly/comment_authors_100k.parquet
${tfidf_data}/tfidf_weekly/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_ppnum_comments.csv
start_spark_and_run.sh 3 tfidf.py authors_weekly --topN=100000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_100k.parquet
${tfidf_weekly_data}/comment_terms_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
start_spark_and_run.sh 2 tfidf.py terms_weekly --topN=30000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet
${tfidf_weekly_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv
start_spark_and_run.sh 3 tfidf.py authors_weekly --topN=30000 --inpath=$< --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet
${similarity_weekly_data}/comment_terms_100k.parquet: weekly_cosine_similarities.py similarities_helper.py ${tfidf_weekly_data}/comment_terms_100k.parquet
${srun} python3 weekly_cosine_similarities.py terms --topN=100000 --outfile=${similarity_weekly_data}/comment_terms_100k.parquet
${similarity_weekly_data}/comment_authors_100k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_authors_100k.parquet
${srun} python3 weekly_cosine_similarities.py authors --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet
${similarity_weekly_data}/comment_terms_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_terms_30k.parquet
${srun} python3 weekly_cosine_similarities.py terms --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet
,${similarity_weekly_data}/comment_authors_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments_nonsfw.csv ${tfidf_weekly_data}/comment_authors_30k.parquet
${srun} python3 weekly_cosine_similarities.py authors --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet
# ${tfidf_weekly_data}/comment_authors_130k.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv
# start_spark_and_run.sh 1 tfidf.py authors_weekly --topN=130000
# /gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
# start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
# /gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet
# start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
# /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py ${tfidf_weekly_data}/comment_authors.parquet
# start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet
# /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
# start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
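A usage sketch for the trimmed-down Makefile above, assuming GNU Make is invoked from the similarities directory and the ../../data dependencies are reachable there:
# build the author-tf LSI similarities declared by the new lsi_similarities target
make lsi_similarities
# preview what the default goal would do without running anything
make -n all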

View File

@ -1,4 +1,6 @@
#!/usr/bin/bash
+source ~/.bashrc
+echo $(hostname)
start_spark_cluster.sh
-singularity exec /gscratch/comdata/users/nathante/containers/nathante.sif spark-submit --master spark://$(hostname):7077 tfidf.py authors --topN=100000 --inpath=/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet --outpath=/gscratch/scrubbed/comdata/reddit_similarity/tfidf/comment_authors_100k.parquet
+spark-submit --verbose --master spark://$(hostname):43015 tfidf.py authors --topN=100000 --inpath=../../data/reddit_ngrams/comment_authors_sorted.parquet --outpath=../../data/reddit_similarity/tfidf/comment_authors_100k.parquet
-singularity exec /gscratch/comdata/users/nathante/containers/nathante.sif stop-all.sh
+stop-all.sh

View File

@ -43,7 +43,7 @@ def reindex_tfidf(*args, **kwargs):
new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
new_ids = new_ids.set_index('subreddit_id')
subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
-subreddit_names = subreddit_names.drop("subreddit_id",1)
+subreddit_names = subreddit_names.drop("subreddit_id",axis=1)
subreddit_names = subreddit_names.sort_values("subreddit_id_new")
return(df, subreddit_names)
@ -51,8 +51,9 @@ def pull_tfidf(*args, **kwargs):
df, _, _ = _pull_or_reindex_tfidf(*args, **kwargs, reindex=False)
return df
-def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True):
+def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=None, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True):
-print(f"loading tfidf {infile}", flush=True)
+print(f"loading tfidf {infile}, week {week}, min_df {min_df}, max_df {max_df}", flush=True)
if week is not None:
tfidf_ds = ds.dataset(infile, partitioning='hive')
else:
@ -97,20 +98,21 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
'relative_tf':ds.field('relative_tf').cast('float32'),
'tf_idf':ds.field('tf_idf').cast('float32')}
-print(projection)
+print(projection, flush=True)
+print(ds_filter, flush=True)
df = tfidf_ds.to_table(filter=ds_filter,columns=projection)
df = df.to_pandas(split_blocks=True,self_destruct=True)
print("assigning indexes",flush=True)
if reindex:
+print("assigning indexes",flush=True)
-df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup()
+df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup() + 1
else:
df['subreddit_id_new'] = df['subreddit_id']
if reindex:
grouped = df.groupby(term_id)
-df[term_id_new] = grouped.ngroup()
+df[term_id_new] = grouped.ngroup() + 1
else:
df[term_id_new] = df[term_id]
@ -126,17 +128,6 @@ def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, inclu
return (df, tfidf_ds, ds_filter)
-with Pool(cpu_count()) as pool:
-chunks = pool.imap_unordered(pull_names,batches)
-subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
-subreddit_names = subreddit_names.set_index("subreddit_id")
-new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
-new_ids = new_ids.set_index('subreddit_id')
-subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
-subreddit_names = subreddit_names.drop("subreddit_id",1)
-subreddit_names = subreddit_names.sort_values("subreddit_id_new")
-return(df, subreddit_names)
def pull_names(batch):
return(batch.to_pandas().drop_duplicates())
@ -170,7 +161,7 @@ def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=Non
term_id_new = term + '_id_new'
entries, subreddit_names = reindex_tfidf(inpath, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date)
-mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new], entries.subreddit_id_new)))
+mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)))
print("loading matrix")
@ -256,22 +247,20 @@ def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=196
else:
print("running LSI",flush=True)
svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter)
mod = svd.fit(tfidfmat.T)
-lsimat = mod.transform(tfidfmat.T)
if lsi_model_save is not None:
Path(lsi_model_save).parent.mkdir(exist_ok=True, parents=True)
pickle.dump(mod, open(lsi_model_save,'wb'))
-sims_list = []
+print(n_components, flush=True)
+lsimat = mod.transform(tfidfmat.T)
for n_dims in n_components:
+print("computing similarities", flush=True)
sims = column_similarities(lsimat[:,np.arange(n_dims)])
-if len(n_components) > 1:
yield (sims, n_dims)
-else:
-return sims
def column_similarities(mat):
@ -327,11 +316,11 @@ def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weig
else: # tf_fam = tf_weight.Norm05
df = df.withColumn("tf_idf", (0.5 + 0.5 * df.relative_tf) * df.idf)
-df = df.repartition(400,'subreddit','week')
+df = df.repartition('week')
dfwriter = df.write.partitionBy("week")
return dfwriter
-def _calc_tfidf(df, term_colname, tf_family):
+def _calc_tfidf(df, term_colname, tf_family, min_df=None, max_df=None):
term = term_colname
term_id = term + '_id'
@ -349,7 +338,13 @@ def _calc_tfidf(df, term_colname, tf_family):
idf = idf.withColumn('idf',f.log(N_docs/(1+f.col('count')))+1)
# collect the dictionary to make a pydict of terms to indexes
-terms = idf.select(term).distinct() # terms are distinct
+terms = idf
+if min_df is not None:
+terms = terms.filter(f.col('count')>=min_df)
+if max_df is not None:
+terms = terms.filter(f.col('count')<=max_df)
+terms = terms.select(term).distinct() # terms are distinct
terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct
# make subreddit ids
@ -359,12 +354,12 @@ def _calc_tfidf(df, term_colname, tf_family):
df = df.join(subreddits,on='subreddit')
# map terms to indexes in the tfs and the idfs
-df = df.join(terms,on=term) # subreddit-term-id is unique
+df = df.join(terms,on=term,how='inner') # subreddit-term-id is unique
-idf = idf.join(terms,on=term)
+idf = idf.join(terms,on=term,how='inner')
# join on subreddit/term to create tf/dfs indexed by term
-df = df.join(idf, on=[term_id, term])
+df = df.join(idf, on=[term_id, term],how='inner')
# agg terms by subreddit to make sparse tf/df vectors
if tf_family == tf_weight.MaxTF:
@ -375,19 +370,19 @@ def _calc_tfidf(df, term_colname, tf_family):
return df
-def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
+def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05, min_df=None, max_df=None):
term = term_colname
term_id = term + '_id'
# aggregate counts by week. now subreddit-term is distinct
df = df.filter(df.subreddit.isin(include_subs))
df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf'))
-df = _calc_tfidf(df, term_colname, tf_family)
+df = _calc_tfidf(df, term_colname, tf_family, min_df, max_df)
df = df.repartition('subreddit')
dfwriter = df.write
return dfwriter
-def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
+def select_topN_subreddits(topN, path="../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
rankdf = pd.read_csv(path)
included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
return included_subreddits
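For reference, the weighting these helpers keep using, visible in the withColumn calls above, is the Norm05 scheme; for term $t$ in subreddit $s$, with $\mathrm{df}_t$ the document count carried in the idf frame and $N_{\mathrm{docs}}$ the number of documents,
\[ \mathrm{idf}_t = \log\frac{N_{\mathrm{docs}}}{1+\mathrm{df}_t} + 1, \qquad \mathrm{tfidf}_{t,s} = \bigl(0.5 + 0.5\,\mathrm{relative\_tf}_{t,s}\bigr)\cdot \mathrm{idf}_t . \]
The 1-based term and subreddit ids introduced in this change are then shifted back by one when the sparse matrix is built, which is why the csr_matrix call subtracts 1 from both index columns.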

View File

@ -1,16 +1,20 @@
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
+from datetime import datetime
+from pathlib import Path
spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext.getConf()
-submissions = spark.read.parquet("/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet")
+submissions = spark.read.parquet("../../data/reddit_submissions_by_subreddit.parquet")
+submissions = submissions.filter(f.col("CreatedAt") <= datetime(2020,4,13))
prop_nsfw = submissions.select(['subreddit','over_18']).groupby('subreddit').agg(f.mean(f.col('over_18').astype('double')).alias('prop_nsfw'))
-df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
+df = spark.read.parquet("../../data/reddit_comments_by_subreddit.parquet")
+df = df.filter(f.col("CreatedAt") <= datetime(2020,4,13))
# remove /u/ pages
df = df.filter(~df.subreddit.like("u_%"))
@ -26,4 +30,6 @@ df = df.toPandas()
df = df.sort_values("n_comments")
-df.to_csv('/gscratch/scrubbed/comdata/reddit_similarity/subreddits_by_num_comments_nonsfw.csv', index=False)
+outpath = Path("../../data/reddit_similarity/subreddits_by_num_comments_nonsfw.csv")
+outpath.parent.mkdir(exist_ok=True, parents=True)
+df.to_csv(str(outpath), index=False)
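As wired up in the similarities Makefile, this script is meant to be launched through the Spark helper rather than run directly; a sketch, assuming the similarities directory is the working directory:
# run on a 3-node Spark cluster via the launcher, as the Makefile does
../start_spark_and_run.sh 3 top_subreddits_by_comments.py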

View File

@ -1,18 +0,0 @@
from similarities_helper import similarities
import numpy as np
import fire
def wang_similarity(mat):
non_zeros = (mat != 0).astype(np.float32)
intersection = non_zeros.T @ non_zeros
return intersection
infile="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet"; outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather"; min_df=1; included_subreddits=None; topN=10000; exclude_phrases=False; from_date=None; to_date=None
def wang_overlaps(infile, outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather", min_df=1, max_df=None, included_subreddits=None, topN=10000, exclude_phrases=False, from_date=None, to_date=None):
return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date)
if __name__ == "__main__":
fire.Fire(wang_overlaps)
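For reference, with $B$ the author-by-subreddit matrix that similarities() builds, the kernel above returns raw co-commenter counts rather than a normalized similarity:
\[ \mathrm{wang}(i,j) = \sum_{a} \mathbf{1}[B_{a i}\neq 0]\;\mathbf{1}[B_{a j}\neq 0], \]
i.e. the number of authors with a nonzero weight in both subreddit $i$ and subreddit $j$.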

View File

@ -1,149 +0,0 @@
#!/usr/bin/env python3
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from pyspark.sql import Window
import numpy as np
import pyarrow
import pyarrow.dataset as ds
import pandas as pd
import fire
from itertools import islice, chain
from pathlib import Path
from similarities_helper import pull_tfidf, column_similarities, write_weekly_similarities, lsi_column_similarities
from scipy.sparse import csr_matrix
from multiprocessing import Pool, cpu_count
from functools import partial
import pickle
# tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity_weekly/comment_authors_tfidf.parquet"
# #tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data//comment_authors_compex.parquet"
# min_df=2
# included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt"
# max_df = None
# topN=100
# term_colname='author'
# # outfile = '/gscratch/comdata/output/reddit_similarity/weekly/comment_authors_test.parquet'
# # included_subreddits=None
outfile="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity_weekly/comment_authors.parquet"; infile="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_authors_tfidf.parquet"; included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt"; lsi_model="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI/2000_authors_LSIMOD.pkl"; n_components=1500; algorithm="randomized"; term_colname='author'; tfidf_path=infile; random_state=1968;
# static_tfidf = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet"
# dftest = spark.read.parquet(static_tfidf)
def _week_similarities(week, simfunc, tfidf_path, term_colname, included_subreddits, outdir:Path, subreddit_names, nterms, topN=None, min_df=None, max_df=None):
term = term_colname
term_id = term + '_id'
term_id_new = term + '_id_new'
print(f"loading matrix: {week}")
entries = pull_tfidf(infile = tfidf_path,
term_colname=term_colname,
included_subreddits=included_subreddits,
topN=topN,
week=week.isoformat(),
rescale_idf=False)
tfidf_colname='tf_idf'
# if the max subreddit id we found is less than the number of subreddit names then we have to fill in 0s
mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)),shape=(nterms,subreddit_names.shape[0]))
print('computing similarities')
print(simfunc)
sims = simfunc(mat)
del mat
sims = next(sims)[0]
sims = pd.DataFrame(sims)
sims = sims.rename({i: sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
sims['_subreddit'] = subreddit_names.subreddit.values
outfile = str(Path(outdir) / str(week))
write_weekly_similarities(outfile, sims, week, subreddit_names)
def pull_weeks(batch):
return set(batch.to_pandas()['week'])
# This requires a prefit LSI model, since we shouldn't fit different LSI models for every week.
def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kwargs):
print(args)
print(kwargs)
term_colname= kwargs.get('term_colname')
# lsi_model = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI/1000_author_LSIMOD.pkl"
lsi_model = pickle.load(open(lsi_model,'rb'))
#simfunc = partial(lsi_column_similarities,n_components=n_components,random_state=random_state,algorithm='randomized',lsi_model=lsi_model)
simfunc = partial(lsi_column_similarities,n_components=n_components,random_state=kwargs.get('random_state'),lsi_model=lsi_model)
return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs)
#tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet')
def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=None,max_df=None):
print(outfile)
# do this step in parallel if we have the memory for it.
# should be doable with pool.map
spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet(tfidf_path)
# load subreddits + topN
subreddit_names = df.select(['subreddit','subreddit_id']).distinct().toPandas()
subreddit_names = subreddit_names.sort_values("subreddit_id")
nterms = df.select(f.max(f.col(term_colname + "_id")).alias('max')).collect()[0].max
weeks = df.select(f.col("week")).distinct().toPandas().week.values
spark.stop()
print(f"computing weekly similarities")
week_similarities_helper = partial(_week_similarities,simfunc=simfunc, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=None, subreddit_names=subreddit_names,nterms=nterms)
for week in weeks:
week_similarities_helper(week)
# pool = Pool(cpu_count())
# list(pool.imap(week_similarities_helper, weeks))
# pool.close()
# with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine?
def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500):
return cosine_similarities_weekly(infile,
outfile,
'author',
max_df,
included_subreddits,
topN,
min_df=2
)
def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None):
return cosine_similarities_weekly(infile,
outfile,
'term',
min_df,
max_df,
included_subreddits,
topN)
def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None):
return cosine_similarities_weekly_lsi(infile,
outfile,
'author',
included_subreddits=included_subreddits,
n_components=n_components,
lsi_model=lsi_model
)
def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None):
return cosine_similarities_weekly_lsi(infile,
outfile,
'term',
included_subreddits=included_subreddits,
n_components=n_components,
lsi_model=lsi_model,
)
if __name__ == "__main__":
fire.Fire({'authors':author_cosine_similarities_weekly,
'terms':term_cosine_similarities_weekly,
'authors-lsi':author_cosine_similarities_weekly_lsi,
'terms-lsi':term_cosine_similarities_weekly_lsi
})

21
start_spark_and_run.sh Executable file
View File

@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Script to start a spark cluster and run a script on klone
source $SPARK_CONF_DIR/spark-env.sh
echo "#!/usr/bin/bash" > job_script.sh
echo "source ~/.bashrc" >> job_script.sh
echo "export PYSPARK_PYTHON=python3" >> job_script.sh
echo "export JAVA_HOME=/gscratch/comdata/local/open-jdk" >> job_script.sh
echo "export SPARK_CONF_DIR=/gscratch/comdata/local/spark_config" >> job_script.sh
echo "echo \$(hostname)" >> job_script.sh
echo "source $SPARK_CONF_DIR/spark-env.sh" >> job_script.sh
echo "start_spark_cluster.sh" >> job_script.sh
echo "spark-submit --verbose --master spark://\$(hostname):$SPARK_MASTER_PORT $2 ${@:3}" >> job_script.sh
echo "stop-all.sh" >> job_script.sh
#echo "singularity instance stop --all" >> job_script.sh
chmod +x job_script.sh
let "cpus = $1 * 40"
salloc -p compute-bigmem -A comdata --nodes=$1 --time=48:00:00 -c 40 --mem=362G --exclusive srun -n1 job_script.sh
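A usage sketch matching how the Makefiles invoke this launcher (the first argument is the node count; everything after the script name is passed through to spark-submit):
./start_spark_and_run.sh 3 tfidf.py authors --topN=100000 --inpath=../../data/reddit_ngrams/comment_authors_sorted.parquet --outpath=../../data/reddit_similarity/tfidf/comment_authors_100k.parquet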

26
start_spark_cluster.sh Executable file
View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
nodes="$(scontrol show hostnames)"
export SPARK_MASTER_HOST=$(hostname)
echo $SPARK_MASTER_HOST
# singularity instance stop spark-boss
# rm -r $HOME/.singularity/instances/sing/$(hostname)/nathante/spark-boss
# for node in $nodes
# do
# echo $node
# ssh $node "singularity instance stop --all -F"
# done
# singularity instance start /gscratch/comdata/users/nathante/cdsc_base.sif spark-boss
#apptainer exec /gscratch/comdata/users/nathante/containers/nathante.sif
start-master.sh
for node in $nodes
do
# if [ "$node" != "$SPARK_BOSS" ]
# then
echo $node
ssh -t $node start_spark_worker.sh $SPARK_MASTER_HOST
# fi
done

18
start_spark_worker.sh Executable file
View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
# runs on worker node
# instance_name=spark-worker-$(hostname)
# echo $hostname
# instance_url="instance://$instance_name"
# singularity instance list
# singularity instance stop -F "$instance_name"
# singularity instance list
# sleep 5
# ls $HOME/.singularity/instances/sing/$(hostname)/nathante/$instance_name
# rm -r $HOME/.singularity/instances/sing/$(hostname)/nathante/$instance_name
# singularity instance start /gscratch/comdata/users/nathante/cdsc_base.sif $instance_name
source /gscratch/comdata/env/cdsc_klone_bashrc
source $SPARK_CONF_DIR/spark-env.sh
echo $(which python3)
echo $PYSPARK_PYTHON
echo "start-worker.sh spark://$1:$SPARK_MASTER_PORT"
start-worker.sh spark://$1:$SPARK_MASTER_PORT

View File

@ -1,2 +0,0 @@
from .choose_clusters import load_clusters, load_densities
from .cluster_timeseries import build_cluster_timeseries

View File

@ -1,96 +0,0 @@
from pyarrow import dataset as ds
import numpy as np
import pandas as pd
import plotnine as pn
random = np.random.RandomState(1968)
def load_densities(term_density_file="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
author_density_file="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather"):
term_density = pd.read_feather(term_density_file)
author_density = pd.read_feather(author_density_file)
term_density.rename({'overlap_density':'term_density','index':'subreddit'},axis='columns',inplace=True)
author_density.rename({'overlap_density':'author_density','index':'subreddit'},axis='columns',inplace=True)
density = term_density.merge(author_density,on='subreddit',how='inner')
return density
def load_clusters(term_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
author_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather"):
term_clusters = pd.read_feather(term_clusters_file)
author_clusters = pd.read_feather(author_clusters_file)
# rename, join and return
term_clusters.rename({'cluster':'term_cluster'},axis='columns',inplace=True)
author_clusters.rename({'cluster':'author_cluster'},axis='columns',inplace=True)
clusters = term_clusters.merge(author_clusters,on='subreddit',how='inner')
return clusters
if __name__ == '__main__':
df = load_densities()
cl = load_clusters()
df['td_rank'] = df.term_density.rank()
df['ad_rank'] = df.author_density.rank()
df['td_percentile'] = df.td_rank / df.shape[0]
df['ad_percentile'] = df.ad_rank / df.shape[0]
df = df.merge(cl, on='subreddit',how='inner')
term_cluster_density = df.groupby('term_cluster').agg({'td_rank':['mean','min','max'],
'ad_rank':['mean','min','max'],
'td_percentile':['mean','min','max'],
'ad_percentile':['mean','min','max'],
'subreddit':['count']})
author_cluster_density = df.groupby('author_cluster').agg({'td_rank':['mean','min','max'],
'ad_rank':['mean','min','max'],
'td_percentile':['mean','min','max'],
'ad_percentile':['mean','min','max'],
'subreddit':['count']})
# which clusters have the most term_density?
term_cluster_density.iloc[term_cluster_density.td_rank['mean'].sort_values().index]
# which clusters have the most author_density?
term_cluster_density.iloc[term_cluster_density.ad_rank['mean'].sort_values(ascending=False).index].loc[term_cluster_density.subreddit['count'] >= 5][0:20]
high_density_term_clusters = term_cluster_density.loc[(term_cluster_density.td_percentile['mean'] > 0.75) & (term_cluster_density.subreddit['count'] > 5)]
# let's just use term density instead of author density for now. We can do a second batch with author density next.
chosen_clusters = high_density_term_clusters.sample(3,random_state=random)
cluster_info = df.loc[df.term_cluster.isin(chosen_clusters.index.values)]
chosen_subreddits = cluster_info.subreddit.values
dataset = ds.dataset("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",format='parquet')
comments = dataset.to_table(filter=ds.field("subreddit").isin(chosen_subreddits),columns=['id','subreddit','author','CreatedAt'])
comments = comments.to_pandas()
comments['week'] = comments.CreatedAt.dt.date - pd.to_timedelta(comments['CreatedAt'].dt.dayofweek, unit='d')
author_timeseries = comments.loc[:,['subreddit','author','week']].drop_duplicates().groupby(['subreddit','week']).count().reset_index()
for clid in chosen_clusters.index.values:
ts = pd.read_feather(f"data/ts_term_cluster_{clid}.feather")
pn.options.figure_size = (11.7,8.27)
p = pn.ggplot(ts)
p = p + pn.geom_line(pn.aes('week','value',group='subreddit'))
p = p + pn.facet_wrap('~ subreddit')
p.save(f"plots/ts_term_cluster_{clid}.png")
fig, ax = pyplot.subplots(figsize=(11.7,8.27))
g = sns.FacetGrid(ts,row='subreddit')
g.map_dataframe(sns.scatterplot,'week','value',data=ts,ax=ax)

View File

@ -1,37 +0,0 @@
import pandas as pd
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql import SparkSession
from .choose_clusters import load_clusters, load_densities
import fire
from pathlib import Path
def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
author_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather",
term_densities_path="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
author_densities_path="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather",
output="data/subreddit_timeseries.parquet"):
spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
df = df.withColumn('week', f.date_trunc('week', f.col("CreatedAt")))
# time of unique authors by series by week
ts = df.select(['subreddit','week','author']).distinct().groupby(['subreddit','week']).count()
ts = ts.repartition('subreddit')
if term_densities_path is not None and author_densities_path is not None:
densities = load_densities(term_densities_path, author_densities_path)
spk_densities = spark.createDataFrame(densities)
ts = ts.join(spk_densities, on='subreddit', how='inner')
clusters = load_clusters(term_clusters_path, author_clusters_path)
spk_clusters = spark.createDataFrame(clusters)
ts = ts.join(spk_clusters, on='subreddit', how='inner')
ts.write.parquet(output, mode='overwrite')
if __name__ == "__main__":
fire.Fire(build_cluster_timeseries)

View File

@ -1 +0,0 @@
/annex/objects/SHA256E-s60874--d536adb0ec637fca262c4e1ec908dd8b4a5d1464047b583cd1a99cc6dba87191

View File

@ -1,11 +0,0 @@
all: subreddit_author_tf_similarities_10000.html #comment_authors_10000.html
# wang_tsne_10000.html
# wang_tsne_10000.html:/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather tsne_vis.py
# python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather --output=wang_tsne_10000.html
# comment_authors_10000.html:/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather tsne_vis.py
# python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather --output=comment_authors_10000.html
subreddit_author_tf_similarities_10000.html:/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather tsne_vis.py
start_spark_and_run.sh 1 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather --output=subreddit_author_tf_similarities_10000.html

View File

@ -1 +0,0 @@
../../.git/annex/objects/Qk/wG/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784/SHA256E-s145210--14a2ad6660d1e4015437eff556ec349dd10a115a4f96594152a29e83d00aa784

View File

@ -1 +0,0 @@
../../.git/annex/objects/w7/2f/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e/SHA256E-s44458--f1c5247775ecf06514a0ff9e523e944bc8fcd9d0fdb6f214cc1329b759d4354e

View File

@ -1 +0,0 @@
../../.git/annex/objects/WX/v3/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543/SHA256E-s190874--c2aea719f989dde297ca5f13371e156693c574e44acd9a0e313e5e3a3ad4b543

View File

@ -1 +0,0 @@
../../.git/annex/objects/mq/2z/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf/SHA256E-s58834--2e7b3ee11f47011fd9b34bddf8f1e788d35ab9c9e0bb6a1301b0b916135400cf

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,187 +0,0 @@
import pyarrow
import altair as alt
alt.data_transformers.disable_max_rows()
alt.data_transformers.enable('default')
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from numpy import random
import fire
import numpy as np
def base_plot(plot_data):
# base = base.encode(alt.Color(field='color',type='nominal',scale=alt.Scale(scheme='category10')))
cluster_dropdown = alt.binding_select(options=[str(c) for c in sorted(set(plot_data.cluster))])
# subreddit_dropdown = alt.binding_select(options=sorted(plot_data.subreddit))
cluster_click_select = alt.selection_single(on='click',fields=['cluster'], bind=cluster_dropdown, name=' ')
# cluster_select = alt.selection_single(fields=['cluster'], bind=cluster_dropdown, name='cluster')
# cluster_select_and = cluster_click_select & cluster_select
#
# subreddit_select = alt.selection_single(on='click',fields=['subreddit'],bind=subreddit_dropdown,name='subreddit_click')
base_scale = alt.Scale(scheme={"name":'category10',
"extent":[0,100],
"count":10})
color = alt.condition(cluster_click_select ,
alt.Color(field='color',type='nominal',scale=base_scale),
alt.value("lightgray"))
base = alt.Chart(plot_data).mark_text().encode(
alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=(-65,65))),
alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=(-65,65))),
color=color,
text='subreddit')
base = base.add_selection(cluster_click_select)
return base
def zoom_plot(plot_data):
chart = base_plot(plot_data)
chart = chart.interactive()
chart = chart.properties(width=1275,height=800)
return chart
def viewport_plot(plot_data):
selector1 = alt.selection_interval(encodings=['x','y'],init={'x':(-65,65),'y':(-65,65)})
selectorx2 = alt.selection_interval(encodings=['x'],init={'x':(30,40)})
selectory2 = alt.selection_interval(encodings=['y'],init={'y':(-20,0)})
base = base_plot(plot_data)
viewport = base.mark_point(fillOpacity=0.2,opacity=0.2).encode(
alt.X('x',axis=alt.Axis(grid=False)),
alt.Y('y',axis=alt.Axis(grid=False)),
)
viewport = viewport.properties(width=600,height=400)
viewport1 = viewport.add_selection(selector1)
viewport2 = viewport.encode(
alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selector1)),
alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selector1))
)
viewport2 = viewport2.add_selection(selectorx2)
viewport2 = viewport2.add_selection(selectory2)
sr = base.encode(alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selectorx2)),
alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selectory2))
)
sr = sr.properties(width=1275,height=600)
chart = (viewport1 | viewport2) & sr
return chart
def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4):
isolate_color = 101
cluster_sizes = clusters.groupby('cluster').count()
singletons = set(cluster_sizes.loc[cluster_sizes.subreddit == 1].reset_index().cluster)
tsne_data = tsne_data.merge(clusters,on='subreddit')
centroids = tsne_data.groupby('cluster').agg({'x':np.mean,'y':np.mean})
color_ids = np.arange(n_colors)
distances = np.empty(shape=(centroids.shape[0],centroids.shape[0]))
groups = tsne_data.groupby('cluster')
points = np.array(tsne_data.loc[:,['x','y']])
centers = np.array(centroids.loc[:,['x','y']])
# point x centroid
point_center_distances = np.linalg.norm((points[:,None,:] - centers[None,:,:]),axis=-1)
# distances is cluster x point
for gid, group in groups:
c_dists = point_center_distances[group.index.values,:].min(axis=0)
distances[group.cluster.values[0],] = c_dists
# nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(centroids)
# distances, indices = nbrs.kneighbors()
nearest = distances.argpartition(n_neighbors,0)
indices = nearest[:n_neighbors,:].T
# neighbor_distances = np.copy(distances)
# neighbor_distances.sort(0)
# neighbor_distances = neighbor_distances[0:n_neighbors,:]
# nbrs = NearestNeighbors(n_neighbors=n_neighbors,metric='precomputed').fit(distances)
# distances, indices = nbrs.kneighbors()
color_assignments = np.repeat(-1,len(centroids))
for i in range(len(centroids)):
if (centroids.iloc[i].name == -1) or (i in singletons):
color_assignments[i] = isolate_color
else:
knn = indices[i]
knn_colors = color_assignments[knn]
available_colors = color_ids[list(set(color_ids) - set(knn_colors))]
if(len(available_colors) > 0):
color_assignments[i] = available_colors[0]
else:
raise Exception("Can't color this many neighbors with this many colors")
centroids = centroids.reset_index()
colors = centroids.loc[:,['cluster']]
colors['color'] = color_assignments
tsne_data = tsne_data.merge(colors,on='cluster')
return(tsne_data)
def build_visualization(tsne_data, clusters, output):
# tsne_data = "/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather"
# clusters = "/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather"
tsne_data = pd.read_feather(tsne_data)
tsne_data = tsne_data.rename(columns={'_subreddit':'subreddit'})
clusters = pd.read_feather(clusters)
tsne_data = assign_cluster_colors(tsne_data,clusters,10,8)
sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index()
sr_per_cluster = sr_per_cluster.rename(columns={'subreddit':'cluster_size'})
tsne_data = tsne_data.merge(sr_per_cluster,on='cluster')
term_zoom_plot = zoom_plot(tsne_data)
term_zoom_plot.save(output)
term_viewport_plot = viewport_plot(tsne_data)
term_viewport_plot.save(output.replace(".html","_viewport.html"))
if __name__ == "__main__":
fire.Fire(build_visualization)
# commenter_data = pd.read_feather("tsne_author_fit.feather")
# clusters = pd.read_feather('author_3000_clusters.feather')
# commenter_data = assign_cluster_colors(commenter_data,clusters,10,8)
# commenter_zoom_plot = zoom_plot(commenter_data)
# commenter_viewport_plot = viewport_plot(commenter_data)
# commenter_zoom_plot.save("subreddit_commenters_tsne_3000.html")
# commenter_viewport_plot.save("subreddit_commenters_tsne_3000_viewport.html")
# chart = chart.properties(width=10000,height=10000)
# chart.save("test_tsne_whole.svg")