refactor similarities to use submodule.

update pushshift dumps.
lsi support for weekly similarities
2022-01-19 15:05:49 -08:00 · 2021-12-10 21:23:32 -08:00 · 2021-08-11 22:48:33 -07:00 · 2021-08-03 15:13:39 -07:00 · 2021-08-03 15:13:21 -07:00 · 2021-08-03 15:06:48 -07:00
75 changed files with 3007 additions and 549 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
 [submodule "cdsc_ecology_utils"]
 	path = cdsc_ecology_utils
 	url = code:cdsc_ecology_utils
--- a/init.py
+++ b/init.py
@@ -0,0 +1,2 @@
 from timeseries import load_clusters, load_densities, build_cluster_timeseries
 from cdsc_ecology_utils import similarity_functions
--- a/author_cosine_similarity.py
+++ b/author_cosine_similarity.py
@@ -1,79 +0,0 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 import numpy as np
 import pyarrow
 import pandas as pd
 import fire
 from itertools import islice
 from pathlib import Path
 from similarities_helper import cosine_similarities
 spark = SparkSession.builder.getOrCreate()
 conf = spark.sparkContext.getConf()
 # outfile = '/gscratch/comdata/users/nathante/test_similarities_500.feather'; min_df = None; included_subreddits=None; similarity_threshold=0;
 def author_cosine_similarities(outfile, min_df = None, included_subreddits=None, similarity_threshold=0, topN=500):
    '''
    Compute similarities between subreddits based on tfi-idf vectors of author comments
    included_subreddits : string
        Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits
    similarity_threshold : double (default = 0)
        set > 0 for large numbers of subreddits to get an approximate solution using the DIMSUM algorithm
 https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get an exact solution using an O(N^2) algorithm.
    min_df : int (default = 0.1 * (number of included_subreddits)
         exclude terms that appear in fewer than this number of documents.
    outfile: string
         where to output csv and feather outputs
 '''
    print(outfile)
    tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet')
    if included_subreddits is None:
        included_subreddits = list(islice(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"),topN))
        included_subreddits = {s.strip('\n') for s in included_subreddits}
    else:
        included_subreddits = set(open(included_subreddits))
    sim_dist, tfidf = cosine_similarities(tfidf, 'author', min_df, included_subreddits, similarity_threshold)
    p = Path(outfile)
    output_feather =  Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv =  Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet =  Path(str(p).replace("".join(p.suffixes), ".parquet"))
    sim_dist = sim_dist.entries.toDF()
    sim_dist = sim_dist.repartition(1)
    sim_dist.write.parquet(str(output_parquet),mode='overwrite',compression='snappy')
    #instead of toLocalMatrix() why not read as entries and put strait into numpy
    sim_entries = pd.read_parquet(output_parquet)
    df = tfidf.select('subreddit','subreddit_id_new').distinct().toPandas()
    spark.stop()
    df['subreddit_id_new'] = df['subreddit_id_new'] - 1
    df = df.sort_values('subreddit_id_new').reset_index(drop=True)
    df = df.set_index('subreddit_id_new')
    similarities = sim_entries.join(df, on='i')
    similarities = similarities.rename(columns={'subreddit':"subreddit_i"})
    similarities = similarities.join(df, on='j')
    similarities = similarities.rename(columns={'subreddit':"subreddit_j"})
    similarities.to_feather(output_feather)
    similarities.to_csv(output_csv)
    return similarities
 if __name__ == '__main__':
    fire.Fire(author_cosine_similarities)
--- a/bots/good_bad_bot.py
+++ b/bots/good_bad_bot.py
@@ -0,0 +1,74 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from pyspark.sql.types import FloatType
 import zlib
 def zlib_entropy_rate(s):
    sb = s.encode()
    if len(sb) == 0:
        return None
    else:
        return len(zlib.compress(s.encode(),level=6))/len(s.encode())
 zlib_entropy_rate_udf = f.udf(zlib_entropy_rate,FloatType())
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_author.parquet",compression='snappy')
 df = df.withColumn("saidbot",f.lower(f.col("body")).like("%bot%"))
 # df = df.filter(df.subreddit=='seattle')
 # df = df.cache()
 botreplies = df.filter(f.lower(df.body).rlike(".*[good|bad] bot.*"))
 botreplies = botreplies.select([f.col("parent_id").substr(4,100).alias("bot_comment_id"),f.lower(f.col("body")).alias("good_bad_bot"),f.col("link_id").alias("gbbb_link_id")])
 botreplies = botreplies.groupby(['bot_comment_id']).agg(f.count('good_bad_bot').alias("N_goodbad_votes"),
                                                        f.sum((f.lower(f.col('good_bad_bot')).like('%good bot%').astype("double"))).alias("n_good_votes"),
                                                        f.sum((f.lower(f.col('good_bad_bot')).like('%bad bot%').astype("double"))).alias("n_bad_votes"))
 comments_by_author = df.select(['author','id','saidbot']).groupBy('author').agg(f.count('id').alias("N_comments"),
                                                                                f.mean(f.col('saidbot').astype("double")).alias("prop_saidbot"),
                                                                                f.sum(f.col('saidbot').astype("double")).alias("n_saidbot"))
 # pd_comments_by_author = comments_by_author.toPandas()
 # pd_comments_by_author['frac'] = 500 / pd_comments_by_author['N_comments']
 # pd_comments_by_author.loc[pd_comments_by_author.frac > 1, 'frac'] = 1
 # fractions = pd_comments_by_author.loc[:,['author','frac']]
 # fractions = fractions.set_index('author').to_dict()['frac']
 # sampled_author_comments = df.sampleBy("author",fractions).groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments'))
 df = df.withColumn("randn",f.randn(seed=1968))
 win = Window.partitionBy("author").orderBy("randn")
 df = df.withColumn("randRank",f.rank().over(win))
 sampled_author_comments = df.filter(f.col("randRank") <= 1000)
 sampled_author_comments = sampled_author_comments.groupBy('author').agg(f.concat_ws(" ", f.collect_list('body')).alias('comments'))
 author_entropy_rates = sampled_author_comments.select(['author',zlib_entropy_rate_udf(f.col('comments')).alias("entropy_rate")])
 parents = df.join(botreplies, on=df.id==botreplies.bot_comment_id,how='right_outer')
 win1 = Window.partitionBy("author")
 parents = parents.withColumn("first_bot_reply",f.min(f.col("CreatedAt")).over(win1))
 first_bot_reply = parents.filter(f.col("first_bot_reply")==f.col("CreatedAt"))
 first_bot_reply = first_bot_reply.withColumnRenamed("CreatedAt","FB_CreatedAt")
 first_bot_reply = first_bot_reply.withColumnRenamed("id","FB_id")
 comments_since_first_bot_reply = df.join(first_bot_reply,on = 'author',how='right_outer').filter(f.col("CreatedAt")>=f.col("first_bot_reply"))
 comments_since_first_bot_reply = comments_since_first_bot_reply.groupBy("author").agg(f.count("id").alias("N_comments_since_firstbot"))
 bots = parents.groupby(['author']).agg(f.sum('N_goodbad_votes').alias("N_goodbad_votes"),
                                          f.sum(f.col('n_good_votes')).alias("n_good_votes"),
                                          f.sum(f.col('n_bad_votes')).alias("n_bad_votes"),
                                          f.count(f.col('author')).alias("N_bot_posts"))
 bots = bots.join(comments_by_author,on="author",how='left_outer')
 bots = bots.join(comments_since_first_bot_reply,on="author",how='left_outer')
 bots = bots.join(author_entropy_rates,on='author',how='left_outer')
 bots = bots.orderBy("N_goodbad_votes",ascending=False)
 bots = bots.repartition(1)
 bots.write.parquet("/gscratch/comdata/output/reddit_good_bad_bot.parquet",mode='overwrite')
--- a/1
+++ b/1
--- a/clustering.py
+++ b/clustering.py
@@ -1,45 +0,0 @@
 import pandas as pd
 import numpy as np
 from sklearn.cluster import AffinityPropagation
 import fire
 def affinity_clustering(similarities, output, damping=0.5, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968):
    '''
    similarities: feather file with a dataframe of similarity scores
    preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits.
    '''
    df = pd.read_feather(similarities)
    n = df.shape[0]
    mat = np.array(df.drop('subreddit',1))
    mat[range(n),range(n)] = 1
    preference = np.quantile(mat,preference_quantile)
    clustering = AffinityPropagation(damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     copy=False,
                                     preference=preference,
                                     affinity='precomputed',
                                     random_state=random_state).fit(mat)
    print(f"clustering took {clustering.n_iter_} iterations")
    clusters = clustering.labels_
    print(f"found {len(set(clusters))} clusters")
    cluster_data = pd.DataFrame({'subreddit': df.subreddit,'cluster':clustering.labels_})
    cluster_sizes = cluster_data.groupby("cluster").count()
    print(f"the largest cluster has {cluster_sizes.subreddit.max()} members")
    print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
    print(f"{(cluster_sizes.subreddit==1).sum()} clusters have 1 member")
    cluster_data.to_feather(output)
 if __name__ == "__main__":
    fire.Fire(affinity_clustering)
--- a/clustering/Makefile
+++ b/clustering/Makefile
@@ -0,0 +1,199 @@
 #srun_cdsc='srun -p comdata-int -A comdata --time=300:00:00 --time-min=00:15:00 --mem=100G --ntasks=1 --cpus-per-task=28'
 srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
 similarity_data=/gscratch/comdata/output/reddit_similarity
 clustering_data=/gscratch/comdata/output/reddit_clustering
 kmeans_selection_grid=--max_iters=[3000] --n_inits=[10] --n_clusters=[100,500,1000,1250,1500,1750,2000]
 hdbscan_selection_grid=--min_cluster_sizes=[2,3,4,5] --min_samples=[2,3,4,5] --cluster_selection_epsilons=[0,0.01,0.05,0.1,0.15,0.2] --cluster_selection_methods=[eom,leaf]
 affinity_selection_grid=--dampings=[0.5,0.6,0.7,0.8,0.95,0.97,0.99] --preference_quantiles=[0.1,0.3,0.5,0.7,0.9] --convergence_iters=[15]
 authors_10k_input=$(similarity_data)/subreddit_comment_authors_10k.feather
 authors_10k_input_lsi=$(similarity_data)/subreddit_comment_authors_10k_LSI
 authors_10k_output=$(clustering_data)/subreddit_comment_authors_10k
 authors_10k_output_lsi=$(clustering_data)/subreddit_comment_authors_10k_LSI
 authors_tf_10k_input=$(similarity_data)/subreddit_comment_authors-tf_10k.feather
 authors_tf_10k_input_lsi=$(similarity_data)/subreddit_comment_authors-tf_10k_LSI
 authors_tf_10k_output=$(clustering_data)/subreddit_comment_authors-tf_10k
 authors_tf_10k_output_lsi=$(clustering_data)/subreddit_comment_authors-tf_10k_LSI
 terms_10k_input=$(similarity_data)/subreddit_comment_terms_10k.feather
 terms_10k_input_lsi=$(similarity_data)/subreddit_comment_terms_10k_LSI
 terms_10k_output=$(clustering_data)/subreddit_comment_terms_10k
 terms_10k_output_lsi=$(clustering_data)/subreddit_comment_terms_10k_LSI
 all:terms_10k authors_10k authors_tf_10k terms_10k_lsi authors_10k_lsi authors_tf_10k_lsi
 terms_10k:${terms_10k_output}/kmeans/selection_data.csv ${terms_10k_output}/affinity/selection_data.csv ${terms_10k_output}/hdbscan/selection_data.csv
 authors_10k:${authors_10k_output}/kmeans/selection_data.csv ${authors_10k_output}/hdbscan/selection_data.csv ${authors_10k_output}/affinity/selection_data.csv
 authors_tf_10k:${authors_tf_10k_output}/kmeans/selection_data.csv ${authors_tf_10k_output}/hdbscan/selection_data.csv ${authors_tf_10k_output}/affinity/selection_data.csv
 terms_10k_lsi:${terms_10k_output_lsi}/kmeans/selection_data.csv ${terms_10k_output_lsi}/affinity/selection_data.csv ${terms_10k_output_lsi}/hdbscan/selection_data.csv
 authors_10k_lsi:${authors_10k_output_lsi}/kmeans/selection_data.csv ${authors_10k_output_lsi}/hdbscan/selection_data.csv ${authors_10k_output_lsi}/affinity/selection_data.csv
 authors_tf_10k_lsi:${authors_tf_10k_output_lsi}/kmeans/selection_data.csv ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
 ${authors_10k_output}/kmeans/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py kmeans_clustering.py
 	$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/kmeans --savefile=${authors_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) 
 ${terms_10k_output}/kmeans/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py kmeans_clustering.py
 	$(srun_singularity) python3 kmeans_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/kmeans  --savefile=${terms_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) 
 ${authors_tf_10k_output}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py kmeans_clustering.py
 	$(srun_singularity) python3 kmeans_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/kmeans --savefile=${authors_tf_10k_output}/kmeans/selection_data.csv $(kmeans_selection_grid) 
 ${authors_10k_output}/affinity/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py affinity_clustering.py
 	$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/affinity --savefile=${authors_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) 
 ${terms_10k_output}/affinity/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py affinity_clustering.py
 	$(srun_singularity) python3 affinity_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/affinity  --savefile=${terms_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) 
 ${authors_tf_10k_output}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py affinity_clustering.py
 	$(srun_singularity) python3 affinity_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/affinity --savefile=${authors_tf_10k_output}/affinity/selection_data.csv $(affinity_selection_grid) 
 ${authors_10k_output}/hdbscan/selection_data.csv:selection.py ${authors_10k_input} clustering_base.py hdbscan_clustering.py
 	$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_10k_input} --outpath=${authors_10k_output}/hdbscan --savefile=${authors_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) 
 ${terms_10k_output}/hdbscan/selection_data.csv:selection.py ${terms_10k_input} clustering_base.py hdbscan_clustering.py
 	$(srun_singularity) python3 hdbscan_clustering.py --inpath=${terms_10k_input} --outpath=${terms_10k_output}/hdbscan  --savefile=${terms_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) 
 ${authors_tf_10k_output}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input} clustering_base.py hdbscan_clustering.py
 	$(srun_singularity) python3 hdbscan_clustering.py --inpath=${authors_tf_10k_input} --outpath=${authors_tf_10k_output}/hdbscan --savefile=${authors_tf_10k_output}/hdbscan/selection_data.csv $(hdbscan_selection_grid) 
 ## LSI Models
 ${authors_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py kmeans_clustering.py
 	$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/kmeans --savefile=${authors_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
 ${terms_10k_output_lsi}/kmeans/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py kmeans_clustering.py
 	$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/kmeans  --savefile=${terms_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
 ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py kmeans_clustering.py
 	$(srun_singularity) python3 kmeans_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/kmeans --savefile=${authors_tf_10k_output_lsi}/kmeans/selection_data.csv $(kmeans_selection_grid)
 ${authors_10k_output_lsi}/affinity/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py affinity_clustering.py
 	$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/affinity --savefile=${authors_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
 ${terms_10k_output_lsi}/affinity/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py affinity_clustering.py
 	$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/affinity  --savefile=${terms_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
 ${authors_tf_10k_output_lsi}/affinity/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py affinity_clustering.py
 	$(srun_singularity) python3 affinity_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/affinity --savefile=${authors_tf_10k_output_lsi}/affinity/selection_data.csv $(affinity_selection_grid)
 ${authors_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${authors_10k_input_lsi} clustering_base.py hdbscan_clustering.py
 	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_10k_input_lsi} --outpath=${authors_10k_output_lsi}/hdbscan --savefile=${authors_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
 ${terms_10k_output_lsi}/hdbscan/selection_data.csv:selection.py ${terms_10k_input_lsi} clustering_base.py hdbscan_clustering.py
 	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${terms_10k_input_lsi} --outpath=${terms_10k_output_lsi}/hdbscan  --savefile=${terms_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
 ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv:clustering.py ${authors_tf_10k_input_lsi} clustering_base.py hdbscan_clustering.py
 	$(srun_singularity) python3 hdbscan_clustering_lsi.py --inpath=${authors_tf_10k_input_lsi} --outpath=${authors_tf_10k_output_lsi}/hdbscan --savefile=${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv $(hdbscan_selection_grid)
 ${terms_10k_output_lsi}/best_hdbscan.feather:${terms_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
 	$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
 ${authors_tf_10k_output_lsi}/best_hdbscan.feather:${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv pick_best_clustering.py
 	$(srun_singularity) python3 pick_best_clustering.py $< $@ --min_clusters=50 --max_isolates=5000 --min_cluster_size=2
 clean_affinity:
 	rm -f ${authors_10k_output}/affinity/selection_data.csv
 	rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
 	rm -f ${terms_10k_output}/affinity/selection_data.csv
 clean_kmeans:
 	rm -f ${authors_10k_output}/kmeans/selection_data.csv
 	rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
 	rm -f ${terms_10k_output}/kmeans/selection_data.csv
 clean_hdbscan:
 	rm -f ${authors_10k_output}/hdbscan/selection_data.csv
 	rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
 	rm -f ${terms_10k_output}/hdbscan/selection_data.csv
 clean_authors:
 	rm -f ${authors_10k_output}/affinity/selection_data.csv
 	rm -f ${authors_10k_output}/kmeans/selection_data.csv
 	rm -f ${authors_10k_output}/hdbscan/selection_data.csv
 clean_authors_tf:
 	rm -f ${authors_tf_10k_output}/affinity/selection_data.csv
 	rm -f ${authors_tf_10k_output}/kmeans/selection_data.csv
 	rm -f ${authors_tf_10k_output}/hdbscan/selection_data.csv
 clean_terms:
 	rm -f ${terms_10k_output}/affinity/selection_data.csv
 	rm -f ${terms_10k_output}/kmeans/selection_data.csv
 	rm -f ${terms_10k_output}/hdbscan/selection_data.csv
 clean_lsi_affinity:
 	rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
 	rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
 	rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
 clean_lsi_kmeans:
 	rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
 	rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
 	rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
 clean_lsi_hdbscan:
 	rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
 	rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
 	rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
 clean_lsi_authors:
 	rm -f ${authors_10k_output_lsi}/affinity/selection_data.csv
 	rm -f ${authors_10k_output_lsi}/kmeans/selection_data.csv
 	rm -f ${authors_10k_output_lsi}/hdbscan/selection_data.csv
 clean_lsi_authors_tf:
 	rm -f ${authors_tf_10k_output_lsi}/affinity/selection_data.csv
 	rm -f ${authors_tf_10k_output_lsi}/kmeans/selection_data.csv
 	rm -f ${authors_tf_10k_output_lsi}/hdbscan/selection_data.csv
 clean_lsi_terms:
 	rm -f ${terms_10k_output_lsi}/affinity/selection_data.csv
 	rm -f ${terms_10k_output_lsi}/kmeans/selection_data.csv
 	rm -f ${terms_10k_output_lsi}/hdbscan/selection_data.csv
 clean: clean_affinity clean_kmeans clean_hdbscan
 PHONY: clean clean_affinity clean_kmeans clean_hdbscan clean_authors clean_authors_tf clean_terms terms_10k authors_10k authors_tf_10k
 # $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_authors_30k.feather clustering.py
 # 	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors_30k.feather $(clustering_data)/subreddit_comment_authors_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_authors_30k.feather/SUCCESS
 # $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS:selection.py $(similarity_data)/subreddit_comment_terms_30k.feather clustering.py
 # 	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_terms_30k.feather $(clustering_data)/subreddit_comment_terms_30k $(selection_grid) -J 10 && touch $(clustering_data)/subreddit_comment_terms_30k.feather/SUCCESS
 # $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS:clustering.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather
 # 	$(srun_singularity) python3 selection.py $(similarity_data)/subreddit_comment_authors-tf_30k.feather $(clustering_data)/subreddit_comment_authors-tf_30k $(selection_grid) -J 8 && touch $(clustering_data)/subreddit_authors-tf_similarities_30k.feather/SUCCESS
 # $(clustering_data)/subreddit_comment_authors_100k.feather:clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather
 # 	 $(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_authors_100k.feather $(clustering_data)/subreddit_comment_authors_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.85 --damping=0.85
 # $(clustering_data)/comment_terms_100k.feather:clustering.py $(similarity_data)/subreddit_comment_terms_100k.feather
 # 	$(srun_singularity) python3 clustering.py $(similarity_data)/comment_terms_10000.feather $(clustering_data)/comment_terms_10000.feather ---max_iter=1000 --convergence_iter=15 --preference_quantile=0.9 --damping=0.5
 # $(clustering_data)/subreddit_comment_author-tf_100k.feather:clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.feather
 # 	$(srun_singularity) python3 clustering.py $(similarity_data)/subreddit_comment_author-tf_100k.parquet $(clustering_data)/subreddit_comment_author-tf_100k.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.5 --damping=0.85
 # it's pretty difficult to get a result that isn't one huge megacluster. A sign that it's bullcrap
 # /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
 # 	./clustering.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather ---max_iter=400 --convergence_iter=15 --preference_quantile=0.9 --damping=0.85
 # /gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
 # 	start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet --output=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather
 # /gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather:fit_tsne.py /gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather
 # 	python3 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather --output=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather
 # /gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather:clustering.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
 # #	$srun_cdsc python3
 # 	start_spark_and_run.sh 1 fit_tsne.py --similarities=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --output=/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather
--- a/clustering/affinity/subreddit_comment_authors_10000_a.feather
+++ b/clustering/affinity/subreddit_comment_authors_10000_a.feather
--- a/clustering/affinity_clustering.py
+++ b/clustering/affinity_clustering.py
@@ -0,0 +1,129 @@
 from sklearn.cluster import AffinityPropagation
 from dataclasses import dataclass
 from clustering_base import clustering_result, clustering_job
 from grid_sweep import grid_sweep
 from pathlib import Path
 from itertools import product, starmap
 import fire
 import sys
 import numpy as np
 # silhouette is the only one that doesn't need the feature matrix. So it's probably the only one that's worth trying. 
@dataclass
 class affinity_clustering_result(clustering_result):
    damping:float
    convergence_iter:int
    preference_quantile:float
    preference:float
    max_iter:int
 class affinity_job(clustering_job):
    def __init__(self, infile, outpath, name, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
        super().__init__(infile,
                         outpath,
                         name,
                         call=self._affinity_clustering,
                         preference_quantile=preference_quantile,
                         damping=damping,
                         max_iter=max_iter,
                         convergence_iter=convergence_iter,
                         random_state=1968,
                         verbose=verbose)
        self.damping=damping
        self.max_iter=max_iter
        self.convergence_iter=convergence_iter
        self.preference_quantile=preference_quantile
    def _affinity_clustering(self, mat, preference_quantile, *args, **kwargs):
        mat = 1-mat
        preference = np.quantile(mat, preference_quantile)
        self.preference = preference
        print(f"preference is {preference}")
        print("data loaded")
        sys.stdout.flush()
        clustering = AffinityPropagation(*args,
                                         preference=preference,
                                         affinity='precomputed',
                                         copy=False,
                                         **kwargs).fit(mat)
        return clustering
    def get_info(self):
        result = super().get_info()
        self.result=affinity_clustering_result(**result.__dict__,
                                               damping=self.damping,
                                               max_iter=self.max_iter,
                                               convergence_iter=self.convergence_iter,
                                               preference_quantile=self.preference_quantile,
                                               preference=self.preference)
        return self.result
 class affinity_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):
        super().__init__(affinity_job,
                         _afffinity_grid_sweep,
                         inpath,
                         outpath,
                         self.namer,
                         *args,
                         **kwargs)
    def namer(self,
              damping,
              max_iter,
              convergence_iter,
              preference_quantile):
        return f"damp-{damping}_maxit-{max_iter}_convit-{convergence_iter}_prefq-{preference_quantile}"
 def run_affinity_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5],n_cores=10):
    """Run affinity clustering once or more with different parameters.
    Usage:
    affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv>
    Keword arguments:
    savefile: path to save the metadata and diagnostics 
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit kmeans clusterings.
    dampings:one or more numbers in [0.5, 1). damping parameter in affinity propagatin clustering. 
    preference_quantiles:one or more numbers in (0,1) for selecting the 'preference' parameter.
    convergence_iters:one or more integers of number of iterations without improvement before stopping.
    max_iters: one or more numbers of different maximum interations.
    """
    obj = affinity_grid_sweep(inpath,
                         outpath,
                         map(float,dampings),
                         map(int,max_iters),
                         map(int,convergence_iters),
                         map(float,preference_quantiles))
    obj.run(n_cores)
    obj.save(savefile)
 def test_select_affinity_clustering():
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                           "test_hdbscan_author30k",
    #                           min_cluster_sizes=[2],
    #                           min_samples=[1,2],
    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                           cluster_selection_methods=['eom','leaf'],
    #                           lsi_dimensions='all')
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_affinity";
    dampings=[0.8,0.9]
    max_iters=[100000]
    convergence_iters=[15]
    preference_quantiles=[0.5,0.7]
    gs = affinity_lsi_grid_sweep(inpath, 'all', outpath, dampings, max_iters, convergence_iters, preference_quantiles)
    gs.run(20)
    gs.save("test_affinity/lsi_sweep.csv")
 if __name__ == "__main__":
    fire.Fire(run_affinity_grid_sweep)
--- a/clustering/affinity_clustering_lsi.py
+++ b/clustering/affinity_clustering_lsi.py
@@ -0,0 +1,99 @@
 import fire
 from affinity_clustering import affinity_clustering_result, affinity_job, affinity_grid_sweep
 from grid_sweep import grid_sweep
 from lsi_base import lsi_result_mixin, lsi_grid_sweep, lsi_mixin
 from dataclasses import dataclass
@dataclass
 class affinity_clustering_result_lsi(affinity_clustering_result, lsi_result_mixin):
    pass
 class affinity_lsi_job(affinity_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(infile,
                         outpath,
                         name,
                         *args,
                         **kwargs)
        super().set_lsi_dims(lsi_dims)
    def get_info(self):
        result = super().get_info()
        self.result = affinity_clustering_result_lsi(**result.__dict__,
                                                     lsi_dimensions=self.lsi_dims)
        return self.result
 class affinity_lsi_grid_sweep(lsi_grid_sweep):
    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 dampings=[0.9],
                 max_iters=[10000],
                 convergence_iters=[30],
                 preference_quantiles=[0.5]):
        super().__init__(affinity_lsi_job,
                         _affinity_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         dampings,
                         max_iters,
                         convergence_iters,
                         preference_quantiles)
 class _affinity_lsi_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 *args,
                 **kwargs):
        self.lsi_dim = lsi_dim
        self.jobtype = affinity_lsi_job
        super().__init__(self.jobtype,
                         inpath,
                         outpath,
                         self.namer,
                         [self.lsi_dim],
                         *args,
                         **kwargs)
    def namer(self, *args, **kwargs):
        s = affinity_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s
 def run_affinity_lsi_grid_sweep(savefile, inpath, outpath, dampings=[0.8], max_iters=[3000], convergence_iters=[30], preference_quantiles=[0.5], lsi_dimensions='all',n_cores=30):
    """Run affinity clustering once or more with different parameters.
    Usage:
    affinity_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --max_iters=<csv> --dampings=<csv> --preference_quantiles=<csv> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    Keword arguments:
    savefile: path to save the metadata and diagnostics 
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit kmeans clusterings.
    dampings:one or more numbers in [0.5, 1). damping parameter in affinity propagatin clustering. 
    preference_quantiles:one or more numbers in (0,1) for selecting the 'preference' parameter.
    convergence_iters:one or more integers of number of iterations without improvement before stopping.
    max_iters: one or more numbers of different maximum interations.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    """
    obj = affinity_lsi_grid_sweep(inpath,
                            lsi_dimensions,
                            outpath,
                            map(float,dampings),
                            map(int,max_iters),
                            map(int,convergence_iters),
                            map(float,preference_quantiles))
    obj.run(n_cores)
    obj.save(savefile)
 if __name__ == "__main__":
    fire.Fire(run_affinity_lsi_grid_sweep)
--- a/clustering/clustering.py
+++ b/clustering/clustering.py
@@ -0,0 +1,52 @@
 #!/usr/bin/env python3
 # TODO: replace prints with logging.
 import sys
 import pandas as pd
 import numpy as np
 from sklearn.cluster import AffinityPropagation
 import fire
 from pathlib import Path
 from multiprocessing import cpu_count
 from dataclasses import dataclass
 from clustering_base import sim_to_dist, process_clustering_result, clustering_result, read_similarity_mat
 def affinity_clustering(similarities, output, *args, **kwargs):
    subreddits, mat = read_similarity_mat(similarities)
    clustering = _affinity_clustering(mat, *args, **kwargs)
    cluster_data = process_clustering_result(clustering, subreddits)
    cluster_data['algorithm'] = 'affinity'
    return(cluster_data)
 def _affinity_clustering(mat, subreddits, output, damping=0.9, max_iter=100000, convergence_iter=30, preference_quantile=0.5, random_state=1968, verbose=True):
    '''
    similarities: matrix of similarity scores
    preference_quantile: parameter controlling how many clusters to make. higher values = more clusters. 0.85 is a good value with 3000 subreddits.
    damping: parameter controlling how iterations are merged. Higher values make convergence faster and more dependable. 0.85 is a good value for the 10000 subreddits by author. 
    '''
    print(f"damping:{damping}; convergenceIter:{convergence_iter}; preferenceQuantile:{preference_quantile}")
    preference = np.quantile(mat,preference_quantile)
    print(f"preference is {preference}")
    print("data loaded")
    sys.stdout.flush()
    clustering = AffinityPropagation(damping=damping,
                                     max_iter=max_iter,
                                     convergence_iter=convergence_iter,
                                     copy=False,
                                     preference=preference,
                                     affinity='precomputed',
                                     verbose=verbose,
                                     random_state=random_state).fit(mat)
    cluster_data = process_clustering_result(clustering, subreddits)
    output = Path(output)
    output.parent.mkdir(parents=True,exist_ok=True)
    cluster_data.to_feather(output)
    print(f"saved {output}")
    return clustering
 if __name__ == "__main__":
    fire.Fire(affinity_clustering)
--- a/clustering/clustering_base.py
+++ b/clustering/clustering_base.py
@@ -0,0 +1,105 @@
 from pathlib import Path
 import numpy as np
 import pandas as pd
 from dataclasses import dataclass
 from sklearn.metrics import silhouette_score, silhouette_samples
 from collections import Counter
 # this is meant to be an interface, not created directly
 class clustering_job:
    def __init__(self, infile, outpath, name, call, *args, **kwargs):
        self.outpath = Path(outpath)
        self.call = call
        self.args = args
        self.kwargs = kwargs
        self.infile = Path(infile)
        self.name = name
        self.hasrun = False
    def run(self):
        self.subreddits, self.mat = self.read_distance_mat(self.infile)
        self.clustering = self.call(self.mat, *self.args, **self.kwargs)
        self.cluster_data = self.process_clustering(self.clustering, self.subreddits)
        self.score = self.silhouette()
        self.outpath.mkdir(parents=True, exist_ok=True)
        self.cluster_data.to_feather(self.outpath/(self.name + ".feather"))
        self.hasrun = True
    def get_info(self):
        if not self.hasrun:
            self.run()
        self.result = clustering_result(outpath=str(self.outpath.resolve()),
                                        silhouette_score=self.score,
                                        name=self.name,
                                        n_clusters=self.n_clusters,
                                        n_isolates=self.n_isolates,
                                        silhouette_samples = self.silsampout
                                        )
        return self.result
    def silhouette(self):
        counts = Counter(self.clustering.labels_)
        singletons = [key for key, value in counts.items() if value == 1]
        isolates = (self.clustering.labels_ == -1) | (np.isin(self.clustering.labels_,np.array(singletons)))
        scoremat = self.mat[~isolates][:,~isolates]
        if self.n_clusters > 1:
            score = silhouette_score(scoremat, self.clustering.labels_[~isolates], metric='precomputed')
            silhouette_samp = silhouette_samples(self.mat, self.clustering.labels_, metric='precomputed')
            silhouette_samp = pd.DataFrame({'subreddit':self.subreddits,'score':silhouette_samp})
            self.outpath.mkdir(parents=True, exist_ok=True)
            silsampout = self.outpath / ("silhouette_samples-" + self.name +  ".feather")
            self.silsampout = silsampout.resolve()
            silhouette_samp.to_feather(self.silsampout)
        else:
            score = None
            self.silsampout = None
        return score
    def read_distance_mat(self, similarities, use_threads=True):
        df = pd.read_feather(similarities, use_threads=use_threads)
        mat = np.array(df.drop('_subreddit',1))
        n = mat.shape[0]
        mat[range(n),range(n)] = 1
        return (df._subreddit,1-mat)
    def process_clustering(self, clustering, subreddits):
        if hasattr(clustering,'n_iter_'):
            print(f"clustering took {clustering.n_iter_} iterations")
        clusters = clustering.labels_
        self.n_clusters = len(set(clusters))
        print(f"found {self.n_clusters} clusters")
        cluster_data = pd.DataFrame({'subreddit': subreddits,'cluster':clustering.labels_})
        cluster_sizes = cluster_data.groupby("cluster").count().reset_index()
        print(f"the largest cluster has {cluster_sizes.loc[cluster_sizes.cluster!=-1].subreddit.max()} members")
        print(f"the median cluster has {cluster_sizes.subreddit.median()} members")
        n_isolates1 = (cluster_sizes.subreddit==1).sum()
        print(f"{n_isolates1} clusters have 1 member")
        n_isolates2 = cluster_sizes.loc[cluster_sizes.cluster==-1,:]['subreddit'].to_list()
        if len(n_isolates2) > 0:
            n_isloates2 = n_isolates2[0]
        print(f"{n_isolates2} subreddits are in cluster -1",flush=True)
        if n_isolates1 == 0:
            self.n_isolates = n_isolates2
        else:
            self.n_isolates = n_isolates1
        return cluster_data
@dataclass
 class clustering_result:
    outpath:Path
    silhouette_score:float
    name:str
    n_clusters:int
    n_isolates:int
    silhouette_samples:str
--- a/clustering/fit_tsne.py
+++ b/clustering/fit_tsne.py
@@ -5,7 +5,7 @@ from numpy import random
 import numpy as np
 from sklearn.manifold import TSNE
-similarities = "term_similarities_10000.feather"
+similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet"
 def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=10000, early_exaggeration=20):
    '''
@@ -17,7 +17,7 @@ def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=1000
    df = pd.read_feather(similarities)
    n = df.shape[0]
-    mat = np.array(df.drop('subreddit',1),dtype=np.float64)
+    mat = np.array(df.drop('_subreddit',1),dtype=np.float64)
    mat[range(n),range(n)] = 1
    mat[mat > 1] = 1
    dist = 2*np.arccos(mat)/np.pi
@@ -26,7 +26,7 @@ def fit_tsne(similarities, output, learning_rate=750, perplexity=50, n_iter=1000
    tsne_fit_whole = tsne_fit_model.fit_transform(dist)
-    plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], 'subreddit':df.subreddit})
+    plot_data = pd.DataFrame({'x':tsne_fit_whole[:,0],'y':tsne_fit_whole[:,1], '_subreddit':df['_subreddit']})
    plot_data.to_feather(output)
--- a/clustering/grid_sweep.py
+++ b/clustering/grid_sweep.py
@@ -0,0 +1,33 @@
 from pathlib import Path
 from multiprocessing import Pool, cpu_count
 from itertools import product, chain
 import pandas as pd
 class grid_sweep:
    def __init__(self, jobtype, inpath, outpath, namer, *args):
        self.jobtype = jobtype
        self.namer = namer
        print(*args)
        grid = list(product(*args))
        inpath = Path(inpath)
        outpath = Path(outpath)
        self.hasrun = False
        self.grid = [(inpath,outpath,namer(*g)) + g for g in grid]
        self.jobs = [jobtype(*g) for g in self.grid]
    def run(self, cores=20):
        if cores is not None and cores > 1:
            with Pool(cores) as pool:
                infos = pool.map(self.jobtype.get_info, self.jobs)
        else:
            infos = map(self.jobtype.get_info, self.jobs)
        self.infos = pd.DataFrame(infos)
        self.hasrun = True
    def save(self, outcsv):
        if not self.hasrun:
            self.run()
        outcsv = Path(outcsv)
        outcsv.parent.mkdir(parents=True, exist_ok=True)
        self.infos.to_csv(outcsv)
--- a/clustering/hdbscan_clustering.py
+++ b/clustering/hdbscan_clustering.py
@@ -0,0 +1,159 @@
 from clustering_base import clustering_result, clustering_job
 from grid_sweep import grid_sweep
 from dataclasses import dataclass
 import hdbscan
 from sklearn.neighbors import NearestNeighbors
 import plotnine as pn
 import numpy as np
 from itertools import product, starmap, chain
 import pandas as pd
 from multiprocessing import cpu_count
 import fire
 def test_select_hdbscan_clustering():
    # select_hdbscan_clustering("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_30k_LSI",
    #                           "test_hdbscan_author30k",
    #                           min_cluster_sizes=[2],
    #                           min_samples=[1,2],
    #                           cluster_selection_epsilons=[0,0.05,0.1,0.15],
    #                           cluster_selection_methods=['eom','leaf'],
    #                           lsi_dimensions='all')
    inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_authors_compex_LSI"
    outpath = "test_hdbscan";
    min_cluster_sizes=[2,3,4];
    min_samples=[1,2,3];
    cluster_selection_epsilons=[0,0.1,0.3,0.5];
    cluster_selection_methods=[1];
    lsi_dimensions='all'
    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
    gs.run(20)
    gs.save("test_hdbscan/lsi_sweep.csv")
    # job1 = hdbscan_lsi_job(infile=inpath, outpath=outpath, name="test", lsi_dims=500, min_cluster_size=2, min_samples=1,cluster_selection_epsilon=0,cluster_selection_method='eom')
    # job1.run()
    # print(job1.get_info())
    # df = pd.read_csv("test_hdbscan/selection_data.csv")
    # test_select_hdbscan_clustering()
    # check_clusters = pd.read_feather("test_hdbscan/500_2_2_0.1_eom.feather")
    # silscores = pd.read_feather("test_hdbscan/silhouette_samples500_2_2_0.1_eom.feather")
    # c = check_clusters.merge(silscores,on='subreddit')#    fire.Fire(select_hdbscan_clustering)
 class hdbscan_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):
        super().__init__(hdbscan_job, inpath, outpath, self.namer, *args, **kwargs)
    def namer(self,
              min_cluster_size,
              min_samples,
              cluster_selection_epsilon,
              cluster_selection_method):
        return f"mcs-{min_cluster_size}_ms-{min_samples}_cse-{cluster_selection_epsilon}_csm-{cluster_selection_method}"
@dataclass
 class hdbscan_clustering_result(clustering_result):
    min_cluster_size:int
    min_samples:int
    cluster_selection_epsilon:float
    cluster_selection_method:str
 class hdbscan_job(clustering_job):
    def __init__(self, infile, outpath, name, min_cluster_size=2, min_samples=1, cluster_selection_epsilon=0, cluster_selection_method='eom'):
        super().__init__(infile,
                         outpath,
                         name,
                         call=hdbscan_job._hdbscan_clustering,
                         min_cluster_size=min_cluster_size,
                         min_samples=min_samples,
                         cluster_selection_epsilon=cluster_selection_epsilon,
                         cluster_selection_method=cluster_selection_method
                         )
        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.cluster_selection_method = cluster_selection_method
 #        self.mat = 1 - self.mat
    def _hdbscan_clustering(mat, *args, **kwargs):
        print(f"running hdbscan clustering. args:{args}. kwargs:{kwargs}")
        print(mat)
        clusterer = hdbscan.HDBSCAN(metric='precomputed',
                                    core_dist_n_jobs=cpu_count(),
                                    *args,
                                    **kwargs,
                                    )
        clustering = clusterer.fit(mat.astype('double'))
        return(clustering)
    def get_info(self):
        result = super().get_info()
        self.result = hdbscan_clustering_result(**result.__dict__,
                                                min_cluster_size=self.min_cluster_size,
                                                min_samples=self.min_samples,
                                                cluster_selection_epsilon=self.cluster_selection_epsilon,
                                                cluster_selection_method=self.cluster_selection_method)
        return self.result
 def run_hdbscan_grid_sweep(savefile, inpath, outpath,  min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=['eom']):
    """Run hdbscan clustering once or more with different parameters.
    Usage:
    hdbscan_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=<csv "eom"|"leaf">
    Keword arguments:
    savefile: path to save the metadata and diagnostics 
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit kmeans clusterings.
    min_cluster_sizes: one or more integers indicating the minumum cluster size
    min_samples: one ore more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilon: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_method: "eom" or "leaf" eom gives larger clusters. 
    """    
    obj = hdbscan_grid_sweep(inpath,
                             outpath,
                             map(int,min_cluster_sizes),
                             map(int,min_samples),
                             map(float,cluster_selection_epsilons),
                             cluster_selection_methods)
    obj.run()
    obj.save(savefile)
 def KNN_distances_plot(mat,outname,k=2):
    nbrs = NearestNeighbors(n_neighbors=k,algorithm='auto',metric='precomputed').fit(mat)
    distances, indices = nbrs.kneighbors(mat)
    d2 = distances[:,-1]
    df = pd.DataFrame({'dist':d2})
    df = df.sort_values("dist",ascending=False)
    df['idx'] = np.arange(0,d2.shape[0]) + 1
    p = pn.qplot(x='idx',y='dist',data=df,geom='line') + pn.scales.scale_y_continuous(minor_breaks = np.arange(0,50)/50,
                                                                                      breaks = np.arange(0,10)/10)
    p.save(outname,width=16,height=10)
 def make_KNN_plots():
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat,k=2,outname='terms_knn_dist2.png')
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat,k=2,outname='authors_knn_dist2.png')
    similarities = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k.feather"
    subreddits, mat = read_similarity_mat(similarities)
    mat = sim_to_dist(mat)
    KNN_distances_plot(mat,k=2,outname='authors-tf_knn_dist2.png')
 if __name__ == "__main__":
    fire.Fire(run_hdbscan_grid_sweep)
 #    test_select_hdbscan_clustering()
    #fire.Fire(select_hdbscan_clustering)  
--- a/clustering/hdbscan_clustering_lsi.py
+++ b/clustering/hdbscan_clustering_lsi.py
@@ -0,0 +1,101 @@
 from hdbscan_clustering import hdbscan_job, hdbscan_grid_sweep, hdbscan_clustering_result
 from lsi_base import lsi_grid_sweep, lsi_mixin, lsi_result_mixin
 from grid_sweep import grid_sweep
 import fire
 from dataclasses import dataclass
@dataclass
 class hdbscan_clustering_result_lsi(hdbscan_clustering_result, lsi_result_mixin):
    pass 
 class hdbscan_lsi_job(hdbscan_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(
                         infile,
                         outpath,
                         name,
                         *args,
                         **kwargs)
        super().set_lsi_dims(lsi_dims)
    def get_info(self):
        partial_result = super().get_info()
        self.result = hdbscan_clustering_result_lsi(**partial_result.__dict__,
                                                    lsi_dimensions=self.lsi_dims)
        return self.result
 class hdbscan_lsi_grid_sweep(lsi_grid_sweep):
    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 min_cluster_sizes,
                 min_samples,
                 cluster_selection_epsilons,
                 cluster_selection_methods
                 ):
        super().__init__(hdbscan_lsi_job,
                         _hdbscan_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         min_cluster_sizes,
                         min_samples,
                         cluster_selection_epsilons,
                         cluster_selection_methods)
 class _hdbscan_lsi_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 *args,
                 **kwargs):
        print(args)
        print(kwargs)
        self.lsi_dim = lsi_dim
        self.jobtype = hdbscan_lsi_job
        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
    def namer(self, *args, **kwargs):
        s = hdbscan_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s
 def run_hdbscan_lsi_grid_sweep(savefile, inpath, outpath,  min_cluster_sizes=[2], min_samples=[1], cluster_selection_epsilons=[0], cluster_selection_methods=[1],lsi_dimensions='all'):
    """Run hdbscan clustering once or more with different parameters.
    Usage:
    hdbscan_clustering_lsi --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --min_cluster_sizes=<csv> --min_samples=<csv> --cluster_selection_epsilons=<csv> --cluster_selection_methods=[eom]> --lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    Keword arguments:
    savefile: path to save the metadata and diagnostics 
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit clusterings.
    min_cluster_sizes: one or more integers indicating the minumum cluster size
    min_samples: one ore more integers indicating the minimum number of samples used in the algorithm
    cluster_selection_epsilons: one or more similarity thresholds for transition from dbscan to hdbscan
    cluster_selection_methods: one or more of "eom" or "leaf" eom gives larger clusters. 
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    """    
    obj = hdbscan_lsi_grid_sweep(inpath,
                                 lsi_dimensions,
                                 outpath,
                                 list(map(int,min_cluster_sizes)),
                                 list(map(int,min_samples)),
                                 list(map(float,cluster_selection_epsilons)),
                                 cluster_selection_methods)
    obj.run(10)
    obj.save(savefile)
 if __name__ == "__main__":
    fire.Fire(run_hdbscan_lsi_grid_sweep)
--- a/clustering/kmeans_clustering.py
+++ b/clustering/kmeans_clustering.py
@@ -0,0 +1,105 @@
 from sklearn.cluster import KMeans
 import fire
 from pathlib import Path
 from dataclasses import dataclass
 from clustering_base import clustering_result, clustering_job
 from grid_sweep import grid_sweep
@dataclass
 class kmeans_clustering_result(clustering_result):
    n_clusters:int
    n_init:int
    max_iter:int
 class kmeans_job(clustering_job):
    def __init__(self, infile, outpath, name, n_clusters, n_init=10, max_iter=100000, random_state=1968, verbose=True):
        super().__init__(infile,
                         outpath,
                         name,
                         call=kmeans_job._kmeans_clustering,
                         n_clusters=n_clusters,
                         n_init=n_init,
                         max_iter=max_iter,
                         random_state=random_state,
                         verbose=verbose)
        self.n_clusters=n_clusters
        self.n_init=n_init
        self.max_iter=max_iter
    def _kmeans_clustering(mat, *args, **kwargs):
        clustering = KMeans(*args,
                            **kwargs,
                            ).fit(mat)
        return clustering
    def get_info(self):
        result = super().get_info()
        self.result = kmeans_clustering_result(**result.__dict__,
                                               n_init=self.n_init,
                                               max_iter=self.max_iter)
        return self.result
 class kmeans_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 *args,
                 **kwargs):
        super().__init__(kmeans_job, inpath, outpath, self.namer, *args, **kwargs)
    def namer(self,
             n_clusters,
             n_init,
             max_iter):
        return f"nclusters-{n_clusters}_nit-{n_init}_maxit-{max_iter}"
 def test_select_kmeans_clustering():
    inpath = "/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/"
    outpath = "test_kmeans";
    n_clusters=[200,300,400];
    n_init=[1,2,3];
    max_iter=[100000]
    gs = kmeans_lsi_grid_sweep(inpath, 'all', outpath, n_clusters, n_init, max_iter)
    gs.run(1)
    cluster_selection_epsilons=[0,0.1,0.3,0.5];
    cluster_selection_methods=['eom'];
    lsi_dimensions='all'
    gs = hdbscan_lsi_grid_sweep(inpath, "all", outpath, min_cluster_sizes, min_samples, cluster_selection_epsilons, cluster_selection_methods)
    gs.run(20)
    gs.save("test_hdbscan/lsi_sweep.csv")
 def run_kmeans_grid_sweep(savefile, inpath, outpath,  n_clusters=[500], n_inits=[1], max_iters=[3000]):
    """Run kmeans clustering once or more with different parameters.
    Usage:
    kmeans_clustering.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>
    Keword arguments:
    savefile: path to save the metadata and diagnostics 
    inpath: path to feather data containing a labeled matrix of subreddit similarities.
    outpath: path to output fit kmeans clusterings.
    n_clusters: one or more numbers of kmeans clusters to select.
    n_inits: one or more numbers of different initializations to use for each clustering.
    max_iters: one or more numbers of different maximum interations. 
    """    
    obj = kmeans_grid_sweep(inpath,
                            outpath,
                            map(int,n_clusters),
                            map(int,n_inits),
                            map(int,max_iters))
    obj.run(1)
    obj.save(savefile)
 if __name__ == "__main__":
    fire.Fire(run_kmeans_grid_sweep)
--- a/clustering/kmeans_clustering_lsi.py
+++ b/clustering/kmeans_clustering_lsi.py
@@ -0,0 +1,93 @@
 import fire
 from dataclasses import dataclass
 from kmeans_clustering import kmeans_job, kmeans_clustering_result, kmeans_grid_sweep
 from lsi_base import lsi_mixin, lsi_result_mixin, lsi_grid_sweep
 from grid_sweep import grid_sweep
@dataclass
 class kmeans_clustering_result_lsi(kmeans_clustering_result, lsi_result_mixin):
    pass
 class kmeans_lsi_job(kmeans_job, lsi_mixin):
    def __init__(self, infile, outpath, name, lsi_dims, *args, **kwargs):
        super().__init__(infile,
                         outpath,
                         name,
                         *args,
                         **kwargs)
        super().set_lsi_dims(lsi_dims)
    def get_info(self):
        result = super().get_info()
        self.result = kmeans_clustering_result_lsi(**result.__dict__,
                                                   lsi_dimensions=self.lsi_dims)
        return self.result
 class _kmeans_lsi_grid_sweep(grid_sweep):
    def __init__(self,
                 inpath,
                 outpath,
                 lsi_dim,
                 *args,
                 **kwargs):
        print(args)
        print(kwargs)
        self.lsi_dim = lsi_dim
        self.jobtype = kmeans_lsi_job
        super().__init__(self.jobtype, inpath, outpath, self.namer, [self.lsi_dim], *args, **kwargs)
    def namer(self, *args, **kwargs):
        s = kmeans_grid_sweep.namer(self, *args[1:], **kwargs)
        s += f"_lsi-{self.lsi_dim}"
        return s
 class kmeans_lsi_grid_sweep(lsi_grid_sweep):
    def __init__(self,
                 inpath,
                 lsi_dims,
                 outpath,
                 n_clusters,
                 n_inits,
                 max_iters
                 ):
        super().__init__(kmeans_lsi_job,
                         _kmeans_lsi_grid_sweep,
                         inpath,
                         lsi_dims,
                         outpath,
                         n_clusters,
                         n_inits,
                         max_iters)
 def run_kmeans_lsi_grid_sweep(savefile, inpath, outpath,  n_clusters=[500], n_inits=[1], max_iters=[3000], lsi_dimensions="all"):
    """Run kmeans clustering once or more with different parameters.
    Usage:
    kmeans_clustering_lsi.py --savefile=SAVEFILE --inpath=INPATH --outpath=OUTPATH d--lsi_dimensions=<"all"|csv number of LSI dimensions to use> --n_clusters=<csv number of clusters> --n_inits=<csv> --max_iters=<csv>
    Keword arguments:
    savefile: path to save the metadata and diagnostics 
    inpath: path to folder containing feather files with LSI similarity labeled matrices of subreddit similarities.
    outpath: path to output fit kmeans clusterings.
    lsi_dimensions: either "all" or one or more available lsi similarity dimensions at INPATH.
    n_clusters: one or more numbers of kmeans clusters to select.
    n_inits: one or more numbers of different initializations to use for each clustering.
    max_iters: one or more numbers of different maximum interations. 
    """    
    obj = kmeans_lsi_grid_sweep(inpath,
                                lsi_dimensions,
                                outpath,
                                list(map(int,n_clusters)),
                                list(map(int,n_inits)),
                                list(map(int,max_iters))
                                )
    obj.run(1)
    obj.save(savefile)
 if __name__ == "__main__":
    fire.Fire(run_kmeans_lsi_grid_sweep)
--- a/clustering/lsi_base.py
+++ b/clustering/lsi_base.py
@@ -0,0 +1,29 @@
 from clustering_base import clustering_job, clustering_result
 from grid_sweep import grid_sweep
 from dataclasses import dataclass
 from itertools import chain
 from pathlib import Path
 class lsi_mixin():
    def set_lsi_dims(self, lsi_dims):
        self.lsi_dims = lsi_dims
@dataclass
 class lsi_result_mixin:
    lsi_dimensions:int
 class lsi_grid_sweep(grid_sweep):
    def __init__(self, jobtype, subsweep, inpath, lsi_dimensions, outpath, *args, **kwargs):
        self.jobtype = jobtype
        self.subsweep = subsweep
        inpath = Path(inpath)
        if lsi_dimensions == 'all':
            lsi_paths = list(inpath.glob("*.feather"))
        else:
            lsi_paths = [inpath / (str(dim) + '.feather') for dim in lsi_dimensions]
        print(lsi_paths)
        lsi_nums = [int(p.stem) for p in lsi_paths]
        self.hasrun = False
        self.subgrids = [self.subsweep(lsi_path, outpath,  lsi_dim, *args, **kwargs) for lsi_dim, lsi_path in zip(lsi_nums, lsi_paths)]
        self.jobs = list(chain(*map(lambda gs: gs.jobs, self.subgrids)))
--- a/clustering/pick_best_clustering.py
+++ b/clustering/pick_best_clustering.py
@@ -0,0 +1,33 @@
 #!/usr/bin/env python3
 import fire
 import pandas as pd
 from pathlib import Path
 import shutil
 selection_data="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/clustering/comment_authors_compex_LSI/selection_data.csv"
 outpath = 'test_best.feather'
 min_clusters=50; max_isolates=7500; min_cluster_size=2
 # pick the best clustering according to silhouette score subject to contraints
 def pick_best_clustering(selection_data, output, min_clusters, max_isolates, min_cluster_size):
    df = pd.read_csv(selection_data,index_col=0)
    df = df.sort_values("silhouette_score",ascending=False)
    # not sure I fixed the bug underlying this fully or not.
    df['n_isolates_str'] = df.n_isolates.str.strip("[]")
    df['n_isolates_0'] = df['n_isolates_str'].apply(lambda l: len(l) == 0)
    df.loc[df.n_isolates_0,'n_isolates'] = 0
    df.loc[~df.n_isolates_0,'n_isolates'] = df.loc[~df.n_isolates_0].n_isolates_str.apply(lambda l: int(l))
    best_cluster = df[(df.n_isolates <= max_isolates)&(df.n_clusters >= min_clusters)&(df.min_cluster_size==min_cluster_size)]
    best_cluster = best_cluster.iloc[0]
    best_lsi_dimensions = best_cluster.lsi_dimensions
    print(best_cluster.to_dict())
    best_path = Path(best_cluster.outpath) / (str(best_cluster['name']) + ".feather")
    shutil.copy(best_path,output)
    print(f"lsi dimensions:{best_lsi_dimensions}")
 if __name__ == "__main__":
    fire.Fire(pick_best_clustering)
--- a/clustering/selection.py
+++ b/clustering/selection.py
@@ -0,0 +1,38 @@
 import pandas as pd
 import plotnine as pn
 from pathlib import Path
 from clustering.fit_tsne import fit_tsne
 from visualization.tsne_vis import build_visualization
 df = pd.read_csv("/gscratch/comdata/output/reddit_clustering/subreddit_comment_authors-tf_10k_LSI/hdbscan/selection_data.csv",index_col=0)
 # plot silhouette_score as a function of isolates
 df = df.sort_values("silhouette_score")
 df['n_isolates'] = df.n_isolates.str.split("\n0").apply(lambda rg: int(rg[1]))
 p = pn.ggplot(df,pn.aes(x='n_isolates',y='silhouette_score')) + pn.geom_point()
 p.save("isolates_x_score.png")
 p = pn.ggplot(df,pn.aes(y='n_clusters',x='n_isolates',color='silhouette_score')) + pn.geom_point()
 p.save("clusters_x_isolates.png")
 # the best result for hdbscan seems like this one: it has a decent number of 
 # i think I prefer the 'eom' clustering style because larger clusters are less likely to suffer from ommitted variables
 best_eom = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='eom')&(df.min_cluster_size==2)].iloc[df.shape[1]]
 best_lsi = df[(df.n_isolates <5000)&(df.silhouette_score>0.4)&(df.cluster_selection_method=='leaf')&(df.min_cluster_size==2)].iloc[df.shape[1]]
 tsne_data = Path("./clustering/authors-tf_lsi850_tsne.feather")
 if not tnse_data.exists():
    fit_tsne("/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather",
             tnse_data)
 build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
                    Path(best_eom.outpath)/(best_eom['name']+'.feather'),
                    "./authors-tf_lsi850_best_eom.html")
 build_visualization("./clustering/authors-tf_lsi850_tsne.feather",
                    Path(best_leaf.outpath)/(best_leaf['name']+'.feather'),
                    "./authors-tf_lsi850_best_leaf.html")
--- a/datasets/checkpoint_parallelsql.sbatch
+++ b/datasets/checkpoint_parallelsql.sbatch
--- a/datasets/comments_2_parquet.sh
+++ b/datasets/comments_2_parquet.sh
--- a/datasets/comments_2_parquet_part1.py
+++ b/datasets/comments_2_parquet_part1.py
--- a/datasets/comments_2_parquet_part2.py
+++ b/datasets/comments_2_parquet_part2.py
--- a/datasets/helper.py
+++ b/datasets/helper.py
--- a/datasets/job_script.sh
+++ b/datasets/job_script.sh
@@ -0,0 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
 spark-submit --master spark://$(hostname):18899 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/users/nathante/subreddit_term_similarity_weekly_5000.parquet --topN=5000
 stop-all.sh
--- a/datasets/submissions_2_parquet.sh
+++ b/datasets/submissions_2_parquet.sh
--- a/datasets/submissions_2_parquet_part1.py
+++ b/datasets/submissions_2_parquet_part1.py
--- a/datasets/submissions_2_parquet_part2.py
+++ b/datasets/submissions_2_parquet_part2.py
--- a/density/Makefile
+++ b/density/Makefile
@@ -0,0 +1,16 @@
 all: /gscratch/comdata/output/reddit_density/comment_terms_10000.feather /gscratch/comdata/output/reddit_density/comment_authors_10000.feather /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather
 /gscratch/comdata/output/reddit_density/comment_terms_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather /gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
 	start_spark_and_run.sh 1 overlap_density.py terms --inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather" --agg=pd.DataFrame.sum
 /gscratch/comdata/output/reddit_density/comment_authors_10000.feather:overlap_density.py /gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
 	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather" --outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather" --agg=pd.DataFrame.sum
 /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
 	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10000.feather" --agg=pd.DataFrame.sum
 /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather
 	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/850.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/850.feather" --agg=pd.DataFrame.sum
 /gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather: overlap_density.py /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather
 	start_spark_and_run.sh 1 overlap_density.py authors --inpath="/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather" --outpath="/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather" --agg=pd.DataFrame.sum
--- a/density/job_script.sh
+++ b/density/job_script.sh
@@ -0,0 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
 singularity exec  /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname).hyak.local:7077 overlap_density.py authors --inpath=/gscratch/comdata/output/reddit_similarity/subreddit_comment_authors-tf_10k_LSI/600.feather --outpath=/gscratch/comdata/output/reddit_density/subreddit_author_tf_similarities_10K_LSI/600.feather --agg=pd.DataFrame.sum
 singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
--- a/density/overlap_density.py
+++ b/density/overlap_density.py
@@ -0,0 +1,81 @@
 import pandas as pd
 from pandas.core.groupby import DataFrameGroupBy as GroupBy
 from pathlib import Path
 import fire
 import numpy as np
 import sys
 sys.path.append("..")
 sys.path.append("../similarities")
 from similarities.similarities_helper import reindex_tfidf
 # this is the mean of the ratio of the overlap to the focal size.
 # mean shared membership per focal community member
 # the input is the author tf-idf matrix
 def overlap_density(inpath, outpath, agg = pd.DataFrame.sum):
    df = pd.read_feather(inpath)
    df = df.drop('_subreddit',1)
    np.fill_diagonal(df.values,0)
    df = agg(df, 0).reset_index()
    df = df.rename({0:'overlap_density'},axis='columns')
    outpath = Path(outpath)
    outpath.parent.mkdir(parents=True, exist_ok = True)
    df.to_feather(outpath)
    return df
 def overlap_density_weekly(inpath, outpath, agg = GroupBy.sum):
    df = pd.read_parquet(inpath)
    # exclude the diagonal
    df = df.loc[df.subreddit != df.variable]
    res = agg(df.groupby(['subreddit','week'])).reset_index()
    outpath = Path(outpath)
    outpath.parent.mkdir(parents=True, exist_ok = True)
    res.to_feather(outpath)
    return res
 # inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet";
 # min_df=1;
 # included_subreddits=None;
 # topN=10000;
 # outpath="/gscratch/comdata/output/reddit_density/wang_overlaps_10000.feather"
 # to_date=2019-10-28
 def author_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather",
                           outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather", agg=pd.DataFrame.sum):
    if type(agg) == str:
        agg = eval(agg)
    overlap_density(inpath, outpath, agg)
 def term_overlap_density(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather",
                         outpath="/gscratch/comdata/output/reddit_density/comment_term_similarity_10000.feather", agg=pd.DataFrame.sum):
    if type(agg) == str:
        agg = eval(agg)
    overlap_density(inpath, outpath, agg)
 def author_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/subreddit_authors_10000_weekly.parquet",
                                  outpath="/gscratch/comdata/output/reddit_density/comment_authors_10000_weekly.feather", agg=GroupBy.sum):
    if type(agg) == str:
        agg = eval(agg)
    overlap_density_weekly(inpath, outpath, agg)
 def term_overlap_density_weekly(inpath="/gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet",
                                outpath="/gscratch/comdata/output/reddit_density/comment_terms_10000_weekly.parquet", agg=GroupBy.sum):
    if type(agg) == str:
        agg = eval(agg)
    overlap_density_weekly(inpath, outpath, agg)
 if __name__ == "__main__":
    fire.Fire({'authors':author_overlap_density,
               'terms':term_overlap_density,
               'author_weekly':author_overlap_density_weekly,
               'term_weekly':term_overlap_density_weekly})
--- a/dumps/check_comments_shas.py
+++ b/dumps/check_comments_shas.py
@@ -6,7 +6,7 @@ from os import path
 import hashlib
 shasums1 = requests.get("https://files.pushshift.io/reddit/comments/sha256sum.txt").text
-shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text
+#shasums2 = requests.get("https://files.pushshift.io/reddit/comments/daily/sha256sum.txt").text
 shasums = shasums1 + shasums2
 dumpdir = "/gscratch/comdata/raw_data/reddit_dumps/comments"
--- a/dumps/check_submission_shas.py
+++ b/dumps/check_submission_shas.py
--- a/dumps/pull_pushshift_comments.sh
+++ b/dumps/pull_pushshift_comments.sh
@@ -0,0 +1,12 @@
 #!/bin/bash
 user_agent='"nathante teblunthuis <nathante@uw.edu>"'
 output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments'
 base_url='https://files.pushshift.io/reddit/comments/'
 wget -r --no-parent -A 'RC_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RC_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RC_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
 ./check_comments_shas.py
--- a/dumps/pull_pushshift_submissions.sh
+++ b/dumps/pull_pushshift_submissions.sh
@@ -0,0 +1,14 @@
 #!/bin/bash
 user_agent='"nathante teblunthuis <nathante@uw.edu>"'
 output_dir='/gscratch/comdata/raw_data/reddit_dumps/submissions'
 base_url='https://files.pushshift.io/reddit/submissions/'
 wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RS_20*.bz2' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
 wget -r --no-parent -A 'RS_20*.xz' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
 wget -r --no-parent -A 'RS_20*.zst' --user-agent=$user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
 ./check_submission_shas.py
--- a/ngrams/#ngrams_helper.py#
+++ b/ngrams/#ngrams_helper.py#
--- a/ngrams/checkpoint_parallelsql.sbatch
+++ b/ngrams/checkpoint_parallelsql.sbatch
@@ -0,0 +1,26 @@
 #!/bin/bash
 ## parallel_sql_job.sh
 #SBATCH --job-name=tf_subreddit_comments
 ## Allocation Definition
 #SBATCH --account=comdata-ckpt
 #SBATCH --partition=ckpt
 ## Resources
 ## Nodes. This should always be 1 for parallel-sql.
 #SBATCH --nodes=1    
 ## Walltime (12 hours)
 #SBATCH --time=12:00:00
 ## Memory per node
 #SBATCH --mem=32G
 #SBATCH --cpus-per-task=4
 #SBATCH --ntasks=1
 #SBATCH -D /gscratch/comdata/users/nathante/cdsc-reddit
 source ./bin/activate
 module load parallel_sql
 echo $(which perl)
 conda list pyarrow
 which python3
 #Put here commands to load other modules (e.g. matlab etc.)
 #Below command means that parallel_sql will get tasks from the database
 #and run them on the node (in parallel). So a 16 core node will have
 #16 tasks running at one time.
 parallel-sql --sql -a parallel --exit-on-term --jobs 4
--- a/ngrams/run_tf_jobs.sh
+++ b/ngrams/run_tf_jobs.sh
--- a/ngrams/sort_tf_comments.py
+++ b/ngrams/sort_tf_comments.py
--- a/ngrams/tf_comments.py
+++ b/ngrams/tf_comments.py
@@ -7,17 +7,13 @@ from itertools import groupby, islice, chain
 import fire
 from collections import Counter
 import os
 import datetime
 import re
 from nltk import wordpunct_tokenize, MWETokenizer, sent_tokenize
 from nltk.corpus import stopwords
 from nltk.util import ngrams
 import string
 from random import random
-
+from redditcleaner import clean
 # remove urls
 # taken from https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
 urlregex = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
 # compute term frequencies for comments in each subreddit by week
 def weekly_tf(partition, mwe_pass = 'first'):
@@ -31,8 +27,8 @@ def weekly_tf(partition, mwe_pass = 'first'):
    ngram_output = partition.replace("parquet","txt")
    if mwe_pass == 'first':
-        if os.path.exists(f"/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}"):
+        if os.path.exists(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}"):
-            os.remove(f"/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}")
+            os.remove(f"/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}")
    batches = dataset.to_batches(columns=['CreatedAt','subreddit','body','author'])
@@ -67,7 +63,7 @@ def weekly_tf(partition, mwe_pass = 'first'):
    subreddit_weeks = groupby(rows, lambda r: (r.subreddit, r.week))
    if mwe_pass != 'first':
-        mwe_dataset = pd.read_feather(f'/gscratch/comdata/users/nathante/reddit_multiword_expressions.feather')
+        mwe_dataset = pd.read_feather(f'/gscratch/comdata/output/reddit_ngrams/multiword_expressions.feather')
        mwe_dataset = mwe_dataset.sort_values(['phrasePWMI'],ascending=False)
        mwe_phrases = list(mwe_dataset.phrase)
        mwe_phrases = [tuple(s.split(' ')) for s in mwe_phrases]
@@ -88,7 +84,6 @@ def weekly_tf(partition, mwe_pass = 'first'):
                new_sentence.append(new_token)
        return new_sentence
    stopWords = set(stopwords.words('english'))
    # we follow the approach described in datta, phelan, adar 2017
@@ -97,8 +92,8 @@ def weekly_tf(partition, mwe_pass = 'first'):
        # lowercase        
        text = text.lower()
-        # remove urls
+        # redditcleaner removes reddit markdown(newlines, quotes, bullet points, links, strikethrough, spoiler, code, superscript, table, headings)
-        text = urlregex.sub("", text)
+        text = clean(text)
        # sentence tokenize
        sentences = sent_tokenize(text)
@@ -109,19 +104,18 @@ def weekly_tf(partition, mwe_pass = 'first'):
        # remove punctuation
        sentences = map(remove_punct, sentences)
        # remove sentences with less than 2 words
        sentences = filter(lambda sentence: len(sentence) > 2, sentences)
        # datta et al. select relatively common phrases from the reddit corpus, but they don't really explain how. We'll try that in a second phase.
        # they say that the extract 1-4 grams from 10% of the sentences and then find phrases that appear often relative to the original terms
        # here we take a 10 percent sample of sentences 
        if mwe_pass == 'first':
            # remove sentences with less than 2 words
            sentences = filter(lambda sentence: len(sentence) > 2, sentences)
            sentences = list(sentences)
            for sentence in sentences:
                if random() <= 0.1:
                    grams = list(chain(*map(lambda i : ngrams(sentence,i),range(4))))
-                    with open(f'/gscratch/comdata/users/nathante/reddit_comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
+                    with open(f'/gscratch/comdata/output/reddit_ngrams/comment_ngrams_10p_sample/{ngram_output}','a') as gram_file:
                        for ng in grams:
                            gram_file.write(' '.join(ng) + '\n')
                for token in sentence:
@@ -156,7 +150,7 @@ def weekly_tf(partition, mwe_pass = 'first'):
    outchunksize = 10000
-    with pq.ParquetWriter(f"/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer:
+    with pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet/{partition}",schema=schema,compression='snappy',flavor='spark') as writer, pq.ParquetWriter(f"/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet/{partition}",schema=author_schema,compression='snappy',flavor='spark') as author_writer:
        while True:
--- a/ngrams/top_comment_phrases.py
+++ b/ngrams/top_comment_phrases.py
--- a/old/#tfidf_authors.py#
+++ b/old/#tfidf_authors.py#
@@ -0,0 +1,21 @@
 from pyspark.sql import SparkSession
 from similarities_helper import build_tfidf_dataset
 import pandas as pd
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")
 include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
 include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
 # remove [deleted] and AutoModerator (TODO remove other bots)
 df = df.filter(df.author != '[deleted]')
 df = df.filter(df.author != 'AutoModerator')
 df = build_tfidf_dataset(df, include_subs, 'author')
 df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet',mode='overwrite',compression='snappy')
 spark.stop()
--- a/old/#tfidf_comments_weekly.py#
+++ b/old/#tfidf_comments_weekly.py#
@@ -0,0 +1,27 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from similarities_helper import build_weekly_tfidf_dataset
 import pandas as pd
 ## TODO:need to exclude automoderator / bot posts.
 ## TODO:need to exclude better handle hyperlinks. 
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")
 include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
 include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
 # remove [deleted] and AutoModerator (TODO remove other bots)
 # df = df.filter(df.author != '[deleted]')
 # df = df.filter(df.author != 'AutoModerator')
 df = build_weekly_tfidf_dataset(df, include_subs, 'term')
 df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
 spark.stop()
--- a/old/author_cosine_similarity.py
+++ b/old/author_cosine_similarity.py
@@ -0,0 +1,106 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 import numpy as np
 import pyarrow
 import pandas as pd
 import fire
 from itertools import islice
 from pathlib import Path
 from similarities_helper import *
 #tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/subreddit_terms.parquet')
 def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, included_subreddits = None, topN = 500):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    tfidf = spark.read.parquet(tfidf_path)
    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(open(included_subreddits))
    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries_weekly(tfidf, term_colname, min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    # the ids can change each week.
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new','week']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()
    weeks = list(subreddit_names.week.drop_duplicates())
    for week in weeks:
        print("loading matrix")
        mat = read_tfidf_matrix_weekly(tempdir.name, term_colname, week)
        print('computing similarities')
        sims = column_similarities(mat)
        del mat
        names = subreddit_names.loc[subreddit_names.week==week]
        sims = sims.rename({i:sr for i, sr in enumerate(names.subreddit.values)},axis=1)
        sims['subreddit'] = names.subreddit.values
        write_weekly_similarities(outfile, sims, week)
 def cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500):
    '''
    Compute similarities between subreddits based on tfi-idf vectors of author comments
    included_subreddits : string
        Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits
    min_df : int (default = 0.1 * (number of included_subreddits)
         exclude terms that appear in fewer than this number of documents.
    outfile: string
         where to output csv and feather outputs
 '''
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet')
    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(open(included_subreddits))
    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, 'author', min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()
    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name,'author')
    print('computing similarities')
    sims = column_similarities(mat)
    del mat
    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values
    p = Path(outfile)
    output_feather =  Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv =  Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet =  Path(str(p).replace("".join(p.suffixes), ".parquet"))
    sims.to_feather(outfile)
    tempdir.cleanup()
 if __name__ == '__main__':
    fire.Fire(author_cosine_similarities)
--- a/old/term_cosine_similarity.py
+++ b/old/term_cosine_similarity.py
@@ -0,0 +1,61 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
 import numpy as np
 import pyarrow
 import pandas as pd
 import fire
 from itertools import islice
 from pathlib import Path
 from similarities_helper import prep_tfidf_entries, read_tfidf_matrix, column_similarities, select_topN
 import scipy
 # outfile='test_similarities_500.feather';
 # min_df = None;
 # included_subreddits=None; topN=100; exclude_phrases=True;
 def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500, exclude_phrases=False):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    print(exclude_phrases)
    tfidf = spark.read.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_terms.parquet')
    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(open(included_subreddits))
    if exclude_phrases == True:
        tfidf = tfidf.filter(~f.col(term).contains("_"))
    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, 'term', min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()
    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name,'term')
    print('computing similarities')
    sims = column_similarities(mat)
    del mat
    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values
    p = Path(outfile)
    output_feather =  Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv =  Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet =  Path(str(p).replace("".join(p.suffixes), ".parquet"))
    sims.to_feather(outfile)
    tempdir.cleanup()
 if __name__ == '__main__':
    fire.Fire(term_cosine_similarities)
--- a/old/tfidf_authors.py
+++ b/old/tfidf_authors.py
@@ -0,0 +1,21 @@
 from pyspark.sql import SparkSession
 from similarities_helper import build_tfidf_dataset
 import pandas as pd
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")
 include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
 include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
 # remove [deleted] and AutoModerator (TODO remove other bots)
 df = df.filter(df.author != '[deleted]')
 df = df.filter(df.author != 'AutoModerator')
 df = build_tfidf_dataset(df, include_subs, 'author')
 df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf/subreddit_comment_authors.parquet',mode='overwrite',compression='snappy')
 spark.stop()
--- a/old/tfidf_authors_weekly.py
+++ b/old/tfidf_authors_weekly.py
@@ -0,0 +1,21 @@
 from pyspark.sql import SparkSession
 from similarities_helper import build_weekly_tfidf_dataset
 import pandas as pd
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet")
 include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
 include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
 # remove [deleted] and AutoModerator (TODO remove other bots)
 df = df.filter(df.author != '[deleted]')
 df = df.filter(df.author != 'AutoModerator')
 df = build_weekly_tfidf_dataset(df, include_subs, 'author')
 df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet', mode='overwrite', compression='snappy')
 spark.stop()
--- a/old/tfidf_comments.py
+++ b/old/tfidf_comments.py
@@ -0,0 +1,18 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from similarities_helper import build_tfidf_dataset
 ## TODO:need to exclude automoderator / bot posts.
 ## TODO:need to exclude better handle hyperlinks. 
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")
 include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
 include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
 df = build_tfidf_dataset(df, include_subs, 'term')
 df.write.parquet('/gscratch/comdata/output/reddit_similarity/reddit_similarity/subreddit_terms.parquet',mode='overwrite',compression='snappy')
 spark.stop()
--- a/old/tfidf_comments_weekly.py
+++ b/old/tfidf_comments_weekly.py
@@ -0,0 +1,27 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from similarities_helper import build_weekly_tfidf_dataset
 import pandas as pd
 ## TODO:need to exclude automoderator / bot posts.
 ## TODO:need to exclude better handle hyperlinks. 
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")
 include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
 include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
 # remove [deleted] and AutoModerator (TODO remove other bots)
 # df = df.filter(df.author != '[deleted]')
 # df = df.filter(df.author != 'AutoModerator')
 df = build_weekly_tfidf_dataset(df, include_subs, 'term')
 df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
 spark.stop()
--- a/pull_pushshift_comments.sh
+++ b/pull_pushshift_comments.sh
@@ -1,14 +0,0 @@
 #!/bin/bash
 user_agent='nathante teblunthuis <nathante@uw.edu>'
 output_dir='/gscratch/comdata/raw_data/reddit_dumps/comments'
 base_url='https://files.pushshift.io/reddit/comments/'
 wget -r --no-parent -A 'RC_201*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RC_201*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RC_201*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
 # starting in 2020 we use daily dumps not monthly dumps
 wget -r --no-parent -A 'RC_202*.gz' -U $user_agent -P $output_dir -nd -nc $base_url/daily/
 ./check_comments_shas.py
--- a/pull_pushshift_submissions.sh
+++ b/pull_pushshift_submissions.sh
@@ -1,14 +0,0 @@
 #!/bin/bash
 user_agent='nathante teblunthuis <nathante@uw.edu>'
 output_dir='/gscratch/comdata/raw_data/reddit_dumps/submissions'
 base_url='https://files.pushshift.io/reddit/submissions/'
 wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url
 wget -r --no-parent -A 'RS_20*.bz2' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
 wget -r --no-parent -A 'RS_20*.xz' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
 wget -r --no-parent -A 'RS_20*.zst' -U $user_agent -P $output_dir -nd -nc $base_url/old_v1_data/
 ./check_submission_shas.py
--- a/similarities/#tfidf_weekly.py#
+++ b/similarities/#tfidf_weekly.py#
@@ -0,0 +1,24 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from similarities_helper import build_weekly_tfidf_dataset
 import pandas as pd
 def tfidf_weekly(inpath, outpath, topN, term_colname, exclude):
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet")
 include_subs = pd.read_csv("/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv")
 include_subs = set(include_subs.loc[include_subs.comments_rank <= 25000]['subreddit'])
 # remove [deleted] and AutoModerator (TODO remove other bots)
 # df = df.filter(df.author != '[deleted]')
 # df = df.filter(df.author != 'AutoModerator')
 df = build_weekly_tfidf_dataset(df, include_subs, 'term')
 df.write.parquet('/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', mode='overwrite', compression='snappy')
 spark.stop()
--- a/similarities/Makefile
+++ b/similarities/Makefile
@@ -0,0 +1,130 @@
 #all: /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_130k.parquet
 srun_singularity=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity.sh
 srun_singularity_huge=source /gscratch/comdata/users/nathante/cdsc_reddit/bin/activate && srun_singularity_huge.sh
 base_data=/gscratch/comdata/output
 similarity_data=${base_data}/reddit_similarity
 tfidf_data=${similarity_data}/tfidf
 tfidf_weekly_data=${similarity_data}/tfidf_weekly
 similarity_weekly_data=${similarity_data}/weekly
 lsi_components=[10,50,100,200,300,400,500,600,700,850,1000,1500]
 lsi_similarities: ${similarity_data}/subreddit_comment_terms_10k_LSI ${similarity_data}/subreddit_comment_authors-tf_10k_LSI ${similarity_data}/subreddit_comment_authors_10k_LSI ${similarity_data}/subreddit_comment_terms_30k_LSI ${similarity_data}/subreddit_comment_authors-tf_30k_LSI ${similarity_data}/subreddit_comment_authors_30k_LSI
 all: ${tfidf_data}/comment_terms_100k.parquet ${tfidf_data}/comment_terms_30k.parquet ${tfidf_data}/comment_terms_10k.parquet ${tfidf_data}/comment_authors_100k.parquet ${tfidf_data}/comment_authors_30k.parquet ${tfidf_data}/comment_authors_10k.parquet ${similarity_data}/subreddit_comment_authors_30k.feather ${similarity_data}/subreddit_comment_authors_10k.feather  ${similarity_data}/subreddit_comment_terms_10k.feather ${similarity_data}/subreddit_comment_terms_30k.feather ${similarity_data}/subreddit_comment_authors-tf_30k.feather ${similarity_data}/subreddit_comment_authors-tf_10k.feather ${similarity_data}/subreddit_comment_terms_100k.feather ${similarity_data}/subreddit_comment_authors_100k.feather ${similarity_data}/subreddit_comment_authors-tf_100k.feather ${similarity_weekly_data}/comment_terms.parquet
 #${tfidf_weekly_data}/comment_terms_100k.parquet ${tfidf_weekly_data}/comment_authors_100k.parquet ${tfidf_weekly_data}/comment_terms_30k.parquet ${tfidf_weekly_data}/comment_authors_30k.parquet ${similarity_weekly_data}/comment_terms_100k.parquet ${similarity_weekly_data}/comment_authors_100k.parquet  ${similarity_weekly_data}/comment_terms_30k.parquet ${similarity_weekly_data}/comment_authors_30k.parquet
 # /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_130k.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_130k.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_weekly_130k.parquet
 # all: /gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_25000.parquet /gscratch/comdata/output/reddit_similarity/subreddit_comment_authors_10000.parquet /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet
 ${similarity_weekly_data}/comment_terms.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_terms.parquet
 	${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=10000 --outfile=${similarity_weekly_data}/comment_terms.parquet
 ${similarity_data}/subreddit_comment_terms_10k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k.feather --topN=10000
 ${similarity_data}/subreddit_comment_terms_10k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
 	${srun_singularity} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=200
 ${similarity_data}/subreddit_comment_terms_30k_LSI: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
 	${srun_singularity} python3 lsi_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=200
 ${similarity_data}/subreddit_comment_terms_30k.feather: ${tfidf_data}/comment_terms_30k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_30k.feather --topN=30000
 ${similarity_data}/subreddit_comment_authors_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k.feather --topN=30000
 ${similarity_data}/subreddit_comment_authors_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k.feather --topN=10000
 ${similarity_data}/subreddit_comment_authors_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
 	${srun_singularity} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=2
 ${similarity_data}/subreddit_comment_authors_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
 	${srun_singularity} python3 lsi_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=2
 ${similarity_data}/subreddit_comment_authors-tf_30k.feather: ${tfidf_data}/comment_authors_30k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k.feather --topN=30000
 ${similarity_data}/subreddit_comment_authors-tf_10k.feather: ${tfidf_data}/comment_authors_10k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k.feather --topN=10000
 ${similarity_data}/subreddit_comment_authors-tf_10k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
 	${srun_singularity} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_10k_LSI --topN=10000 --n_components=${lsi_components} --min_df=2
 ${similarity_data}/subreddit_comment_authors-tf_30k_LSI: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
 	${srun_singularity} python3 lsi_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_30k_LSI --topN=30000 --n_components=${lsi_components} --min_df=2
 ${similarity_data}/subreddit_comment_terms_100k.feather: ${tfidf_data}/comment_terms_100k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py term --outfile=${similarity_data}/subreddit_comment_terms_100k.feather --topN=100000
 ${similarity_data}/subreddit_comment_authors_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py author --outfile=${similarity_data}/subreddit_comment_authors_100k.feather --topN=100000
 ${similarity_data}/subreddit_comment_authors-tf_100k.feather: ${tfidf_data}/comment_authors_100k.parquet similarities_helper.py
 	${srun_singularity} python3 cosine_similarities.py author-tf --outfile=${similarity_data}/subreddit_comment_authors-tf_100k.feather --topN=100000
 ${tfidf_data}/comment_terms_100k.feather/: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	mkdir -p ${tfidf_data}/
 	start_spark_and_run.sh 4 tfidf.py terms --topN=100000 --outpath=${tfidf_data}/comment_terms_100k.feather 
 ${tfidf_data}/comment_terms_30k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	mkdir -p ${tfidf_data}/
 	start_spark_and_run.sh 4 tfidf.py terms --topN=30000 --outpath=${tfidf_data}/comment_terms_30k.feather
 ${tfidf_data}/comment_terms_10k.feather: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	mkdir -p ${tfidf_data}/
 	start_spark_and_run.sh 4 tfidf.py terms --topN=10000 --outpath=${tfidf_data}/comment_terms_10k.feather
 ${tfidf_data}/comment_authors_100k.feather: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	mkdir -p ${tfidf_data}/
 	start_spark_and_run.sh 4 tfidf.py authors --topN=100000 --outpath=${tfidf_data}/comment_authors_100k.feather
 ${tfidf_data}/comment_authors_10k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	mkdir -p ${tfidf_data}/
 	start_spark_and_run.sh 4 tfidf.py authors --topN=10000 --outpath=${tfidf_data}/comment_authors_10k.parquet
 ${tfidf_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	mkdir -p ${tfidf_data}/
 	start_spark_and_run.sh 4 tfidf.py authors --topN=30000 --outpath=${tfidf_data}/comment_authors_30k.parquet
 ${tfidf_data}/tfidf_weekly/comment_terms_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	start_spark_and_run.sh 4 tfidf.py terms_weekly --topN=100000 --outpath=${similarity_data}/tfidf_weekly/comment_authors_100k.parquet
 ${tfidf_data}/tfidf_weekly/comment_authors_100k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_ppnum_comments.csv
 	start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=100000 --outpath=${tfidf_weekly_data}/comment_authors_100k.parquet
 ${tfidf_weekly_data}/comment_terms_30k.parquet:  /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	start_spark_and_run.sh 2 tfidf.py terms_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet
 ${tfidf_weekly_data}/comment_authors_30k.parquet: /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv
 	start_spark_and_run.sh 4 tfidf.py authors_weekly --topN=30000 --outpath=${tfidf_weekly_data}/comment_authors_30k.parquet
 ${similarity_weekly_data}/comment_terms_100k.parquet: weekly_cosine_similarities.py similarities_helper.py ${tfidf_weekly_data}/comment_terms_100k.parquet
 	${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet
 ${similarity_weekly_data}/comment_authors_100k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_authors_100k.parquet
 	${srun_singularity} python3 weekly_cosine_similarities.py authors --topN=100000 --outfile=${similarity_weekly_data}/comment_authors_100k.parquet
 ${similarity_weekly_data}/comment_terms_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_terms_30k.parquet
 	${srun_singularity} python3 weekly_cosine_similarities.py terms --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet
 ${similarity_weekly_data}/comment_authors_30k.parquet: weekly_cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_terms.parquet ${similarity_data}/subreddits_by_num_comments.csv ${tfidf_weekly_data}/comment_authors_30k.parquet
 	${srun_singularity} python3 weekly_cosine_similarities.py authors --topN=30000 --outfile=${similarity_weekly_data}/comment_authors_30k.parquet
 # ${tfidf_weekly_data}/comment_authors_130k.parquet: tfidf.py similarities_helper.py /gscratch/comdata/output/reddit_ngrams/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments.csv
 # 	start_spark_and_run.sh 1 tfidf.py authors_weekly --topN=130000
 # /gscratch/comdata/output/reddit_similarity/comment_authors_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet 
 # 	start_spark_and_run.sh 1 cosine_similarities.py author --outfile=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather
 # /gscratch/comdata/output/reddit_similarity/comment_terms.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet
 # 	start_spark_and_run.sh 1 cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/comment_terms_10000.feather
 # /gscratch/comdata/output/reddit_similarity/comment_terms_10000_weekly.parquet: cosine_similarities.py ${tfidf_weekly_data}/comment_authors.parquet
 # 	start_spark_and_run.sh 1 weekly_cosine_similarities.py term --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_comment_terms_10000_weely.parquet
 # /gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet: cosine_similarities.py similarities_helper.py /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet /gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet
 # 	start_spark_and_run.sh 1 cosine_similarities.py author-tf --outfile=/gscratch/comdata/output/reddit_similarity/subreddit_author_tf_similarities_10000.parquet
--- a/similarities/pycache/similarities_helper.cpython-37.pyc
+++ b/similarities/pycache/similarities_helper.cpython-37.pyc
--- a/similarities/cosine_similarities.py
+++ b/similarities/cosine_similarities.py
@@ -0,0 +1,60 @@
 import pandas as pd
 import fire
 from pathlib import Path
 from cdsc_ecology_utils.similarity import similarities, column_similarities
 from functools import partial
 def cosine_similarities(infile, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None, tfidf_colname='tf_idf'):
    return similarities(infile=infile, simfunc=column_similarities, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_communities=included_subreddits, topN=topN, exclude_phrases=exclude_phrases,from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
 # change so that these take in an input as an optional argument (for speed, but also for idf).
 def term_cosine_similarities(outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
 def term_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500, exclude_phrases=False, from_date=None, to_date=None):
    return cosine_similarities(infile,
                               'term',
                               outfile,
                               min_df,
                               max_df,
                               included_subreddits,
                               topN,
                               exclude_phrases,
                               from_date,
                               to_date
                               )
 def author_cosine_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
    return cosine_similarities(infile,
                               'author',
                               outfile,
                               min_df,
                               max_df,
                               included_subreddits,
                               topN,
                               exclude_phrases=False,
                               from_date=from_date,
                               to_date=to_date
                               )
 def author_tf_similarities(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet', min_df=2, max_df=None, included_subreddits=None, topN=10000, from_date=None, to_date=None):
    return cosine_similarities(infile,
                               'author',
                               outfile,
                               min_df,
                               max_df,
                               included_subreddits,
                               topN,
                               exclude_phrases=False,
                               from_date=from_date,
                               to_date=to_date,
                               tfidf_colname='relative_tf'
                               )
 if __name__ == "__main__":
    fire.Fire({'term':term_cosine_similarities,
               'author':author_cosine_similarities,
               'author-tf':author_tf_similarities})
--- a/similarities/job_script.sh
+++ b/similarities/job_script.sh
@@ -0,0 +1,4 @@
 #!/usr/bin/bash
 start_spark_cluster.sh
 singularity exec  /gscratch/comdata/users/nathante/cdsc_base.sif spark-submit --master spark://$(hostname):7077 top_subreddits_by_comments.py 
 singularity exec /gscratch/comdata/users/nathante/cdsc_base.sif stop-all.sh
--- a/similarities/lsi_similarities.py
+++ b/similarities/lsi_similarities.py
@@ -0,0 +1,85 @@
 import pandas as pd
 import fire
 from pathlib import Path
 from cdsc_ecology_utils.similarity.similarity_functions import lsi_column_similarities, similarities, 
 #from similarities_helper import similarities, lsi_column_similarities
 from functools import partial
 inpath = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_terms_compex.parquet/"
 term_colname='term'
 outfile='/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI'
 n_components=[10,50,100]
 included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt"
 n_iter=5
 random_state=1968
 algorithm='arpack'
 topN = None
 from_date=None
 to_date=None
 min_df=None
 max_df=None
 def lsi_similarities(inpath, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, tfidf_colname='tf_idf',n_components=100,n_iter=5,random_state=1968,algorithm='arpack',lsi_model=None):
    print(n_components,flush=True)
    if lsi_model is None:
        if type(n_components) == list:
            lsi_model = Path(outfile) / f'{max(n_components)}_{term_colname}_LSIMOD.pkl'
        else:
            lsi_model = Path(outfile) / f'{n_components}_{term_colname}_LSIMOD.pkl'
    simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm=algorithm,lsi_model_save=lsi_model)
    return similarities(inpath=inpath, simfunc=simfunc, term_colname=term_colname, outfile=outfile, min_df=min_df, max_df=max_df, included_communities=included_subreddits, topN=topN, from_date=from_date, to_date=to_date, tfidf_colname=tfidf_colname)
 # change so that these take in an input as an optional argument (for speed, but also for idf).
 def term_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet',outfile=None, min_df=None, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None, algorithm='arpack', n_components=300,n_iter=5,random_state=1968):
    res =  lsi_similarities(inpath,
                            'term',
                            outfile,
                            min_df,
                            max_df,
                            included_subreddits,
                            topN,
                            from_date,
                            to_date,
                            n_components=n_components,
                            algorithm = algorithm
                            )
    return res
 def author_lsi_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,algorithm='arpack',n_components=300,n_iter=5,random_state=1968):
    return lsi_similarities(inpath,
                            'author',
                            outfile,
                            min_df,
                            max_df,
                            included_subreddits,
                            topN,
                            from_date=from_date,
                            to_date=to_date,
                            n_components=n_components
                               )
 def author_tf_similarities(inpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors_100k.parquet',outfile=None, min_df=2, max_df=None, included_subreddits=None, topN=None, from_date=None, to_date=None,n_components=300,n_iter=5,random_state=1968):
    return lsi_similarities(inpath,
                            'author',
                            outfile,
                            min_df,
                            max_df,
                            included_subreddits,
                            topN,
                            from_date=from_date,
                            to_date=to_date,
                            tfidf_colname='relative_tf',
                            n_components=n_components,
                            algorithm=algorithm
                            )
 if __name__ == "__main__":
    fire.Fire({'term':term_lsi_similarities,
               'author':author_lsi_similarities,
               'author-tf':author_tf_similarities})
--- a/similarities/similarities_helper.py
+++ b/similarities/similarities_helper.py
@@ -0,0 +1,409 @@
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from pyspark.sql import functions as f
 from enum import Enum
 from multiprocessing import cpu_count, Pool
 from pyspark.mllib.linalg.distributed import CoordinateMatrix
 from tempfile import TemporaryDirectory
 import pyarrow
 import pyarrow.dataset as ds
 from sklearn.metrics import pairwise_distances
 from scipy.sparse import csr_matrix, issparse
 from sklearn.decomposition import TruncatedSVD
 import pandas as pd
 import numpy as np
 import pathlib
 from datetime import datetime
 from pathlib import Path
 import pickle
 class tf_weight(Enum):
    MaxTF = 1
    Norm05 = 2
 # infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet"
 # cache_file = "/gscratch/comdata/users/nathante/cdsc_reddit/similarities/term_tfidf_entries_bak.parquet"
 # subreddits missing after this step don't have any terms that have a high enough idf
 # try rewriting without merges
 # does reindex_tfidf, but without reindexing.
 def reindex_tfidf(*args, **kwargs):
    df, tfidf_ds, ds_filter = _pull_or_reindex_tfidf(*args, **kwargs, reindex=True)
    print("assigning names")
    subreddit_names = tfidf_ds.to_table(filter=ds_filter,columns=['subreddit','subreddit_id'])
    batches = subreddit_names.to_batches()
    with Pool(cpu_count()) as pool:
        chunks = pool.imap_unordered(pull_names,batches) 
        subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
        subreddit_names = subreddit_names.set_index("subreddit_id")
    new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
    new_ids = new_ids.set_index('subreddit_id')
    subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
    subreddit_names = subreddit_names.drop("subreddit_id",1)
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    return(df, subreddit_names)
 def pull_tfidf(*args, **kwargs):
    df, _, _ =  _pull_or_reindex_tfidf(*args, **kwargs, reindex=False)
    return df
 def _pull_or_reindex_tfidf(infile, term_colname, min_df=None, max_df=None, included_subreddits=None, topN=500, week=None, from_date=None, to_date=None, rescale_idf=True, tf_family=tf_weight.MaxTF, reindex=True):
    print(f"loading tfidf {infile}", flush=True)
    if week is not None:
        tfidf_ds = ds.dataset(infile, partitioning='hive')
    else: 
        tfidf_ds = ds.dataset(infile)
    if included_subreddits is None:
        included_subreddits = select_topN_subreddits(topN)
    else:
        included_subreddits = set(map(str.strip,open(included_subreddits)))
    ds_filter = ds.field("subreddit").isin(included_subreddits)
    if min_df is not None:
        ds_filter &= ds.field("count") >= min_df
    if max_df is not None:
        ds_filter &= ds.field("count") <= max_df
    if week is not None:
        ds_filter &= ds.field("week") == week
    if from_date is not None:
        ds_filter &= ds.field("week") >= from_date
    if to_date is not None:
        ds_filter &= ds.field("week") <= to_date
    term = term_colname
    term_id = term + '_id'
    term_id_new = term + '_id_new'
    projection = {
        'subreddit_id':ds.field('subreddit_id'),
        term_id:ds.field(term_id),
        'relative_tf':ds.field("relative_tf").cast('float32')
        }
    if not rescale_idf:
        projection = {
            'subreddit_id':ds.field('subreddit_id'),
            term_id:ds.field(term_id),
            'relative_tf':ds.field('relative_tf').cast('float32'),
            'tf_idf':ds.field('tf_idf').cast('float32')}
        print(projection)
    df = tfidf_ds.to_table(filter=ds_filter,columns=projection)
    df = df.to_pandas(split_blocks=True,self_destruct=True)
    print("assigning indexes",flush=True)
    if reindex:
        df['subreddit_id_new'] = df.groupby("subreddit_id").ngroup()
    else:
        df['subreddit_id_new'] = df['subreddit_id']
    if reindex:
        grouped = df.groupby(term_id)
        df[term_id_new] = grouped.ngroup()
    else:
        df[term_id_new] = df[term_id]
    if rescale_idf:
        print("computing idf", flush=True)
        df['new_count'] = grouped[term_id].transform('count')
        N_docs = df.subreddit_id_new.max() + 1
        df['idf'] = np.log(N_docs/(1+df.new_count),dtype='float32') + 1
        if tf_family == tf_weight.MaxTF:
            df["tf_idf"] = df.relative_tf * df.idf
        else: # tf_fam = tf_weight.Norm05
            df["tf_idf"] = (0.5 + 0.5 * df.relative_tf) * df.idf
    return (df, tfidf_ds, ds_filter)
    with Pool(cpu_count()) as pool:
        chunks = pool.imap_unordered(pull_names,batches) 
        subreddit_names = pd.concat(chunks,copy=False).drop_duplicates()
    subreddit_names = subreddit_names.set_index("subreddit_id")
    new_ids = df.loc[:,['subreddit_id','subreddit_id_new']].drop_duplicates()
    new_ids = new_ids.set_index('subreddit_id')
    subreddit_names = subreddit_names.join(new_ids,on='subreddit_id').reset_index()
    subreddit_names = subreddit_names.drop("subreddit_id",1)
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    return(df, subreddit_names)
 def pull_names(batch):
    return(batch.to_pandas().drop_duplicates())
 def similarities(inpath, simfunc, term_colname, outfile, min_df=None, max_df=None, included_subreddits=None, topN=500, from_date=None, to_date=None, tfidf_colname='tf_idf'):
    '''
    tfidf_colname: set to 'relative_tf' to use normalized term frequency instead of tf-idf, which can be useful for author-based similarities.
    '''
    def proc_sims(sims, outfile):
        if issparse(sims):
            sims = sims.todense()
        print(f"shape of sims:{sims.shape}")
        print(f"len(subreddit_names.subreddit.values):{len(subreddit_names.subreddit.values)}",flush=True)
        sims = pd.DataFrame(sims)
        sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
        sims['_subreddit'] = subreddit_names.subreddit.values
        p = Path(outfile)
        output_feather =  Path(str(p).replace("".join(p.suffixes), ".feather"))
        output_csv =  Path(str(p).replace("".join(p.suffixes), ".csv"))
        output_parquet =  Path(str(p).replace("".join(p.suffixes), ".parquet"))
        p.parent.mkdir(exist_ok=True, parents=True)
        sims.to_feather(outfile)
    term = term_colname
    term_id = term + '_id'
    term_id_new = term + '_id_new'
    entries, subreddit_names = reindex_tfidf(inpath, term_colname=term_colname, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN,from_date=from_date,to_date=to_date)
    mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new], entries.subreddit_id_new)))
    print("loading matrix")        
    #    mat = read_tfidf_matrix("term_tfidf_entries7ejhvnvl.parquet", term_colname)
    print(f'computing similarities on mat. mat.shape:{mat.shape}')
    print(f"size of mat is:{mat.data.nbytes}",flush=True)
    sims = simfunc(mat)
    del mat
    if hasattr(sims,'__next__'):
        for simmat, name in sims:
            proc_sims(simmat, Path(outfile)/(str(name) + ".feather"))
    else:
        proc_sims(sims, outfile)
 def write_weekly_similarities(path, sims, week, names):
    sims['week'] = week
    p = pathlib.Path(path)
    if not p.is_dir():
        p.mkdir(exist_ok=True,parents=True)
    # reformat as a pairwise list
    sims = sims.melt(id_vars=['_subreddit','week'],value_vars=names.subreddit.values)
    sims.to_parquet(p / week.isoformat())
 def column_overlaps(mat):
    non_zeros = (mat != 0).astype('double')
    intersection = non_zeros.T @ non_zeros
    card1 = non_zeros.sum(axis=0)
    den = np.add.outer(card1,card1) - intersection
    return intersection / den
 def test_lsi_sims():
    term = "term"
    term_id = term + '_id'
    term_id_new = term + '_id_new'
    t1 = time.perf_counter()
    entries, subreddit_names = reindex_tfidf("/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k_repartitioned.parquet",
                                             term_colname='term',
                                             min_df=2000,
                                             topN=10000
                                             )
    t2 = time.perf_counter()
    print(f"first load took:{t2 - t1}s")
    entries, subreddit_names = reindex_tfidf("/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet",
                                             term_colname='term',
                                             min_df=2000,
                                             topN=10000
                                             )
    t3=time.perf_counter()
    print(f"second load took:{t3 - t2}s")
    mat = csr_matrix((entries['tf_idf'],(entries[term_id_new], entries.subreddit_id_new)))
    sims = list(lsi_column_similarities(mat, [10,50]))
    sims_og = sims
    sims_test = list(lsi_column_similarities(mat,[10,50],algorithm='randomized',n_iter=10))
 # n_components is the latent dimensionality. sklearn recommends 100. More might be better
 # if n_components is a list we'll return a list of similarities with different latent dimensionalities
 # if algorithm is 'randomized' instead of 'arpack' then n_iter gives the number of iterations.
 # this function takes the svd and then the column similarities of it
 def lsi_column_similarities(tfidfmat,n_components=300,n_iter=10,random_state=1968,algorithm='randomized',lsi_model_save=None,lsi_model_load=None):
    # first compute the lsi of the matrix
    # then take the column similarities
    if type(n_components) is int:
        n_components = [n_components]
    n_components = sorted(n_components,reverse=True)
    svd_components = n_components[0]
    if lsi_model_load is not None and Path(lsi_model_load).exists():
        print("loading LSI")
        mod = pickle.load(open(lsi_model_load ,'rb'))
        lsi_model_save = lsi_model_load
    else:
        print("running LSI",flush=True)
        svd = TruncatedSVD(n_components=svd_components,random_state=random_state,algorithm=algorithm,n_iter=n_iter)
        mod = svd.fit(tfidfmat.T)
    lsimat = mod.transform(tfidfmat.T)
    if lsi_model_save is not None:
        pickle.dump(mod, open(lsi_model_save,'wb'))
    sims_list = []
    for n_dims in n_components:
        sims = column_similarities(lsimat[:,np.arange(n_dims)])
        if len(n_components) > 1:
            yield (sims, n_dims)
        else:
            return sims
 def column_similarities(mat):
    return 1 - pairwise_distances(mat,metric='cosine')
 def build_weekly_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
    term = term_colname
    term_id = term + '_id'
    # aggregate counts by week. now subreddit-term is distinct
    df = df.filter(df.subreddit.isin(include_subs))
    df = df.groupBy(['subreddit',term,'week']).agg(f.sum('tf').alias('tf'))
    max_subreddit_terms = df.groupby(['subreddit','week']).max('tf') # subreddits are unique
    max_subreddit_terms = max_subreddit_terms.withColumnRenamed('max(tf)','sr_max_tf')
    df = df.join(max_subreddit_terms, on=['subreddit','week'])
    df = df.withColumn("relative_tf", df.tf / df.sr_max_tf)
    # group by term. term is unique
    idf = df.groupby([term,'week']).count()
    N_docs = df.select(['subreddit','week']).distinct().groupby(['week']).agg(f.count("subreddit").alias("subreddits_in_week"))
    idf = idf.join(N_docs, on=['week'])
    # add a little smoothing to the idf
    idf = idf.withColumn('idf',f.log(idf.subreddits_in_week) / (1+f.col('count'))+1)
    # collect the dictionary to make a pydict of terms to indexes
    terms = idf.select([term]).distinct() # terms are distinct
    terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct
    # make subreddit ids
    subreddits = df.select(['subreddit']).distinct()
    subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))
    df = df.join(subreddits,on=['subreddit'])
    # map terms to indexes in the tfs and the idfs
    df = df.join(terms,on=[term]) # subreddit-term-id is unique
    idf = idf.join(terms,on=[term])
    # join on subreddit/term to create tf/dfs indexed by term
    df = df.join(idf, on=[term_id, term,'week'])
    # agg terms by subreddit to make sparse tf/df vectors
    if tf_family == tf_weight.MaxTF:
        df = df.withColumn("tf_idf",  df.relative_tf * df.idf)
    else: # tf_fam = tf_weight.Norm05
        df = df.withColumn("tf_idf",  (0.5 + 0.5 * df.relative_tf) * df.idf)
    df = df.repartition(400,'subreddit','week')
    dfwriter = df.write.partitionBy("week")
    return dfwriter
 def _calc_tfidf(df, term_colname, tf_family):
    term = term_colname
    term_id = term + '_id'
    max_subreddit_terms = df.groupby(['subreddit']).max('tf') # subreddits are unique
    max_subreddit_terms = max_subreddit_terms.withColumnRenamed('max(tf)','sr_max_tf')
    df = df.join(max_subreddit_terms, on='subreddit')
    df = df.withColumn("relative_tf", (df.tf / df.sr_max_tf))
    # group by term. term is unique
    idf = df.groupby([term]).count()
    N_docs = df.select('subreddit').distinct().count()
    # add a little smoothing to the idf
    idf = idf.withColumn('idf',f.log(N_docs/(1+f.col('count')))+1)
    # collect the dictionary to make a pydict of terms to indexes
    terms = idf.select(term).distinct() # terms are distinct
    terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct
    # make subreddit ids
    subreddits = df.select(['subreddit']).distinct()
    subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))
    df = df.join(subreddits,on='subreddit')
    # map terms to indexes in the tfs and the idfs
    df = df.join(terms,on=term) # subreddit-term-id is unique
    idf = idf.join(terms,on=term)
    # join on subreddit/term to create tf/dfs indexed by term
    df = df.join(idf, on=[term_id, term])
    # agg terms by subreddit to make sparse tf/df vectors
    if tf_family == tf_weight.MaxTF:
        df = df.withColumn("tf_idf",  df.relative_tf * df.idf)
    else: # tf_fam = tf_weight.Norm05
        df = df.withColumn("tf_idf",  (0.5 + 0.5 * df.relative_tf) * df.idf)
    return df
 def tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
    term = term_colname
    term_id = term + '_id'
    # aggregate counts by week. now subreddit-term is distinct
    df = df.filter(df.subreddit.isin(include_subs))
    df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf'))
    df = _calc_tfidf(df, term_colname, tf_family)
    df = df.repartition('subreddit')
    dfwriter = df.write
    return dfwriter
 def select_topN_subreddits(topN, path="/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nonsfw.csv"):
    rankdf = pd.read_csv(path)
    included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
    return included_subreddits
 def repartition_tfidf(inpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k.parquet",
                      outpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_100k_repartitioned.parquet"):
    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(inpath)
    df = df.repartition(400,'subreddit')
    df.write.parquet(outpath,mode='overwrite')
 def repartition_tfidf_weekly(inpath="/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet",
                      outpath="/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms_repartitioned.parquet"):
    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(inpath)
    df = df.repartition(400,'subreddit','week')
    dfwriter = df.write.partitionBy("week")
    dfwriter.parquet(outpath,mode='overwrite')
--- a/similarities/tfidf.py
+++ b/similarities/tfidf.py
@@ -0,0 +1,88 @@
 import fire
 from pyspark.sql import SparkSession
 from pyspark.sql import functions as f
 from cdsc_ecology_utils.similarity.similarity_functions import tfidf_dataset, \
    build_weekly_tfidf_dataset, select_topN_communities
 def _tfidf_wrapper(func, inpath, outpath, topN, term_colname, exclude, included_subreddits):
    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(inpath)
    df = df.filter(~ f.col(term_colname).isin(exclude))
    if included_subreddits is not None:
        include_subs = set(map(str.strip,open(included_subreddits)))
    else:
        include_subs = select_topN_communities(topN)
    dfwriter = func(df, include_subs, term_colname)
    dfwriter.parquet(outpath,mode='overwrite',compression='snappy')
    spark.stop()
 def tfidf(inpath, outpath, topN, term_colname, exclude, included_subreddits):
    return _tfidf_wrapper(tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits)
 def tfidf_weekly(inpath, outpath, topN, term_colname, exclude, included_subreddits):
    return _tfidf_wrapper(build_weekly_tfidf_dataset, inpath, outpath, topN, term_colname, exclude, included_subreddits)
 def tfidf_authors(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                  outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet',
                  topN=None,
                  included_subreddits=None):
    return tfidf(inpath,
                 outpath,
                 topN,
                 'author',
                 ['[deleted]','AutoModerator'],
                 included_subreddits=included_subreddits
                 )
 def tfidf_terms(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
                outpath='/gscratch/comdata/output/reddit_similarity/tfidf/comment_terms.parquet',
                topN=None,
                included_subreddits=None):
    return tfidf(inpath,
                 outpath,
                 topN,
                 'term',
                 [],
                 included_subreddits=included_subreddits
                 )
 def tfidf_authors_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_authors.parquet",
                         outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors.parquet',
                         topN=None,
                         included_subreddits=None):
    return tfidf_weekly(inpath,
                        outpath,
                        topN,
                        'author',
                        ['[deleted]','AutoModerator'],
                        included_subreddits=included_subreddits
                        )
 def tfidf_terms_weekly(inpath="/gscratch/comdata/output/reddit_ngrams/comment_terms.parquet",
                       outpath='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet',
                       topN=None,
                       included_subreddits=None):
    return tfidf_weekly(inpath,
                        outpath,
                        topN,
                        'term',
                        [],
                        included_subreddits=included_subreddits
                        )
 if __name__ == "__main__":
    fire.Fire({'authors':tfidf_authors,
               'terms':tfidf_terms,
               'authors_weekly':tfidf_authors_weekly,
               'terms_weekly':tfidf_terms_weekly})
--- a/similarities/top_subreddits_by_comments.py
+++ b/similarities/top_subreddits_by_comments.py
@@ -1,18 +1,14 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
 import numpy as np
 import pyarrow
 import pandas as pd
 import fire
 from itertools import islice
 from pathlib import Path
 from similarities_helper import cosine_similarities
 spark = SparkSession.builder.getOrCreate()
 conf = spark.sparkContext.getConf()
 submissions = spark.read.parquet("/gscratch/comdata/output/reddit_submissions_by_subreddit.parquet")
 prop_nsfw = submissions.select(['subreddit','over_18']).groupby('subreddit').agg(f.mean(f.col('over_18').astype('double')).alias('prop_nsfw'))
 df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
 # remove /u/ pages
@@ -20,11 +16,14 @@ df = df.filter(~df.subreddit.like("u_%"))
 df = df.groupBy('subreddit').agg(f.count('id').alias("n_comments"))
 df = df.join(prop_nsfw,on='subreddit')
 #df = df.filter(df.prop_nsfw < 0.5)
 win = Window.orderBy(f.col('n_comments').desc())
-df = df.withColumn('comments_rank',f.rank().over(win))
+df = df.withColumn('comments_rank', f.rank().over(win))
 df = df.toPandas()
 df = df.sort_values("n_comments")
-df.to_csv('/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv',index=False)
+df.to_csv('/gscratch/comdata/output/reddit_similarity/subreddits_by_num_comments_nsfw.csv', index=False)
--- a/similarities/wang_similarity.py
+++ b/similarities/wang_similarity.py
@@ -0,0 +1,18 @@
 from similarities_helper import similarities
 import numpy as np
 import fire 
 def wang_similarity(mat):
    non_zeros = (mat != 0).astype(np.float32)
    intersection = non_zeros.T @ non_zeros
    return intersection
 infile="/gscratch/comdata/output/reddit_similarity/tfidf/comment_authors.parquet"; outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather"; min_df=1; included_subreddits=None; topN=10000; exclude_phrases=False; from_date=None; to_date=None
 def wang_overlaps(infile, outfile="/gscratch/comdata/output/reddit_similarity/wang_similarity_10000.feather", min_df=1, max_df=None, included_subreddits=None, topN=10000, exclude_phrases=False, from_date=None, to_date=None):
    return similarities(infile=infile, simfunc=wang_similarity, term_colname='author', outfile=outfile, min_df=min_df, max_df=max_df, included_subreddits=included_subreddits, topN=topN, exclude_phrases=exclude_phrases, from_date=from_date, to_date=to_date)
 if __name__ == "__main__":
    fire.Fire(wang_overlaps)
--- a/similarities/weekly_cosine_similarities.py
+++ b/similarities/weekly_cosine_similarities.py
@@ -0,0 +1,143 @@
 #!/usr/bin/env python3
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 import numpy as np
 import pyarrow
 import pyarrow.dataset as ds
 import pandas as pd
 import fire
 from itertools import islice, chain
 from pathlib import Path
 from similarities_helper import pull_tfidf, column_similarities, write_weekly_similarities, lsi_column_similarities
 from scipy.sparse import csr_matrix
 from multiprocessing import Pool, cpu_count
 from functools import partial
 infile = "/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_10k.parquet"
 tfidf_path = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf/comment_authors_compex.parquet"
 min_df=None
 included_subreddits="/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/included_subreddits.txt"
 max_df = None
 topN=100
 term_colname='author'
 # outfile = '/gscratch/comdata/output/reddit_similarity/weekly/comment_authors_test.parquet'
 # included_subreddits=None
 def _week_similarities(week, simfunc, tfidf_path, term_colname, min_df, max_df, included_subreddits, topN, outdir:Path, subreddit_names, nterms):
    term = term_colname
    term_id = term + '_id'
    term_id_new = term + '_id_new'
    print(f"loading matrix: {week}")
    entries = pull_tfidf(infile = tfidf_path,
                         term_colname=term_colname,
                         min_df=min_df,
                         max_df=max_df,
                         included_subreddits=included_subreddits,
                         topN=topN,
                         week=week,
                         rescale_idf=False)
    tfidf_colname='tf_idf'
    # if the max subreddit id we found is less than the number of subreddit names then we have to fill in 0s
    mat = csr_matrix((entries[tfidf_colname],(entries[term_id_new]-1, entries.subreddit_id_new-1)),shape=(nterms,subreddit_names.shape[0]))
    print('computing similarities')
    sims = simfunc(mat)
    del mat
    sims = pd.DataFrame(sims)
    sims = sims.rename({i: sr for i, sr in enumerate(subreddit_names.subreddit.values)}, axis=1)
    sims['_subreddit'] = subreddit_names.subreddit.values
    outfile = str(Path(outdir) / str(week))
    write_weekly_similarities(outfile, sims, week, subreddit_names)
 def pull_weeks(batch):
    return set(batch.to_pandas()['week'])
 # This requires a prefit LSI model, since we shouldn't fit different LSI models for every week. 
 def cosine_similarities_weekly_lsi(n_components=100, lsi_model=None, *args, **kwargs):
    term_colname= kwargs.get('term_colname')
    #lsi_model = "/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/similarity/comment_terms_compex_LSI/1000_term_LSIMOD.pkl"
    # simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=n_iter,random_state=random_state,algorithm='randomized',lsi_model_load=lsi_model)
    simfunc = partial(lsi_column_similarities,n_components=n_components,n_iter=kwargs.get('n_iter'),random_state=kwargs.get('random_state'),algorithm=kwargs.get('algorithm'),lsi_model_load=lsi_model)
    return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs)
 #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_weekly.parquet')
 def cosine_similarities_weekly(tfidf_path, outfile, term_colname, min_df = None, max_df=None, included_subreddits = None, topN = 500, simfunc=column_similarities):
    print(outfile)
    # do this step in parallel if we have the memory for it.
    # should be doable with pool.map
    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet(tfidf_path)
    # load subreddits + topN
    subreddit_names = df.select(['subreddit','subreddit_id']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id")
    nterms = df.select(f.max(f.col(term_colname + "_id")).alias('max')).collect()[0].max
    weeks = df.select(f.col("week")).distinct().toPandas().week.values
    spark.stop()
    print(f"computing weekly similarities")
    week_similarities_helper = partial(_week_similarities,simfunc=simfunc, tfidf_path=tfidf_path, term_colname=term_colname, outdir=outfile, min_df=min_df,max_df=max_df,included_subreddits=included_subreddits,topN=topN, subreddit_names=subreddit_names,nterms=nterms)
    pool = Pool(cpu_count())
    list(pool.imap(week_similarities_helper,weeks))
    pool.close()
    #    with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine?
 def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500):
    return cosine_similarities_weekly(infile,
                                      outfile,
                                      'author',
                                      min_df,
                                      max_df,
                                      included_subreddits,
                                      topN)
 def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None):
        return cosine_similarities_weekly(infile,
                                          outfile,
                                          'term',
                                          min_df,
                                          max_df,
                                          included_subreddits,
                                          topN)
 def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=None,n_components=100,lsi_model=None):
    return cosine_similarities_weekly_lsi(infile,
                                          outfile,
                                          'author',
                                          min_df,
                                          max_df,
                                          included_subreddits,
                                          topN,
                                          n_components=n_components,
                                          lsi_model=lsi_model)
 def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=500,n_components=100,lsi_model=None):
        return cosine_similarities_weekly_lsi(infile,
                                              outfile,
                                              'term',
                                              min_df,
                                              max_df,
                                              included_subreddits,
                                              topN,
                                              n_components=n_components,
                                              lsi_model=lsi_model)
 if __name__ == "__main__":
    fire.Fire({'authors':author_cosine_similarities_weekly,
               'terms':term_cosine_similarities_weekly,
               'authors-lsi':author_cosine_similarities_weekly_lsi,
               'terms-lsi':term_cosine_similarities_weekly
               })
--- a/similarities_helper.py
+++ b/similarities_helper.py
@@ -1,172 +0,0 @@
 from pyspark.sql import Window
 from pyspark.sql import functions as f
 from enum import Enum
 from pyspark.mllib.linalg.distributed import CoordinateMatrix
 from tempfile import TemporaryDirectory
 import pyarrow
 import pyarrow.dataset as ds
 from scipy.sparse import csr_matrix
 import pandas as pd
 import numpy as np
 class tf_weight(Enum):
    MaxTF = 1
    Norm05 = 2
 def read_tfidf_matrix(path,term_colname):
    term = term_colname
    term_id = term + '_id'
    term_id_new = term + '_id_new'
    dataset = ds.dataset(path,format='parquet')
    entries = dataset.to_table(columns=['tf_idf','subreddit_id_new',term_id_new]).to_pandas()
    return(csr_matrix((entries.tf_idf,(entries[term_id_new]-1, entries.subreddit_id_new-1))))
 def column_similarities(mat):
    norm = np.matrix(np.power(mat.power(2).sum(axis=0),0.5,dtype=np.float32))
    mat = mat.multiply(1/norm)
    sims = mat.T @ mat
    return(sims)
 def prep_tfidf_entries(tfidf, term_colname, min_df, included_subreddits):
    term = term_colname
    term_id = term + '_id'
    term_id_new = term + '_id_new'
    if min_df is None:
        min_df = 0.1 * len(included_subreddits)
    tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits))
    # reset the subreddit ids
    sub_ids = tfidf.select('subreddit_id').distinct()
    sub_ids = sub_ids.withColumn("subreddit_id_new",f.row_number().over(Window.orderBy("subreddit_id")))
    tfidf = tfidf.join(sub_ids,'subreddit_id')
    # only use terms in at least min_df included subreddits
    new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count'))
 #    new_count = new_count.filter(f.col('new_count') >= min_df)
    tfidf = tfidf.join(new_count,term_id,how='inner')
    # reset the term ids
    term_ids = tfidf.select([term_id]).distinct()
    term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.orderBy(term_id)))
    tfidf = tfidf.join(term_ids,term_id)
    tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
    # tfidf = tfidf.withColumnRenamed("idf","idf_old")
    # tfidf = tfidf.withColumn("idf",f.log(25000/f.col("count")))
    tfidf = tfidf.withColumn("tf_idf", (tfidf.relative_tf * tfidf.idf).cast('float'))
    tempdir =TemporaryDirectory(suffix='.parquet',prefix='term_tfidf_entries',dir='.')
    tfidf.write.parquet(tempdir.name,mode='overwrite',compression='snappy')
    return tempdir
 def cosine_similarities(tfidf, term_colname, min_df, included_subreddits, similarity_threshold):
    term = term_colname
    term_id = term + '_id'
    term_id_new = term + '_id_new'
    if min_df is None:
        min_df = 0.1 * len(included_subreddits)
    tfidf = tfidf.filter(f.col("subreddit").isin(included_subreddits))
    tfidf = tfidf.cache()
    # reset the subreddit ids
    sub_ids = tfidf.select('subreddit_id').distinct()
    sub_ids = sub_ids.withColumn("subreddit_id_new",f.row_number().over(Window.orderBy("subreddit_id")))
    tfidf = tfidf.join(sub_ids,'subreddit_id')
    # only use terms in at least min_df included subreddits
    new_count = tfidf.groupBy(term_id).agg(f.count(term_id).alias('new_count'))
 #    new_count = new_count.filter(f.col('new_count') >= min_df)
    tfidf = tfidf.join(new_count,term_id,how='inner')
    # reset the term ids
    term_ids = tfidf.select([term_id]).distinct()
    term_ids = term_ids.withColumn(term_id_new,f.row_number().over(Window.orderBy(term_id)))
    tfidf = tfidf.join(term_ids,term_id)
    tfidf = tfidf.withColumnRenamed("tf_idf","tf_idf_old")
    # tfidf = tfidf.withColumnRenamed("idf","idf_old")
    # tfidf = tfidf.withColumn("idf",f.log(25000/f.col("count")))
    tfidf = tfidf.withColumn("tf_idf", tfidf.relative_tf * tfidf.idf)
    # step 1 make an rdd of entires
    # sorted by (dense) spark subreddit id
    #    entries = tfidf.filter((f.col('subreddit') == 'asoiaf') | (f.col('subreddit') == 'gameofthrones') | (f.col('subreddit') == 'christianity')).select(f.col("term_id_new")-1,f.col("subreddit_id_new")-1,"tf_idf").rdd
    n_partitions = int(len(included_subreddits)*2 / 5)
    entries = tfidf.select(f.col(term_id_new)-1,f.col("subreddit_id_new")-1,"tf_idf").rdd.repartition(n_partitions)
    # put like 10 subredis in each partition
    # step 2 make it into a distributed.RowMatrix
    coordMat = CoordinateMatrix(entries)
    coordMat = CoordinateMatrix(coordMat.entries.repartition(n_partitions))
    # this needs to be an IndexedRowMatrix()
    mat = coordMat.toRowMatrix()
    #goal: build a matrix of subreddit columns and tf-idfs rows
    sim_dist = mat.columnSimilarities(threshold=similarity_threshold)
    return (sim_dist, tfidf)
 def build_tfidf_dataset(df, include_subs, term_colname, tf_family=tf_weight.Norm05):
    term = term_colname
    term_id = term + '_id'
    # aggregate counts by week. now subreddit-term is distinct
    df = df.filter(df.subreddit.isin(include_subs))
    df = df.groupBy(['subreddit',term]).agg(f.sum('tf').alias('tf'))
    max_subreddit_terms = df.groupby(['subreddit']).max('tf') # subreddits are unique
    max_subreddit_terms = max_subreddit_terms.withColumnRenamed('max(tf)','sr_max_tf')
    df = df.join(max_subreddit_terms, on='subreddit')
    df = df.withColumn("relative_tf", df.tf / df.sr_max_tf)
    # group by term. term is unique
    idf = df.groupby([term]).count()
    N_docs = df.select('subreddit').distinct().count()
    # add a little smoothing to the idf
    idf = idf.withColumn('idf',f.log(N_docs/(1+f.col('count')))+1)
    # collect the dictionary to make a pydict of terms to indexes
    terms = idf.select(term).distinct() # terms are distinct
    terms = terms.withColumn(term_id,f.row_number().over(Window.orderBy(term))) # term ids are distinct
    # make subreddit ids
    subreddits = df.select(['subreddit']).distinct()
    subreddits = subreddits.withColumn('subreddit_id',f.row_number().over(Window.orderBy("subreddit")))
    df = df.join(subreddits,on='subreddit')
    # map terms to indexes in the tfs and the idfs
    df = df.join(terms,on=term) # subreddit-term-id is unique
    idf = idf.join(terms,on=term)
    # join on subreddit/term to create tf/dfs indexed by term
    df = df.join(idf, on=[term_id, term])
    # agg terms by subreddit to make sparse tf/df vectors
    if tf_family == tf_weight.MaxTF:
        df = df.withColumn("tf_idf",  df.relative_tf * df.idf)
    else: # tf_fam = tf_weight.Norm05
        df = df.withColumn("tf_idf",  (0.5 + 0.5 * df.relative_tf) * df.idf)
    return df
--- a/term_cosine_similarity.py
+++ b/term_cosine_similarity.py
@@ -1,127 +0,0 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from pyspark.mllib.linalg.distributed import RowMatrix, CoordinateMatrix
 import numpy as np
 import pyarrow
 import pandas as pd
 import fire
 from itertools import islice
 from pathlib import Path
 from similarities_helper import cosine_similarities, prep_tfidf_entries, read_tfidf_matrix, column_similarities
 import scipy
 # outfile='test_similarities_500.feather';
 # min_df = None;
 # included_subreddits=None; topN=100; exclude_phrases=True;
 def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, topN=500, exclude_phrases=False):
    spark = SparkSession.builder.getOrCreate()
    conf = spark.sparkContext.getConf()
    print(outfile)
    print(exclude_phrases)
    tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet')
    if included_subreddits is None:
        rankdf = pd.read_csv("/gscratch/comdata/users/nathante/cdsc-reddit/subreddits_by_num_comments.csv")
        included_subreddits = set(rankdf.loc[rankdf.comments_rank <= topN,'subreddit'].values)
    else:
        included_subreddits = set(open(included_subreddits))
    if exclude_phrases == True:
        tfidf = tfidf.filter(~f.col(term).contains("_"))
    print("creating temporary parquet with matrix indicies")
    tempdir = prep_tfidf_entries(tfidf, 'term', min_df, included_subreddits)
    tfidf = spark.read.parquet(tempdir.name)
    subreddit_names = tfidf.select(['subreddit','subreddit_id_new']).distinct().toPandas()
    subreddit_names = subreddit_names.sort_values("subreddit_id_new")
    subreddit_names['subreddit_id_new'] = subreddit_names['subreddit_id_new'] - 1
    spark.stop()
    print("loading matrix")
    mat = read_tfidf_matrix(tempdir.name,'term')
    print('computing similarities')
    sims = column_similarities(mat)
    del mat
    sims = pd.DataFrame(sims.todense())
    sims = sims.rename({i:sr for i, sr in enumerate(subreddit_names.subreddit.values)},axis=1)
    sims['subreddit'] = subreddit_names.subreddit.values
    p = Path(outfile)
    output_feather =  Path(str(p).replace("".join(p.suffixes), ".feather"))
    output_csv =  Path(str(p).replace("".join(p.suffixes), ".csv"))
    output_parquet =  Path(str(p).replace("".join(p.suffixes), ".parquet"))
    sims.to_feather(outfile)
    tempdir.cleanup()
    path = "term_tfidf_entriesaukjy5gv.parquet"
 # outfile = '/gscratch/comdata/users/nathante/test_similarities_500.feather'; min_df = None; included_subreddits=None; similarity_threshold=0;
 # def term_cosine_similarities(outfile, min_df = None, included_subreddits=None, similarity_threshold=0, topN=500, exclude_phrases=True):
 #     '''
 #     Compute similarities between subreddits based on tfi-idf vectors of comment texts 
 #     included_subreddits : string
 #         Text file containing a list of subreddits to include (one per line) if included_subreddits is None then do the top 500 subreddits
 #     similarity_threshold : double (default = 0)
 #         set > 0 for large numbers of subreddits to get an approximate solution using the DIMSUM algorithm
 # https://stanford.edu/~rezab/papers/dimsum.pdf. If similarity_threshold=0 we get an exact solution using an O(N^2) algorithm.
 #     min_df : int (default = 0.1 * (number of included_subreddits)
 #          exclude terms that appear in fewer than this number of documents.
 #     outfile: string
 #          where to output csv and feather outputs
 # '''
 #     print(outfile)
 #     print(exclude_phrases)
 #     tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet')
 #     if included_subreddits is None:
 #         included_subreddits = list(islice(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"),topN))
 #         included_subreddits = {s.strip('\n') for s in included_subreddits}
 #     else:
 #         included_subreddits = set(open(included_subreddits))
 #     if exclude_phrases == True:
 #         tfidf = tfidf.filter(~f.col(term).contains("_"))
 #     sim_dist, tfidf = cosine_similarities(tfidf, 'term', min_df, included_subreddits, similarity_threshold)
 #     p = Path(outfile)
 #     output_feather =  Path(str(p).replace("".join(p.suffixes), ".feather"))
 #     output_csv =  Path(str(p).replace("".join(p.suffixes), ".csv"))
 #     output_parquet =  Path(str(p).replace("".join(p.suffixes), ".parquet"))
 #     sim_dist.entries.toDF().write.parquet(str(output_parquet),mode='overwrite',compression='snappy')
 #     #instead of toLocalMatrix() why not read as entries and put strait into numpy
 #     sim_entries = pd.read_parquet(output_parquet)
 #     df = tfidf.select('subreddit','subreddit_id_new').distinct().toPandas()
 #     spark.stop()
 #     df['subreddit_id_new'] = df['subreddit_id_new'] - 1
 #     df = df.sort_values('subreddit_id_new').reset_index(drop=True)
 #     df = df.set_index('subreddit_id_new')
 #     similarities = sim_entries.join(df, on='i')
 #     similarities = similarities.rename(columns={'subreddit':"subreddit_i"})
 #     similarities = similarities.join(df, on='j')
 #     similarities = similarities.rename(columns={'subreddit':"subreddit_j"})
 #     similarities.to_feather(output_feather)
 #     similarities.to_csv(output_csv)
 #     return similarities
 if __name__ == '__main__':
    fire.Fire(term_cosine_similarities)
--- a/tfidf_authors.py
+++ b/tfidf_authors.py
@@ -1,19 +0,0 @@
 from pyspark.sql import SparkSession
 from similarities_helper import build_tfidf_dataset
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test_authors.parquet_temp")
 include_subs = set(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"))
 include_subs = {s.strip('\n') for s in include_subs}
 # remove [deleted] and AutoModerator (TODO remove other bots)
 df = df.filter(df.author != '[deleted]')
 df = df.filter(df.author != 'AutoModerator')
 df = build_tfidf_dataset(df, include_subs, 'author')
 df.write.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf_authors.parquet',mode='overwrite',compression='snappy')
 spark.stop()
--- a/tfidf_comments.py
+++ b/tfidf_comments.py
@@ -1,18 +0,0 @@
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from pyspark.sql import Window
 from similarities_helper import build_tfidf_dataset
 ## TODO:need to exclude automoderator / bot posts.
 ## TODO:need to exclude better handle hyperlinks. 
 spark = SparkSession.builder.getOrCreate()
 df = spark.read.parquet("/gscratch/comdata/users/nathante/reddit_tfidf_test.parquet_temp")
 include_subs = set(open("/gscratch/comdata/users/nathante/cdsc-reddit/top_25000_subs_by_comments.txt"))
 include_subs = {s.strip('\n') for s in include_subs}
 df = build_tfidf_dataset(df, include_subs, 'term')
 df.write.parquet('/gscratch/comdata/users/nathante/subreddit_tfidf.parquet',mode='overwrite',compression='snappy')
 spark.stop()
--- a/timeseries/init.py
+++ b/timeseries/init.py
@@ -0,0 +1,2 @@
 from .choose_clusters import load_clusters, load_densities
 from .cluster_timeseries import build_cluster_timeseries
--- a/timeseries/choose_clusters.py
+++ b/timeseries/choose_clusters.py
@@ -0,0 +1,96 @@
 from pyarrow import dataset as ds
 import numpy as np
 import pandas as pd
 import plotnine as pn
 random = np.random.RandomState(1968)
 def load_densities(term_density_file="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
                   author_density_file="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather"):
    term_density = pd.read_feather(term_density_file)
    author_density = pd.read_feather(author_density_file)
    term_density.rename({'overlap_density':'term_density','index':'subreddit'},axis='columns',inplace=True)
    author_density.rename({'overlap_density':'author_density','index':'subreddit'},axis='columns',inplace=True)
    density = term_density.merge(author_density,on='subreddit',how='inner')
    return density
 def load_clusters(term_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
                  author_clusters_file="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather"):
    term_clusters = pd.read_feather(term_clusters_file)
    author_clusters = pd.read_feather(author_clusters_file)
    # rename, join and return
    term_clusters.rename({'cluster':'term_cluster'},axis='columns',inplace=True)
    author_clusters.rename({'cluster':'author_cluster'},axis='columns',inplace=True)
    clusters = term_clusters.merge(author_clusters,on='subreddit',how='inner')
    return clusters
 if __name__ == '__main__':
    df = load_densities()
    cl = load_clusters()
    df['td_rank'] = df.term_density.rank()
    df['ad_rank'] = df.author_density.rank()
    df['td_percentile'] = df.td_rank / df.shape[0]
    df['ad_percentile'] = df.ad_rank / df.shape[0]
    df = df.merge(cl, on='subreddit',how='inner')
    term_cluster_density = df.groupby('term_cluster').agg({'td_rank':['mean','min','max'],
                                                         'ad_rank':['mean','min','max'],
                                                         'td_percentile':['mean','min','max'],
                                                           'ad_percentile':['mean','min','max'],
                                                           'subreddit':['count']})
    author_cluster_density = df.groupby('author_cluster').agg({'td_rank':['mean','min','max'],
                                                         'ad_rank':['mean','min','max'],
                                                         'td_percentile':['mean','min','max'],
                                                           'ad_percentile':['mean','min','max'],
                                                           'subreddit':['count']})
    # which clusters have the most term_density?
    term_cluster_density.iloc[term_cluster_density.td_rank['mean'].sort_values().index]
    # which clusters have the most author_density?
    term_cluster_density.iloc[term_cluster_density.ad_rank['mean'].sort_values(ascending=False).index].loc[term_cluster_density.subreddit['count'] >= 5][0:20]
    high_density_term_clusters = term_cluster_density.loc[(term_cluster_density.td_percentile['mean'] > 0.75) & (term_cluster_density.subreddit['count'] > 5)]
    # let's just use term density instead of author density for now. We can do a second batch with author density next.
    chosen_clusters = high_density_term_clusters.sample(3,random_state=random)
    cluster_info = df.loc[df.term_cluster.isin(chosen_clusters.index.values)]
    chosen_subreddits = cluster_info.subreddit.values
    dataset = ds.dataset("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet",format='parquet')
    comments = dataset.to_table(filter=ds.field("subreddit").isin(chosen_subreddits),columns=['id','subreddit','author','CreatedAt'])
    comments = comments.to_pandas()
    comments['week'] = comments.CreatedAt.dt.date - pd.to_timedelta(comments['CreatedAt'].dt.dayofweek, unit='d')
    author_timeseries = comments.loc[:,['subreddit','author','week']].drop_duplicates().groupby(['subreddit','week']).count().reset_index()
    for clid in chosen_clusters.index.values:
        ts = pd.read_feather(f"data/ts_term_cluster_{clid}.feather")
        pn.options.figure_size = (11.7,8.27)
        p = pn.ggplot(ts)
        p = p + pn.geom_line(pn.aes('week','value',group='subreddit'))
        p = p + pn.facet_wrap('~ subreddit')
        p.save(f"plots/ts_term_cluster_{clid}.png")
        fig, ax = pyplot.subplots(figsize=(11.7,8.27))
        g = sns.FacetGrid(ts,row='subreddit')
        g.map_dataframe(sns.scatterplot,'week','value',data=ts,ax=ax)
--- a/timeseries/cluster_timeseries.py
+++ b/timeseries/cluster_timeseries.py
@@ -0,0 +1,37 @@
 import pandas as pd
 import numpy as np
 from pyspark.sql import functions as f
 from pyspark.sql import SparkSession
 from .choose_clusters import load_clusters, load_densities
 import fire
 from pathlib import Path
 def build_cluster_timeseries(term_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_terms_10000.feather",
         author_clusters_path="/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather",
         term_densities_path="/gscratch/comdata/output/reddit_density/comment_terms_10000.feather",
         author_densities_path="/gscratch/comdata/output/reddit_density/comment_authors_10000.feather",
         output="data/subreddit_timeseries.parquet"):
    clusters = load_clusters(term_clusters_path, author_clusters_path)
    densities = load_densities(term_densities_path, author_densities_path)
    spark = SparkSession.builder.getOrCreate()
    df = spark.read.parquet("/gscratch/comdata/output/reddit_comments_by_subreddit.parquet")
    df = df.withColumn('week', f.date_trunc('week', f.col("CreatedAt")))
    # time of unique authors by series by week
    ts = df.select(['subreddit','week','author']).distinct().groupby(['subreddit','week']).count()
    ts = ts.repartition('subreddit')
    spk_clusters = spark.createDataFrame(clusters)
    ts = ts.join(spk_clusters, on='subreddit', how='inner')
    spk_densities = spark.createDataFrame(densities)
    ts = ts.join(spk_densities, on='subreddit', how='inner')
    ts.write.parquet(output, mode='overwrite')
 if __name__ == "__main__":
    fire.Fire(build_cluster_timeseries)
--- a/visualization/Makefile
+++ b/visualization/Makefile
@@ -0,0 +1,11 @@
 all: subreddit_author_tf_similarities_10000.html #comment_authors_10000.html
 # wang_tsne_10000.html
 # wang_tsne_10000.html:/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather /gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather tsne_vis.py
 # 	python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/wang_similarity_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/wang_similarity_10000.feather --output=wang_tsne_10000.html
 # comment_authors_10000.html:/gscratch/comdata/output/reddit_tsne/comment_authors_10000.feather /gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather tsne_vis.py
 # 	python3 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_similarity/comment_authors_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/comment_authors_10000.feather --output=comment_authors_10000.html
 subreddit_author_tf_similarities_10000.html:/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather /gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather tsne_vis.py
 	start_spark_and_run.sh 1 tsne_vis.py --tsne_data=/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather --clusters=/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather --output=subreddit_author_tf_similarities_10000.html
--- a/visualization/subreddit_author_tf_similarities_10000.html
+++ b/visualization/subreddit_author_tf_similarities_10000.html
--- a/visualization/subreddit_author_tf_similarities_10000_viewport.html
+++ b/visualization/subreddit_author_tf_similarities_10000_viewport.html
--- a/visualization/tsne_vis.py
+++ b/visualization/tsne_vis.py
@@ -5,21 +5,48 @@ alt.data_transformers.enable('default')
 from sklearn.neighbors import NearestNeighbors
 import pandas as pd
 from numpy import random
 import fire
 import numpy as np
 def base_plot(plot_data):
 #    base = base.encode(alt.Color(field='color',type='nominal',scale=alt.Scale(scheme='category10')))
    cluster_dropdown = alt.binding_select(options=[str(c) for c in sorted(set(plot_data.cluster))])
    #    subreddit_dropdown = alt.binding_select(options=sorted(plot_data.subreddit))
    cluster_click_select = alt.selection_single(on='click',fields=['cluster'], bind=cluster_dropdown, name=' ')
    # cluster_select = alt.selection_single(fields=['cluster'], bind=cluster_dropdown, name='cluster')
    # cluster_select_and = cluster_click_select & cluster_select
    #
    #    subreddit_select = alt.selection_single(on='click',fields=['subreddit'],bind=subreddit_dropdown,name='subreddit_click')
    base_scale = alt.Scale(scheme={"name":'category10',
                                   "extent":[0,100],
                                   "count":10})
    color = alt.condition(cluster_click_select ,
                          alt.Color(field='color',type='nominal',scale=base_scale),
                          alt.value("lightgray"))
    base = alt.Chart(plot_data).mark_text().encode(
        alt.X('x',axis=alt.Axis(grid=False),scale=alt.Scale(domain=(-65,65))),
        alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=(-65,65))),
        color=color,
        text='subreddit')
    base = base.add_selection(cluster_click_select)
    return base
 def zoom_plot(plot_data):
    chart = base_plot(plot_data)
-    chart = chart.encode(alt.Color(field='color',type='nominal',scale=alt.Scale(scheme='category10')))
+
    chart = chart.interactive()
-    chart = chart.properties(width=1275,height=1000)
+    chart = chart.properties(width=1275,height=800)
    return chart
@@ -51,7 +78,7 @@ def viewport_plot(plot_data):
                     alt.Y('y',axis=alt.Axis(grid=False),scale=alt.Scale(domain=selectory2))
    )
-    sr = sr.encode(alt.Color(field='color',type='nominal',scale=alt.Scale(scheme='category10')))
+
    sr = sr.properties(width=1275,height=600)
@@ -61,6 +88,11 @@ def viewport_plot(plot_data):
    return chart
 def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4):
    isolate_color = 101
    cluster_sizes = clusters.groupby('cluster').count()
    singletons = set(cluster_sizes.loc[cluster_sizes.subreddit == 1].reset_index().cluster)
    tsne_data = tsne_data.merge(clusters,on='subreddit')
    centroids = tsne_data.groupby('cluster').agg({'x':np.mean,'y':np.mean})
@@ -70,28 +102,44 @@ def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4):
    distances = np.empty(shape=(centroids.shape[0],centroids.shape[0]))
    groups = tsne_data.groupby('cluster')
-    for centroid in centroids.itertuples():
+    
-        c_dists = groups.apply(lambda r: min(np.sqrt(np.square(centroid.x - r.x) + np.square(centroid.y-r.y))))
+    points = np.array(tsne_data.loc[:,['x','y']])
-        distances[:,centroid.Index] = c_dists
+    centers = np.array(centroids.loc[:,['x','y']])
    # point x centroid
    point_center_distances = np.linalg.norm((points[:,None,:] - centers[None,:,:]),axis=-1)
    # distances is cluster x point
    for gid, group in groups:
        c_dists = point_center_distances[group.index.values,:].min(axis=0)
        distances[group.cluster.values[0],] = c_dists        
    # nbrs = NearestNeighbors(n_neighbors=n_neighbors).fit(centroids) 
    # distances, indices = nbrs.kneighbors()
-    nbrs = NearestNeighbors(n_neighbors=n_neighbors,metric='precomputed').fit(distances) 
+    nearest = distances.argpartition(n_neighbors,0)
-    distances, indices = nbrs.kneighbors()
+    indices = nearest[:n_neighbors,:].T
    # neighbor_distances = np.copy(distances)
    # neighbor_distances.sort(0)
    # neighbor_distances = neighbor_distances[0:n_neighbors,:]
    # nbrs = NearestNeighbors(n_neighbors=n_neighbors,metric='precomputed').fit(distances) 
    # distances, indices = nbrs.kneighbors()
    color_assignments = np.repeat(-1,len(centroids))
    for i in range(len(centroids)):
-        knn = indices[i]
+        if (centroids.iloc[i].name == -1) or (i in singletons):
-        knn_colors = color_assignments[knn]
+            color_assignments[i] = isolate_color
        available_colors = color_ids[list(set(color_ids) - set(knn_colors))]
        if(len(available_colors) > 0):
            color_assignments[i] = available_colors[0]
        else:
-            raise Exception("Can't color this many neighbors with this many colors")
+            knn = indices[i]
            knn_colors = color_assignments[knn]
            available_colors = color_ids[list(set(color_ids) - set(knn_colors))]
            if(len(available_colors) > 0):
                color_assignments[i] = available_colors[0]
            else:
                raise Exception("Can't color this many neighbors with this many colors")
    centroids = centroids.reset_index()
    colors = centroids.loc[:,['cluster']]
@@ -100,26 +148,40 @@ def assign_cluster_colors(tsne_data, clusters, n_colors, n_neighbors = 4):
    tsne_data = tsne_data.merge(colors,on='cluster')
    return(tsne_data)
-term_data = pd.read_feather("tsne_subreddit_fit.feather")
+def build_visualization(tsne_data, clusters, output):
 clusters = pd.read_feather("term_3000_clusters.feather")
-tsne_data = assign_cluster_colors(term_data,clusters,10,8)
+    # tsne_data = "/gscratch/comdata/output/reddit_tsne/subreddit_author_tf_similarities_10000.feather"
    # clusters = "/gscratch/comdata/output/reddit_clustering/subreddit_author_tf_similarities_10000.feather"
-term_zoom_plot = zoom_plot(tsne_data)
+    tsne_data = pd.read_feather(tsne_data)
    tsne_data = tsne_data.rename(columns={'_subreddit':'subreddit'})
    clusters = pd.read_feather(clusters)
-term_zoom_plot.save("subreddit_terms_tsne_3000.html")
+    tsne_data = assign_cluster_colors(tsne_data,clusters,10,8)
-term_viewport_plot = viewport_plot(tsne_data)
+    sr_per_cluster = tsne_data.groupby('cluster').subreddit.count().reset_index()
    sr_per_cluster = sr_per_cluster.rename(columns={'subreddit':'cluster_size'})
-term_viewport_plot.save("subreddit_terms_tsne_3000_viewport.html")
+    tsne_data = tsne_data.merge(sr_per_cluster,on='cluster')
-commenter_data = pd.read_feather("tsne_author_fit.feather")
+    term_zoom_plot = zoom_plot(tsne_data)
-clusters = pd.read_feather('author_3000_clusters.feather')
+
-commenter_data = assign_cluster_colors(commenter_data,clusters,10,8)
+    term_zoom_plot.save(output)
-commenter_zoom_plot = zoom_plot(commenter_data)
+
-commenter_viewport_plot = viewport_plot(commenter_data)
+    term_viewport_plot = viewport_plot(tsne_data)
-commenter_zoom_plot.save("subreddit_commenters_tsne_3000.html")
+
-commenter_viewport_plot.save("subreddit_commenters_tsne_3000_viewport.html")
+    term_viewport_plot.save(output.replace(".html","_viewport.html"))
 if __name__ == "__main__":
    fire.Fire(build_visualization)
 # commenter_data = pd.read_feather("tsne_author_fit.feather")
 # clusters = pd.read_feather('author_3000_clusters.feather')
 # commenter_data = assign_cluster_colors(commenter_data,clusters,10,8)
 # commenter_zoom_plot = zoom_plot(commenter_data)
 # commenter_viewport_plot = viewport_plot(commenter_data)
 # commenter_zoom_plot.save("subreddit_commenters_tsne_3000.html")
 # commenter_viewport_plot.save("subreddit_commenters_tsne_3000_viewport.html")
 # chart = chart.properties(width=10000,height=10000)
 # chart.save("test_tsne_whole.svg")
Author	SHA1	Message	Date
Nathan TeBlunthuis	930ee47d2b	refactor similarities to use submodule.	2022-01-19 15:05:49 -08:00
Nathan TeBlunthuis	98c1317af5	update pushshift dumps.	2021-12-10 21:23:32 -08:00
Nathan TeBlunthuis	541e125b28	lsi support for weekly similarities	2021-08-11 22:48:33 -07:00
Nathan TeBlunthuis	b7c39a3494	Merge branch 'master' of code:cdsc_reddit into excise_reindex	2021-08-03 15:13:39 -07:00
Nathan TeBlunthuis	ce549c6c97	Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex	2021-08-03 15:13:21 -07:00
Nathan TeBlunthuis	6e43294a41	Updates to similarities code for smap project.	2021-08-03 15:06:48 -07:00
Nathan TeBlunthuis	14ab979f59	Merge branch 'master' of code:cdsc_reddit	2021-08-03 15:03:40 -07:00
Nathan TeBlunthuis	2d21ff1137	Merge branch 'master' of code:cdsc_reddit into excise_reindex	2021-08-03 15:02:08 -07:00
Nate E TeBlunthuis	cf86c7492c	update clustering scripts	2021-08-03 14:55:02 -07:00
Nate E TeBlunthuis	c6122bb429	Merge branch 'master' of code:cdsc_reddit	2021-07-28 15:32:21 -07:00
Nate E TeBlunthuis	596e1ff339	no longer do we need to get daily dumps	2021-07-28 15:32:04 -07:00
Nate E TeBlunthuis	87ffaa6858	script for picking the best clustering given constraints	2021-05-14 19:10:36 -07:00
Nate E TeBlunthuis	7b14db67de	Merge branch 'excise_reindex' of code:cdsc_reddit into excise_reindex	2021-05-13 22:28:31 -07:00
Nate E TeBlunthuis	0b95bea30e	support isolates in visualization	2021-05-13 22:26:58 -07:00
Nate E TeBlunthuis	582cf263ea	bug fix in affinity clustering	2021-05-13 22:26:15 -07:00
Nate E TeBlunthuis	8a2248fae1	Merge remote-tracking branch 'origin/excise_reindex' into temp	2021-05-10 18:32:03 -07:00
Nate E TeBlunthuis	47ba04aa97	add script for pulling cluster timeseries	2021-05-10 18:24:22 -07:00
Nate E TeBlunthuis	4cb7eeec80	Refactor to make a decent api.	2021-05-10 13:46:49 -07:00
Nate E TeBlunthuis	f05cb962e0	refactor clustring in object oriented style	2021-05-07 22:33:26 -07:00
Nate E TeBlunthuis	8d1df5b26e	refactor clustering.py into method-specific files.	2021-05-03 11:28:48 -07:00
Nate E TeBlunthuis	e1c9d9af6f	Remove 'exclude phrases' parameter.	2021-05-03 10:37:09 -07:00
Nate E TeBlunthuis	7df8436067	Use Latent semantic indexing and hdbscan	2021-05-02 23:39:55 -07:00
Nate E TeBlunthuis	36b24ee933	reindex tfidf in memory instead of using spark	2021-04-30 12:48:19 -07:00
Nate E TeBlunthuis	6a3bfa26ee	bugfix	2021-04-26 22:31:05 -07:00
Nate E TeBlunthuis	3a758f1fc8	Merge branch 'charliepatch' of code:cdsc_reddit into charliepatch	2021-04-26 13:58:25 -07:00
Nate E TeBlunthuis	806cfc948f	support passing in list of tfidf vectors. Also lowercases included subreddits.	2021-04-26 13:20:43 -07:00
Nate E TeBlunthuis	0fe120e4ab	support passing in list of tfidf vectors. Also lowercases included subreddits.	2021-04-26 11:44:56 -07:00
Nate E TeBlunthuis	f20365c07e	Merge branch 'master' of code:cdsc_reddit	2021-04-22 10:46:26 -07:00
Nate E TeBlunthuis	34e0a0a30d	version of weekly_cosine_similarities.py from klone	2021-04-22 10:38:10 -07:00
Nate E TeBlunthuis	003a48aea5	bugfix in weekly similarities	2021-04-22 10:37:04 -07:00
Nate E TeBlunthuis	37dd0ef55f	bugfixes in clustering selection.	2021-04-21 16:56:25 -07:00
Nate E TeBlunthuis	ac06a8757a	calculate some user-level attributes to detect bots	2021-04-20 11:34:36 -07:00
Nate E TeBlunthuis	01a4c35358	grid sweep selection for clustering hyperparameters	2021-04-20 11:33:54 -07:00
Nate E TeBlunthuis	628a70734b	Merge branch 'master' of code:cdsc_reddit	2021-04-05 23:21:35 -07:00
Nate E TeBlunthuis	f0176d9f0d	Changes for cosine similarities on klone.	2021-04-05 23:21:06 -07:00
Nate E TeBlunthuis	a013f6718b	export timeseries functions	2021-03-24 17:18:30 -07:00
Nate E TeBlunthuis	36cb0a5546	add code for pulling activity time series from parquet.	2021-03-24 16:08:57 -07:00
Nate E TeBlunthuis	06430903f0	add included_subreddits parameter to cosine similarities.	2021-02-22 18:38:34 -08:00
Nate E TeBlunthuis	4dc949de5f	Changes from hyak.	2021-02-22 16:03:48 -08:00
Nate E TeBlunthuis	140d1bdd17	fix bug in viz.	2021-01-27 20:26:15 -08:00
Nate E TeBlunthuis	554660275f	add visualization for 10000 subreddits based on author-tf similarities.	2021-01-27 20:22:24 -08:00
Nate E TeBlunthuis	b4dd9acbd8	Merge branch 'master' of code:cdsc_reddit	2021-01-27 20:09:23 -08:00
Nathan TeBlunthuis	dbe4c87f8b	add cluster selection to visualization	2021-01-27 20:08:07 -08:00
Nate E TeBlunthuis	3155600514	remove nsfw subs from topN	2020-12-28 21:11:44 -08:00
Nate E TeBlunthuis	4e20dce188	Updating to support wang-style user overlaps.	2020-12-24 22:38:04 -08:00
Nate E TeBlunthuis	56269deee3	Some improvements to run affinity clustering on larger dataset and compute density.	2020-12-12 20:42:47 -08:00
Nate E TeBlunthuis	e6294b5b90	Refactor and reorganze.	2020-12-08 17:32:20 -08:00
Nate E TeBlunthuis	a60747292e	Add code for running tf-idf at the weekly level.	2020-12-01 22:54:48 -08:00
Nathan TeBlunthuis	db5879d6c9	refactor visualization code.	2020-11-17 16:46:49 -08:00
		`@@ -0,0 +1,2 @@`
							`from timeseries import load_clusters, load_densities, build_cluster_timeseries`
							`from cdsc_ecology_utils import similarity_functions`
		`@@ -0,0 +1,2 @@`
							`from .choose_clusters import load_clusters, load_densities`
							`from .cluster_timeseries import build_cluster_timeseries`