diff --git a/similarities/similarities_helper.py b/similarities/similarities_helper.py index a96bfb5..54e2d08 100644 --- a/similarities/similarities_helper.py +++ b/similarities/similarities_helper.py @@ -191,7 +191,9 @@ def write_weekly_similarities(path, sims, week, names, clusters=None): if clusters is not None: cluster_sims = duckdb.sql("SELECT sims.* FROM sims SEMI JOIN clusters ON _subreddit == sr_i AND variable == sr_j").df() - + else: + cluster_sims = sims + cluster_sims.to_parquet(p / week) def column_overlaps(mat): diff --git a/similarities/weekly_cosine_similarities.py b/similarities/weekly_cosine_similarities.py index 862ceb9..c64ab99 100755 --- a/similarities/weekly_cosine_similarities.py +++ b/similarities/weekly_cosine_similarities.py @@ -116,7 +116,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre -def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500, static_tfidf_path=None): +def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500, static_tfidf_path=None, clusters=None): return cosine_similarities_weekly(infile, outfile, 'author', @@ -125,10 +125,11 @@ def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/ topN, min_df=min_df, max_df=max_df, - static_tfidf_path=static_tfidf_path + static_tfidf_path=static_tfidf_path, + clusters=clusters ) -def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None, static_tfidf_path=None): +def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None, static_tfidf_path=None, clusters=None): return cosine_similarities_weekly(infile, outfile, 'term', @@ -136,10 +137,11 @@ def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/re max_df, included_subreddits, topN, - static_tfidf_path=static_tfidf_path) + static_tfidf_path=static_tfidf_path, + clusters=clusters) -def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=2): +def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=2, clusters=None): return cosine_similarities_weekly_lsi(infile, outfile, 'author', @@ -147,18 +149,20 @@ def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/o n_components=n_components, lsi_model=lsi_model, static_tfidf_path=static_tfidf_path, - min_df=min_df + min_df=min_df, + clusters=clusters ) -def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None): +def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None,clusters=None): return cosine_similarities_weekly_lsi(infile, outfile, 'term', included_subreddits=included_subreddits, n_components=n_components, lsi_model=lsi_model, - static_tfidf_path=static_tfidf_path + static_tfidf_path=static_tfidf_path, + clusters=clusters ) if __name__ == "__main__":