1
0

Pass the `clusters` parameter through the weekly cosine-similarity wrappers

This commit is contained in:
Nathan TeBlunthuis 2025-01-11 20:09:19 -08:00
parent dba0faf125
commit 4168d0d4cf
2 changed files with 15 additions and 9 deletions

View File

@ -191,7 +191,9 @@ def write_weekly_similarities(path, sims, week, names, clusters=None):
if clusters is not None:
cluster_sims = duckdb.sql("SELECT sims.* FROM sims SEMI JOIN clusters ON _subreddit == sr_i AND variable == sr_j").df()
else:
cluster_sims = sims
cluster_sims.to_parquet(p / week)
def column_overlaps(mat):

View File

@ -116,7 +116,7 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500, static_tfidf_path=None):
def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500, static_tfidf_path=None, clusters=None):
return cosine_similarities_weekly(infile,
outfile,
'author',
@ -125,10 +125,11 @@ def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/
topN,
min_df=min_df,
max_df=max_df,
static_tfidf_path=static_tfidf_path
static_tfidf_path=static_tfidf_path,
clusters=clusters
)
def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None, static_tfidf_path=None):
def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None, static_tfidf_path=None, clusters=None):
return cosine_similarities_weekly(infile,
outfile,
'term',
@ -136,10 +137,11 @@ def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/re
max_df,
included_subreddits,
topN,
static_tfidf_path=static_tfidf_path)
static_tfidf_path=static_tfidf_path,
clusters=clusters)
def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=2):
def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None, min_df=2, clusters=None):
return cosine_similarities_weekly_lsi(infile,
outfile,
'author',
@ -147,18 +149,20 @@ def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/o
n_components=n_components,
lsi_model=lsi_model,
static_tfidf_path=static_tfidf_path,
min_df=min_df
min_df=min_df,
clusters=clusters
)
def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None):
def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None,clusters=None):
return cosine_similarities_weekly_lsi(infile,
outfile,
'term',
included_subreddits=included_subreddits,
n_components=n_components,
lsi_model=lsi_model,
static_tfidf_path=static_tfidf_path
static_tfidf_path=static_tfidf_path,
clusters=clusters
)
if __name__ == "__main__":