Use the static (not weekly) TF-IDF to create the TF-IDF matrix
This commit is contained in:
parent
e2e7d7dbb1
commit
7b5ac73b2c
@ -102,43 +102,47 @@ def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subre
|
|||||||
# with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine?
|
# with Pool(cpu_count()) as pool: # maybe it can be done with 40 cores on the huge machine?
|
||||||
|
|
||||||
|
|
||||||
def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500):
|
def author_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', min_df=2, max_df=None, included_subreddits=None, topN=500, static_tfidf_path=None):
|
||||||
return cosine_similarities_weekly(infile,
|
return cosine_similarities_weekly(infile,
|
||||||
outfile,
|
outfile,
|
||||||
'author',
|
'author',
|
||||||
max_df,
|
max_df,
|
||||||
included_subreddits,
|
included_subreddits,
|
||||||
topN,
|
topN,
|
||||||
min_df=2
|
min_df=2,
|
||||||
|
static_tfidf_path=static_tfidf_path
|
||||||
)
|
)
|
||||||
|
|
||||||
def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None):
|
def term_cosine_similarities_weekly(outfile, infile='/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', min_df=None, max_df=None, included_subreddits=None, topN=None, static_tfidf_path=None):
|
||||||
return cosine_similarities_weekly(infile,
|
return cosine_similarities_weekly(infile,
|
||||||
outfile,
|
outfile,
|
||||||
'term',
|
'term',
|
||||||
min_df,
|
min_df,
|
||||||
max_df,
|
max_df,
|
||||||
included_subreddits,
|
included_subreddits,
|
||||||
topN)
|
topN,
|
||||||
|
static_tfidf_path=static_tfidf_path)
|
||||||
|
|
||||||
|
|
||||||
def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None):
|
def author_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_authors_test.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None):
|
||||||
return cosine_similarities_weekly_lsi(infile,
|
return cosine_similarities_weekly_lsi(infile,
|
||||||
outfile,
|
outfile,
|
||||||
'author',
|
'author',
|
||||||
included_subreddits=included_subreddits,
|
included_subreddits=included_subreddits,
|
||||||
n_components=n_components,
|
n_components=n_components,
|
||||||
lsi_model=lsi_model
|
lsi_model=lsi_model,
|
||||||
|
static_tfidf_path=static_tfidf_path
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None):
|
def term_cosine_similarities_weekly_lsi(outfile, infile = '/gscratch/comdata/output/reddit_similarity/tfidf_weekly/comment_terms.parquet', included_subreddits=None, n_components=100,lsi_model=None,static_tfidf_path=None):
|
||||||
return cosine_similarities_weekly_lsi(infile,
|
return cosine_similarities_weekly_lsi(infile,
|
||||||
outfile,
|
outfile,
|
||||||
'term',
|
'term',
|
||||||
included_subreddits=included_subreddits,
|
included_subreddits=included_subreddits,
|
||||||
n_components=n_components,
|
n_components=n_components,
|
||||||
lsi_model=lsi_model,
|
lsi_model=lsi_model,
|
||||||
|
static_tfidf_path=static_tfidf_path
|
||||||
)
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
Reference in New Issue
Block a user