use static tfidf (not weekly) to create tfidf matrix
parent 7b5ac73b2c · commit f11d4cfc72
@@ -75,17 +75,19 @@ def cosine_similarities_weekly_lsi(*args, n_components=100, lsi_model=None, **kw
     return cosine_similarities_weekly(*args, simfunc=simfunc, **kwargs)
 
 #tfidf = spark.read.parquet('/gscratch/comdata/users/nathante/competitive_exclusion_reddit/data/tfidf_weekly/comment_submission_terms_tfidf.parquet')
-def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=None,max_df=None):
+def cosine_similarities_weekly(tfidf_path, outfile, term_colname, included_subreddits = None, topN = None, simfunc=column_similarities, min_df=None, max_df=None, static_tfidf_path=None):
     print(outfile)
     # do this step in parallel if we have the memory for it.
     # should be doable with pool.map
     conn = duckdb.connect()
     subreddit_names = conn.execute(f"SELECT DISTINCT subreddit, subreddit_id from read_parquet('{tfidf_path}/*/*.parquet') ORDER BY subreddit_id;").df()
 
-    nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
+    if static_tfidf_path is not None:
+        nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{static_tfidf_path}/*/*.parquet')").df()
+    else:
+        nterms = conn.execute(f"SELECT MAX({term_colname + '_id'}) as nterms FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
     nterms = nterms.nterms.values
-    print(nterms)
-    print(int(nterms[0]))
+    nterms = int(nterms[0])
     weeks = conn.execute(f"SELECT DISTINCT week FROM read_parquet('{tfidf_path}/*/*.parquet')").df()
     weeks = weeks.week.values
     conn.close()
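
Note on usage: with static_tfidf_path left at its default of None, the else branch matches the old behavior, so existing callers are unaffected. A minimal sketch of how the new parameter might be invoked; the parquet paths and the "term" column name below are illustrative assumptions, not values taken from this commit:

    # Hypothetical call; paths are placeholders, not paths from this repo.
    cosine_similarities_weekly(
        tfidf_path="data/tfidf_weekly/comment_terms_tfidf.parquet",   # weekly tf-idf entries
        outfile="data/similarities/comment_terms_weekly.parquet",     # per-week similarity output
        term_colname="term",                                          # queried as term_id in the SQL above
        static_tfidf_path="data/tfidf/comment_terms_tfidf.parquet",   # all-time tf-idf; nterms now comes from here
    )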
|
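Why read nterms from a static tf-idf at all: the weekly tf-idf entries are presumably assembled downstream into one sparse term-by-subreddit matrix per week, and taking the term count from the static (all-time) tf-idf pins the row dimension so every week's matrix has the same shape even when some terms never appear that week. A sketch of that idea under those assumptions, with made-up helper names:

    from scipy.sparse import csr_matrix

    def week_term_matrix(term_ids, subreddit_ids, values, nterms, nsubreddits):
        # shape is pinned to the static vocabulary size, so a term missing
        # from this week's data still owns the same row index in every week
        return csr_matrix((values, (term_ids, subreddit_ids)),
                          shape=(nterms, nsubreddits))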